Refactor the compare_python_pure_text function for improved normalization and error handling. Update the JSON example to clarify the instruction for extracting Python code from Colab, and rename the output files for consistency.
This commit is contained in:
@@ -470,37 +470,66 @@ def compare_terminal_and_txt(txt_file_path, terminal_output):
|
||||
|
||||
def compare_python_pure_text(py_file_path, gold_file_path):
|
||||
if not py_file_path or not gold_file_path:
|
||||
return 0
|
||||
return 0.0
|
||||
|
||||
# first, change the suffix of gold_file from .txt to .py
|
||||
print("py_file_path: ")
|
||||
print(py_file_path)
|
||||
print("gold_file_path: ")
|
||||
print(gold_file_path)
|
||||
def _normalize(text):
|
||||
"""
|
||||
Minimal normalization - only handle basic formatting:
|
||||
- Skip obvious file metadata (encoding, shebang) at the beginning
|
||||
- Normalize whitespace and indentation
|
||||
- Remove empty lines
|
||||
|
||||
This preserves any content that shouldn't be there (like markdown)
|
||||
so it can be detected as an error.
|
||||
"""
|
||||
lines = text.splitlines()
|
||||
result_lines = []
|
||||
i = 0
|
||||
|
||||
# Only skip obvious metadata at the very beginning
|
||||
while i < len(lines) and i < 3: # Check only first 3 lines
|
||||
stripped = lines[i].strip()
|
||||
|
||||
if (stripped.startswith('#!') or
|
||||
stripped.startswith('# -*- coding:') or
|
||||
stripped.startswith('# coding:') or
|
||||
stripped.startswith('# coding=')):
|
||||
i += 1
|
||||
continue
|
||||
|
||||
break
|
||||
|
||||
# Process all remaining lines with minimal filtering
|
||||
while i < len(lines):
|
||||
line = lines[i]
|
||||
stripped = line.strip()
|
||||
|
||||
if stripped: # Keep all non-empty lines
|
||||
normalized = line.expandtabs(4).rstrip()
|
||||
result_lines.append(normalized)
|
||||
|
||||
i += 1
|
||||
|
||||
return '\n'.join(result_lines)
|
||||
|
||||
# gold_file_path = gold_file_path.replace('.txt', '.py')
|
||||
def remove_whitespace(text):
    """Return *text* with every whitespace character removed.

    ``str.split()`` with no arguments splits on runs of whitespace, so
    re-joining the pieces with the empty string drops spaces, tabs and
    newlines alike.

    Args:
        text: The string to compact.

    Returns:
        A copy of *text* containing only its non-whitespace characters.
    """
    return ''.join(text.split())
|
||||
|
||||
with open(py_file_path, 'r') as file1:
|
||||
content1 = file1.read()
|
||||
with open(gold_file_path, 'r') as file2:
|
||||
content2 = file2.read()
|
||||
content1_no_whitespace = remove_whitespace(content1)
|
||||
content2_no_whitespace = remove_whitespace(content2)
|
||||
if content1_no_whitespace == content2_no_whitespace:
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
if __name__ == '__main__':
    # Ad-hoc manual check: call check_direct_json_object with an empty list
    # as its first argument and a sample rule set, then print the result.
    print(check_direct_json_object([], rules={
        "relativeTime": {
            "from": "5th next month"
        },
        "expected": {
            "start": "SEA",
            "end": "NYC",
            "time": "{DoW}, {Month} {DayD}, {Year}",
            "category": "Miles"
        }}))
|
||||
try:
|
||||
with open(py_file_path, 'r', encoding='utf-8') as file1:
|
||||
user_content = file1.read()
|
||||
with open(gold_file_path, 'r', encoding='utf-8') as file2:
|
||||
gold_content = file2.read()
|
||||
|
||||
# Apply different normalization strategies
|
||||
user_normalized = _normalize(user_content)
|
||||
gold_normalized = _normalize(gold_content)
|
||||
|
||||
if user_normalized == gold_normalized:
|
||||
return 1.0
|
||||
else:
|
||||
return 0.0
|
||||
|
||||
except (FileNotFoundError, IOError, UnicodeDecodeError) as e:
|
||||
logger.debug(f"compare_python_pure_text: Error reading files - {e}")
|
||||
return 0.0
|
||||
except Exception as e:
|
||||
logger.debug(f"compare_python_pure_text: Unexpected error - {e}")
|
||||
return 0.0
|
||||
|
||||
Reference in New Issue
Block a user