def compare_python_pure_text(py_file_path, gold_file_path):
    """Score whether a Python file matches a gold file up to trivial formatting.

    Both files are read as UTF-8 (a leading byte-order mark, if present, is
    dropped) and passed through the same minimal normalization before an
    exact string comparison:

    - a shebang or source-encoding comment is skipped, but only while it
      appears within the first three lines and before any other content;
    - blank / whitespace-only lines are removed;
    - tabs are expanded to 4 columns and trailing whitespace is stripped.

    Leading indentation and every other character are kept, so stray content
    such as markdown fences still makes the comparison fail.

    Args:
        py_file_path: Path of the file to evaluate. A falsy value scores 0.0.
        gold_file_path: Path of the reference file. A falsy value scores 0.0.

    Returns:
        float: 1.0 when the normalized contents are identical, otherwise 0.0.
        Unreadable or undecodable files are logged at debug level and score
        0.0; this function does not raise for them.
    """
    if not py_file_path or not gold_file_path:
        return 0.0

    # Prefixes that mark file metadata rather than program content.
    header_prefixes = ('#!', '# -*- coding:', '# coding:', '# coding=')
    max_header_lines = 3

    def _normalize(text):
        """Return *text* with header metadata, blank lines and trailing
        whitespace removed, and tabs expanded to 4 columns."""
        lines = text.splitlines()

        # Skip metadata only at the very top; stop at the first line that is
        # not a header so later comments are compared like any other content.
        start = 0
        header_limit = min(len(lines), max_header_lines)
        while (start < header_limit
               and lines[start].strip().startswith(header_prefixes)):
            start += 1

        return '\n'.join(
            line.expandtabs(4).rstrip()
            for line in lines[start:]
            if line.strip()
        )

    try:
        # 'utf-8-sig' decodes plain UTF-8 and additionally drops a leading
        # BOM, which would otherwise stick to the first line and cause a
        # spurious mismatch (str.strip() does not remove U+FEFF).
        with open(py_file_path, 'r', encoding='utf-8-sig') as user_file:
            user_content = user_file.read()
        with open(gold_file_path, 'r', encoding='utf-8-sig') as gold_file:
            gold_content = gold_file.read()
    except (OSError, UnicodeDecodeError) as e:
        logger.debug("compare_python_pure_text: Error reading files - %s", e)
        return 0.0
    except Exception as e:
        # Evaluator boundary: any other failure (e.g. a path of an
        # unsupported type) is reported as a failed comparison, not a crash.
        logger.debug("compare_python_pure_text: Unexpected error - %s", e)
        return 0.0

    # The same normalization is applied to both sides.
    if _normalize(user_content) == _normalize(gold_content):
        return 1.0
    return 0.0
Include all Python code and # comments from code cells, but exclude markdown docstrings and file headers.", "source": "authors", "config": [ { @@ -27,7 +27,7 @@ "type": "chrome_open_tabs", "parameters": { "urls_to_open": [ - "https://colab.research.google.com/github/stanfordnlp/dspy/blob/main/intro.ipynb#scrollTo=L1BHAoL_LRd7" + "https://colab.research.google.com/drive/1JMLa53HDuA-i7ZBmqV7ZnA3c_fvtXnx-?usp=sharing#scrollTo=h5hjCcLDr2WC" ] } } @@ -42,14 +42,15 @@ "func": "compare_python_pure_text", "result": { "type": "vm_file", - "path": "/home/user/colab.py", - "dest": "colab.py" + "path": "/home/user/gpt_dev_pure_code.py", + "dest": "gpt_dev_pure_code.py" }, "expected": { "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/dd60633f-2c72-42ba-8547-6f2c8cb0fdb0/Colab%20Gold.py", - "dest": "colab_Gold.py" + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/dd60633f-2c72-42ba-8547-6f2c8cb0fdb0/gpt_dev_pure_code_gold.py", + "dest": "gpt_dev_pure_code_gold.py" } }, - "proxy": true + "proxy": true, + "env_may_change": true } \ No newline at end of file