diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 1073b3d..632c53e 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -115,6 +115,11 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f if match_type == "url": expected_urls = rule['urls'] actual_urls = [tab['url'] for tab in open_tabs] + if not are_lists_equal(expected_urls, actual_urls, compare_urls): + logger.error("list not match") + logger.error(expected_urls) + logger.error(actual_urls) + return 0 return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0 else: logger.error(f"Unknown type: {match_type}") @@ -343,7 +348,7 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float: return score / len(pred_files) -def compare_htmls(html_path1: str, html_path2: str) -> float: +def compare_htmls(html_path1: str, html_path2: str, **options) -> float: """ Compare two HTML files. """ @@ -351,20 +356,33 @@ def compare_htmls(html_path1: str, html_path2: str) -> float: soup1 = BeautifulSoup(inf, 'lxml') with open(html_path2, 'r', encoding='utf-8') as inf: soup2 = BeautifulSoup(inf, 'lxml') + ignore_sdnum = options.get("ignore_sdnum", None) def compare_elements(elem1, elem2): if not (isinstance(elem1, Tag) and isinstance(elem2, Tag)): + if elem1 != elem2: + logger.info("not the same") return elem1 == elem2 if elem1.name != elem2.name: + logger.info("html name not match") return False if elem1.text.strip() != elem2.text.strip(): + logger.info("html text not match") return False if elem1.attrs != elem2.attrs: + if ignore_sdnum: + attrs1 = {k: v for k, v in elem1.attrs.items() if k != 'sdnum'} + attrs2 = {k: v for k, v in elem2.attrs.items() if k != 'sdnum'} + return attrs1 == attrs2 + logger.info("html attrs not match") + logger.info(f"{elem1.attrs}") + logger.info(f"{elem2.attrs}") return False return True for elem1, elem2 in zip(soup1.recursiveChildGenerator(), soup2.recursiveChildGenerator()): if not compare_elements(elem1, elem2): + logger.info("html not match") return .0 return 1. diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py index 03e66a4..d0f6195 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -213,7 +213,6 @@ _accessibility_ns_map = { } - def check_accessibility_tree(result: str, rules: List[Dict[str, Any]], osname: str = "ubuntu") -> float: """ Args: diff --git a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json index 21e6887..9e60e48 100644 --- a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json +++ b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json @@ -163,7 +163,8 @@ "hua shan mountain.jpg" ] }, - "expect_in_result": true + "expect_in_result": true, + "result_not_list": true } } }, diff --git a/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json b/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json index 5f0e4cd..2cf05b5 100644 --- a/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json +++ b/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json @@ -1,7 +1,7 @@ { "id": "da922383-bfa4-4cd3-bbad-6bebab3d7742", "snapshot": "multiapps", - "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their tile to /home/user/Documents/Blog.", + "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their title to /home/user/Documents/Blog.", "source": "authors", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json b/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json index c90efa4..15d42f1 100644 --- a/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json +++ b/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json @@ -89,7 +89,14 @@ "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e/annual-enterprise-survey-2021-financial-year-provisional.html", "dest": "annual-enterprise-survey-2021-financial-year-provisional_gold.html" } + ], + "options": [ + {}, + { + "ignore_sdnum": true + } ] }, + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json index df6ead8..8506dda 100644 --- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json +++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json @@ -36,8 +36,8 @@ ], "evaluator": { "func": [ - "check_structure_sim", - "check_structure_sim" + "check_structure_sim_resized", + "check_structure_sim_resized" ], "result": [ { diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json index 3ad9724..7dd4f83 100644 --- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json +++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json @@ -10,7 +10,7 @@ "files": [ { "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2/file.xls", - "path": "/home/user/cell_search.xlsx" + "path": "/home/user/cell_search.xls" } ] } @@ -47,7 +47,7 @@ { "type": "open", "parameters": { - "path": "/home/user/cell_search.xlsx" + "path": "/home/user/cell_search.xls" } } ], @@ -65,7 +65,7 @@ "type": "rule", "rules": { "expect": { - "pattern": "www\\.google\\.com.*?/search\\?q=Nereida&" + "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)" } } } diff --git a/main.py b/main.py index 6f03227..47bed52 100644 --- a/main.py +++ b/main.py @@ -83,4 +83,4 @@ def human_agent(): if __name__ == "__main__": - human_agent() + human_agent() \ No newline at end of file