From 61f265a0821d5fb6e9ba9bbe5e3c24b1823d439a Mon Sep 17 00:00:00 2001 From: st2rb8g <115790381+st2rb8g@users.noreply.github.com> Date: Fri, 11 Jul 2025 06:32:13 +0800 Subject: [PATCH] fix some multi_apps tasks (#245) * fix chrome * fix some multi_apps tasks. * fix some multiapps tasks * fix some multiapps tasks --------- Co-authored-by: yuanmengqi --- desktop_env/evaluators/metrics/chrome.py | 20 ++++++++++++++++++- desktop_env/evaluators/metrics/general.py | 1 - .../ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json | 17 ++++++++-------- .../da922383-bfa4-4cd3-bbad-6bebab3d7742.json | 2 +- .../e135df7c-7687-4ac0-a5f0-76b74438b53e.json | 7 +++++++ .../e8172110-ec08-421b-a6f5-842e6451911f.json | 4 ++-- .../f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json | 6 +++--- main.py | 2 +- 8 files changed, 42 insertions(+), 17 deletions(-) diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 1073b3d..632c53e 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -115,6 +115,11 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f if match_type == "url": expected_urls = rule['urls'] actual_urls = [tab['url'] for tab in open_tabs] + if not are_lists_equal(expected_urls, actual_urls, compare_urls): + logger.error("list not match") + logger.error(expected_urls) + logger.error(actual_urls) + return 0 return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0 else: logger.error(f"Unknown type: {match_type}") @@ -343,7 +348,7 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float: return score / len(pred_files) -def compare_htmls(html_path1: str, html_path2: str) -> float: +def compare_htmls(html_path1: str, html_path2: str, **options) -> float: """ Compare two HTML files. """ @@ -351,20 +356,33 @@ def compare_htmls(html_path1: str, html_path2: str) -> float: soup1 = BeautifulSoup(inf, 'lxml') with open(html_path2, 'r', encoding='utf-8') as inf: soup2 = BeautifulSoup(inf, 'lxml') + ignore_sdnum = options.get("ignore_sdnum", None) def compare_elements(elem1, elem2): if not (isinstance(elem1, Tag) and isinstance(elem2, Tag)): + if elem1 != elem2: + logger.info("not the same") return elem1 == elem2 if elem1.name != elem2.name: + logger.info("html name not match") return False if elem1.text.strip() != elem2.text.strip(): + logger.info("html text not match") return False if elem1.attrs != elem2.attrs: + if ignore_sdnum: + attrs1 = {k: v for k, v in elem1.attrs.items() if k != 'sdnum'} + attrs2 = {k: v for k, v in elem2.attrs.items() if k != 'sdnum'} + return attrs1 == attrs2 + logger.info("html attrs not match") + logger.info(f"{elem1.attrs}") + logger.info(f"{elem2.attrs}") return False return True for elem1, elem2 in zip(soup1.recursiveChildGenerator(), soup2.recursiveChildGenerator()): if not compare_elements(elem1, elem2): + logger.info("html not match") return .0 return 1. diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py index 2380b35..6c8c1f9 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -213,7 +213,6 @@ _accessibility_ns_map = { } - def check_accessibility_tree(result: str, rules: List[Dict[str, Any]], osname: str = "ubuntu") -> float: """ Args: diff --git a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json index a5ffa57..d6f5ac2 100644 --- a/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json +++ b/evaluation_examples/examples/multi_apps/ce2b64a2-ddc1-4f91-8c7d-a88be7121aac.json @@ -1,7 +1,7 @@ { "id": "ce2b64a2-ddc1-4f91-8c7d-a88be7121aac", "snapshot": "multiapps", - "instruction": "There are several pictures of mountains in my Pictures directory, but I don’t know the names of these mountains. Please help me identify these pictures and change the names of these pictures to the names of the mountains in the pictures.", + "instruction": "There are several pictures of mountains in my Pictures directory, but I don’t know the names of these mountains. Please help me identify these pictures and change the names of these pictures to the names of the mountains in the pictures (You don't need to add \"Mount\" before the name of the mountain).", "source": "authors", "config": [ { @@ -69,19 +69,20 @@ "rules": { "expected": { "ec076282f61ba74642e94b5a6a1250c6988204d59d9b02936606b6b8ef1e4433": [ - "Kilimanjaro" + "Kilimanjaro.jpg" ], "6ed4239ecc2be3ec15ad65a78c5c823b9004d640b8cc83a6a7af5930f354de91": [ - "Himalayas", - "Everest", - "Sagarmatha" + "Himalayas.jpg", + "Everest.jpg", + "Sagarmatha.jpg" ], "79f45d40d8413d4e81f1b9734ea39e58622cafd79e12bab32959643fc245147c": [ - "Hua", - "hua" + "Hua.jpg", + "hua.jpg" ] }, - "expect_in_result": true + "expect_in_result": true, + "result_not_list": true } } }, diff --git a/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json b/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json index 5f0e4cd..2cf05b5 100644 --- a/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json +++ b/evaluation_examples/examples/multi_apps/da922383-bfa4-4cd3-bbad-6bebab3d7742.json @@ -1,7 +1,7 @@ { "id": "da922383-bfa4-4cd3-bbad-6bebab3d7742", "snapshot": "multiapps", - "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their tile to /home/user/Documents/Blog.", + "instruction": "I browsed a lot of interesting blog articles today. I hope to store these articles in my local designated folder just like zotero stores papers. Please download the blogs opening now in pdf format and save them in their title to /home/user/Documents/Blog.", "source": "authors", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json b/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json index c90efa4..15d42f1 100644 --- a/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json +++ b/evaluation_examples/examples/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e.json @@ -89,7 +89,14 @@ "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e135df7c-7687-4ac0-a5f0-76b74438b53e/annual-enterprise-survey-2021-financial-year-provisional.html", "dest": "annual-enterprise-survey-2021-financial-year-provisional_gold.html" } + ], + "options": [ + {}, + { + "ignore_sdnum": true + } ] }, + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json index df6ead8..8506dda 100644 --- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json +++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json @@ -36,8 +36,8 @@ ], "evaluator": { "func": [ - "check_structure_sim", - "check_structure_sim" + "check_structure_sim_resized", + "check_structure_sim_resized" ], "result": [ { diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json index 3ad9724..7dd4f83 100644 --- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json +++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json @@ -10,7 +10,7 @@ "files": [ { "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2/file.xls", - "path": "/home/user/cell_search.xlsx" + "path": "/home/user/cell_search.xls" } ] } @@ -47,7 +47,7 @@ { "type": "open", "parameters": { - "path": "/home/user/cell_search.xlsx" + "path": "/home/user/cell_search.xls" } } ], @@ -65,7 +65,7 @@ "type": "rule", "rules": { "expect": { - "pattern": "www\\.google\\.com.*?/search\\?q=Nereida&" + "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)" } } } diff --git a/main.py b/main.py index 6f03227..47bed52 100644 --- a/main.py +++ b/main.py @@ -83,4 +83,4 @@ def human_agent(): if __name__ == "__main__": - human_agent() + human_agent() \ No newline at end of file