From 572a94b6dff0169fb005f9741cf475e7e63addb9 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sun, 13 Jul 2025 10:16:08 +0000 Subject: [PATCH] Merge branch 'main' into fix_chrome --- desktop_env/desktop_env.py | 2 +- desktop_env/evaluators/getters/chrome.py | 90 +++++++++++++++++-- desktop_env/evaluators/metrics/chrome.py | 24 +++-- desktop_env/evaluators/metrics/gimp.py | 5 +- desktop_env/evaluators/metrics/table.py | 63 ++++++++++--- .../2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json | 2 +- .../3680a5ee-6870-426a-a997-eba929a0d25c.json | 15 +++- .../42d25c08-fb87-4927-8b65-93631280a26f.json | 51 ++++++++--- .../a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json | 14 ++- .../e8172110-ec08-421b-a6f5-842e6451911f.json | 16 ++-- .../f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json | 2 +- evaluation_examples/test_fix_chrome.json | 50 ----------- monitor/.env | 4 +- run_human_examine.sh | 9 -- run_operator.sh | 9 -- 15 files changed, 228 insertions(+), 128 deletions(-) delete mode 100644 evaluation_examples/test_fix_chrome.json delete mode 100644 run_human_examine.sh delete mode 100644 run_operator.sh diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index 2efb793..39803b7 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -32,7 +32,7 @@ class DesktopEnv(gym.Env): snapshot_name: str = "init_state", action_space: str = "computer_13", cache_dir: str = "cache", - screen_size: Tuple[int] = (1920, 1080), + screen_size: Tuple[int] = (int(os.environ.get("SCREEN_WIDTH", 1920)), int(os.environ.get("SCREEN_HEIGHT", 1080))), headless: bool = False, require_a11y_tree: bool = True, require_terminal: bool = False, diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index bff1b92..724e2de 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -52,6 +52,11 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: - attribute (str): optional for 'attribute' and 'click_and_attribute', the attribute to be extracted. - backups (Any): The backup information to be returned if the extraction fails. """ + # 添加函数开始日志 + logger.info(f"[INFO_FROM_WEBSITE] Starting to get information from website: {config.get('url', 'N/A')}") + logger.info(f"[INFO_FROM_WEBSITE] Total info operations to perform: {len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Full config: {config}") + try: host = env.vm_ip port = env.chromium_port # fixme: this port is hard-coded, need to be changed from config file @@ -59,11 +64,18 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: remote_debugging_url = f"http://{host}:{port}" backend_url = f"http://{host}:{server_port}" use_proxy = env.current_use_proxy + + logger.info(f"[INFO_FROM_WEBSITE] Connecting to Chrome at {remote_debugging_url}") + with sync_playwright() as p: # connect to remote Chrome instance try: browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to existing Chrome instance") except Exception as e: + logger.warning(f"[INFO_FROM_WEBSITE] Failed to connect to existing Chrome instance: {e}") + logger.info(f"[INFO_FROM_WEBSITE] Starting new Chrome instance...") + # If the connection fails (e.g., the agent close the browser instance), start a new browser instance app = 'chromium' if 'arm' in platform.machine() else 'google-chrome' command = [ @@ -72,52 +84,116 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any: ] if use_proxy: command.append(f"--proxy-server=127.0.0.1:18888") + logger.info(f"[INFO_FROM_WEBSITE] Using proxy server: 127.0.0.1:18888") + + logger.info(f"[INFO_FROM_WEBSITE] Starting browser with command: {' '.join(command)}") payload = json.dumps({"command": command, "shell": False}) headers = {"Content-Type": "application/json"} #requests.post("http://" + host + ":" + server_port + "/setup" + "/launch", headers=headers, data=payload) requests.post(backend_url + "/setup" + "/launch", headers=headers, data=payload) time.sleep(5) browser = p.chromium.connect_over_cdp(remote_debugging_url) + logger.info(f"[INFO_FROM_WEBSITE] Successfully connected to new Chrome instance") page = browser.contexts[0].new_page() + logger.info(f"[INFO_FROM_WEBSITE] Created new page, navigating to: {config['url']}") + page.goto(config["url"]) page.wait_for_load_state('load') + + # 记录页面加载完成后的信息 + logger.info(f"[INFO_FROM_WEBSITE] Page loaded successfully") + logger.info(f"[INFO_FROM_WEBSITE] Page title: '{page.title()}'") + logger.info(f"[INFO_FROM_WEBSITE] Current URL: '{page.url}'") + infos = [] - for info_dict in config.get('infos', []): + for idx, info_dict in enumerate(config.get('infos', [])): + logger.info(f"[INFO_FROM_WEBSITE] Processing info operation {idx + 1}/{len(config.get('infos', []))}") + logger.debug(f"[INFO_FROM_WEBSITE] Info config: {info_dict}") + if page.url != config["url"]: + logger.info(f"[INFO_FROM_WEBSITE] Page URL changed, navigating back to: {config['url']}") page.goto(config["url"]) page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Back to original page") + action = info_dict.get('action', 'inner_text') + selector = info_dict.get('selector') + logger.info(f"[INFO_FROM_WEBSITE] Action: {action}, Selector: {selector}") + if action == "inner_text": + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text: '{extracted_text}'") + infos.append(extracted_text) + elif action == "attribute": + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Waiting for element with selector: {selector}") + logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute: {attribute}") ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000) - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}': '{extracted_attr}'") + infos.append(extracted_attr) + elif action == 'click_and_inner_text': + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_inner_text with {len(info_dict['selector'])} selectors") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + logger.debug(f"[INFO_FROM_WEBSITE] Extracting inner_text from final element: {sel}") ele = page.wait_for_selector(sel, state='attached', timeout=10000) - infos.append(ele.inner_text()) + extracted_text = ele.inner_text() + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted inner_text after clicks: '{extracted_text}'") + infos.append(extracted_text) + elif action == 'click_and_attribute': + attribute = info_dict.get('attribute') + logger.debug(f"[INFO_FROM_WEBSITE] Performing click_and_attribute with {len(info_dict['selector'])} selectors") + logger.debug(f"[INFO_FROM_WEBSITE] Target attribute: {attribute}") for idx, sel in enumerate(info_dict['selector']): + logger.debug(f"[INFO_FROM_WEBSITE] Processing selector {idx + 1}/{len(info_dict['selector'])}: {sel}") if idx != len(info_dict['selector']) - 1: + logger.debug(f"[INFO_FROM_WEBSITE] Clicking element with selector: {sel}") link = page.wait_for_selector(sel, state='attached', timeout=10000) link.click() page.wait_for_load_state('load') + logger.info(f"[INFO_FROM_WEBSITE] Successfully clicked element, page loaded") + logger.debug(f"[INFO_FROM_WEBSITE] New page URL: {page.url}") else: + logger.debug(f"[INFO_FROM_WEBSITE] Extracting attribute from final element: {sel}") ele = page.wait_for_selector(sel, state='attached') - infos.append(ele.get_attribute(info_dict['attribute'])) + extracted_attr = ele.get_attribute(info_dict['attribute']) + logger.info(f"[INFO_FROM_WEBSITE] Successfully extracted attribute '{attribute}' after clicks: '{extracted_attr}'") + infos.append(extracted_attr) else: + logger.error(f"[INFO_FROM_WEBSITE] Unsupported action: {action}") raise NotImplementedError(f'The action {action} is not supported yet.') + + logger.info(f"[INFO_FROM_WEBSITE] Completed info operation {idx + 1}") + + # 记录最终提取的所有信息 + logger.info(f"[INFO_FROM_WEBSITE] All operations completed successfully") + logger.info(f"[INFO_FROM_WEBSITE] Total extracted information count: {len(infos)}") + logger.info(f"[INFO_FROM_WEBSITE] Final extracted information: {infos}") + return infos except Exception as e: - logger.error(f'[ERROR]: failed to obtain information from the website: {config["url"]}. Use backup results instead.') - return config.get('backups', None) + logger.error(f'[INFO_FROM_WEBSITE] ERROR: Failed to obtain information from website: {config.get("url", "N/A")}') + logger.error(f'[INFO_FROM_WEBSITE] Exception details: {str(e)}') + logger.error(f'[INFO_FROM_WEBSITE] Exception type: {type(e).__name__}') + logger.info(f'[INFO_FROM_WEBSITE] Using backup results instead') + backup_data = config.get('backups', None) + logger.info(f'[INFO_FROM_WEBSITE] Backup data: {backup_data}') + return backup_data # The following ones just need to load info from the files of software, no need to connect to the software diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index 632c53e..6c3811f 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -29,8 +29,8 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any] actual_url = active_tab_info.get('url', None) else: actual_url = active_tab_info - print("expected_url: {}".format(expected_url)) - print("actual_url: {}".format(actual_url)) + logger.info("expected_url: {}".format(expected_url)) + logger.info("actual_url: {}".format(actual_url)) return 1 if compare_urls(expected_url, actual_url) else 0 else: logger.error(f"Unknown type: {match_type}") @@ -76,23 +76,26 @@ def is_expected_url_pattern_match(result, rules) -> float: if type(result) == dict: result_url = result["url"] - print("result url: {}".format(result_url)) + logger.info("result url: {}".format(result_url)) else: result_url = result # expect_regex = re.compile(rules["expected"]) patterns = rules["expected"] - print("expected_regex: {}".format(patterns)) + logger.info("expected_regex: {}".format(patterns)) for pattern in patterns: match = re.search(pattern, result_url) - print(match) + logger.info("match: {}".format(match)) if not match: return 0. return 1. def is_expected_installed_extensions(installed_extensions, expected) -> float: - print("installed_extensions: ") - print(installed_extensions) + if not installed_extensions: + return 0. + + logger.info("installed_extensions: ") + logger.info(installed_extensions) expected_extensions = expected["expected"] # whether the expected extensions are installed @@ -109,6 +112,8 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f """ Checks if the expected tabs are open in Chrome. """ + if not open_tabs: + return 0. match_type = rule['type'] @@ -146,8 +151,10 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None) if liked_authors_folder: # Check if it contains the specified URLs + logger.info("'Liked Authors' folder exists") liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if bookmark['type'] == 'url'] + logger.info("Here is the 'Liked Authors' folder's urls: {}".format(liked_authors_urls)) urls = rule['urls'] @@ -168,6 +175,9 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: def is_expected_search_query(active_tab_info: Dict[str, str], rules: Dict[str, Any]) -> float: + if not active_tab_info: + return 0. + expected = rules['expect'] pattern = expected['pattern'] matched = re.search(pattern, active_tab_info['url']) diff --git a/desktop_env/evaluators/metrics/gimp.py b/desktop_env/evaluators/metrics/gimp.py index 5dddd78..a6dcc29 100644 --- a/desktop_env/evaluators/metrics/gimp.py +++ b/desktop_env/evaluators/metrics/gimp.py @@ -396,7 +396,10 @@ def check_structure_sim_resized(src_path, tgt_path): # Check if the structure is similar structure_same = structure_check_by_ssim(img_src_resized, img_tgt) - return structure_same + if structure_same: + return 1. + else: + return 0. def check_contrast_increase_and_structure_sim(src_path, tgt_path): diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index 9e888c7..db51850 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -463,23 +463,60 @@ def compare_table(result: str, expected: str = None, **options) -> float: # }}} function compare_table # -def compare_csv(result: str, expected: str, **options) -> float: +def compare_csv(result: str, expected: Union[str, List[str]], **options) -> float: + """ + Compare CSV files. If expected is a list, returns 1.0 if result matches any of the expected files. + + Args: + result: Path to result CSV file + expected: Path to expected CSV file or list of paths to expected CSV files + options: Additional options (strict, ignore_case) + + Returns: + 1.0 if result matches expected (or any file in expected list), 0.0 otherwise + """ if result is None: return 0. - with open(result) as f: - result_lines: Iterable[str] = f.read().splitlines() - with open(expected) as f: - expected_lines: Iterable[str] = f.read().splitlines() - if not options.get("strict", True): - result_lines = map(str.strip, result_lines) - expected_lines = map(str.strip, expected_lines) - if options.get("ignore_case", False): - result_lines = map(str.lower, result_lines) - expected_lines = map(str.lower, expected_lines) + try: + with open(result) as f: + result_lines: Iterable[str] = f.read().splitlines() + except (FileNotFoundError, IOError): + return 0. - metric: bool = list(result_lines) == list(expected_lines) - return float(metric) + # Convert expected to list if it's a single string (for backward compatibility) + if isinstance(expected, str): + expected_files = [expected] + else: + expected_files = expected + + # Try to match against each expected file + for expected_file in expected_files: + try: + with open(expected_file) as f: + expected_lines: Iterable[str] = f.read().splitlines() + + # Process lines based on options + current_result_lines = result_lines + current_expected_lines = expected_lines + + if not options.get("strict", True): + current_result_lines = map(str.strip, current_result_lines) + current_expected_lines = map(str.strip, current_expected_lines) + if options.get("ignore_case", False): + current_result_lines = map(str.lower, current_result_lines) + current_expected_lines = map(str.lower, current_expected_lines) + + # Check if this expected file matches + if list(current_result_lines) == list(current_expected_lines): + return 1.0 + + except (FileNotFoundError, IOError): + # If this expected file doesn't exist, continue to next one + continue + + # No match found + return 0.0 def compare_conference_city_in_order(actual_city_list_path, expected_city): diff --git a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json index 547bb28..574d506 100644 --- a/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json +++ b/evaluation_examples/examples/multi_apps/2373b66a-092d-44cb-bfd7-82e86e7a3b4d.json @@ -1,7 +1,7 @@ { "id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", "snapshot": "multiapps", - "instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.", + "instruction": "Monitor Ubuntu system resource usage using the sar command from sysstat toolkit. Collect CPU statistics every second for 30 seconds and save the output to 'System_Resources_Report.txt' on Desktop.", "source": "author", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json index 3ad3704..2cb77d1 100644 --- a/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json +++ b/evaluation_examples/examples/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c.json @@ -37,6 +37,7 @@ "check_include_exclude", "compare_csv" ], + "conj": "and", "result": [ { "type": "vm_command_line", @@ -63,8 +64,18 @@ }, { "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", - "dest": "output_gold.csv" + "multi": true, + "path": [ + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output.csv", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold2.csv", + "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3680a5ee-6870-426a-a997-eba929a0d25c/output_gold3.csv" + ], + "dest": [ + "output_gold.csv", + "output_gold2.csv", + "output_gold3.csv" + ], + "gives": [0, 1, 2] } ] }, diff --git a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json index 710ac31..59091fd 100644 --- a/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json +++ b/evaluation_examples/examples/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f.json @@ -144,17 +144,46 @@ "os" ], "evaluator": { - "func": "compare_epub", - "result": { - "type": "vm_file", - "dest": "Pass Through.epub", - "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" - }, - "expected": { - "type": "cloud_file", - "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", - "dest": "Pass Through Gold.epub" - } + "func": [ + "compare_epub", + "compare_epub", + "compare_epub" + ], + "conj": "or", + "result": [ + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/Pass_Through.epub" + }, + { + "type": "vm_file", + "dest": "Pass Through.epub", + "path": "/home/user/Documents/Novels/Pass Through/pass_through.epub" + } + ], + "expected": [ + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + }, + { + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/42d25c08-fb87-4927-8b65-93631280a26f/Pass%20Through.epub", + "dest": "Pass Through Gold.epub" + } + ] }, "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json index 37af93b..a612d38 100644 --- a/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json +++ b/evaluation_examples/examples/multi_apps/a82b78bb-7fde-4cb3-94a4-035baf10bcf0.json @@ -83,21 +83,27 @@ "urls": [ [ "https://jimfan.me/", - "https://research.nvidia.com/person/linxi-jim-fan" + "https://research.nvidia.com/person/linxi-jim-fan", + "https://www.linkedin.com/in/drjimfan/" ], [ "https://research.nvidia.com/person/de-an-huang", - "https://ai.stanford.edu/~dahuang/" + "https://ai.stanford.edu/~dahuang/", + "https://www.linkedin.com/in/de-an-huang-38242a69" ], [ "https://yukezhu.me/", "https://www.cs.utexas.edu/people/faculty-researchers/yuke-zhu", "https://experts.utexas.edu/yuke_zhu", - "https://research.nvidia.com/person/yuke-zhu" + "https://research.nvidia.com/person/yuke-zhu", + "https://www.linkedin.com/in/yukez/" ], [ + "https://tensorlab.cms.caltech.edu/users/anima/", "http://tensorlab.cms.caltech.edu/users/anima/", - "https://www.eas.caltech.edu/people/anima" + "https://www.eas.caltech.edu/people/anima", + "https://en.wikipedia.org/wiki/Anima_Anandkumar", + "https://www.linkedin.com/in/anima-anandkumar/" ] ] } diff --git a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json index 8506dda..9f5f924 100644 --- a/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json +++ b/evaluation_examples/examples/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f.json @@ -11,10 +11,6 @@ { "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character.png", "path": "/home/user/Desktop/character.png" - }, - { - "url": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", - "path": "/home/user/Desktop/character_no_background_gold.png" } ] } @@ -36,8 +32,8 @@ ], "evaluator": { "func": [ - "check_structure_sim_resized", - "check_structure_sim_resized" + "check_structure_sim", + "check_structure_sim" ], "result": [ { @@ -53,13 +49,13 @@ ], "expected": [ { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" }, { - "type": "vm_file", - "path": "/home/user/Desktop/character_no_background_gold.png", + "type": "cloud_file", + "path": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/e8172110-ec08-421b-a6f5-842e6451911f/character_no_background_gold.png", "dest": "character_no_background_gold.png" } ] diff --git a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json index 7dd4f83..2441c3e 100644 --- a/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json +++ b/evaluation_examples/examples/multi_apps/f8cfa149-d1c1-4215-8dac-4a0932bad3c2.json @@ -65,7 +65,7 @@ "type": "rule", "rules": { "expect": { - "pattern": "https?://(www\\.?)?google\\.com/search\\?q=nereida(&|$)" + "pattern": "(?i)https?://(?:www\\.)?google\\.com/search\\?q=nereida(?:&|$|#).*" } } } diff --git a/evaluation_examples/test_fix_chrome.json b/evaluation_examples/test_fix_chrome.json deleted file mode 100644 index 7f9ed93..0000000 --- a/evaluation_examples/test_fix_chrome.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "chrome": [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "06fe7178-4491-4589-810f-2e2bc9502122", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91", - "3720f614-37fd-4d04-8a6b-76f54f8c222d", - "99146c54-4f37-4ab8-9327-5f3291665e1e", - "12086550-11c0-466b-b367-1d9e75b3910e", - "6766f2b8-8a72-417f-a9e5-56fcaa735837", - "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", - "ae78f875-5b98-4907-bbb5-9c737fc68c03", - "3299584d-8f11-4457-bf4c-ce98f7600250", - "030eeff7-b492-4218-b312-701ec99ee0cc", - "9656a811-9b5b-4ddf-99c7-5117bcef0626", - "fc6d8143-9452-4171-9459-7f515143419a", - "a96b564e-dbe9-42c3-9ccf-b4498073438a", - "1704f00f-79e6-43a7-961b-cedd3724d5fd", - "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", - "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", - "47543840-672a-467d-80df-8f7c3b9788c9", - "c1fa57f3-c3db-4596-8f09-020701085416", - "da46d875-6b82-4681-9284-653b0c7ae241", - "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc", - "f79439ad-3ee8-4f99-a518-0eb60e5652b0", - "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8", - "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805", - "7f52cab9-535c-4835-ac8c-391ee64dc930", - "82279c77-8fc6-46f6-9622-3ba96f61b477", - "2888b4e6-5b47-4b57-8bf5-c73827890774", - "b4f95342-463e-4179-8c3f-193cd7241fb2", - "f5d96daf-83a8-4c86-9686-bada31fc66ab", - "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba", - "368d9ba4-203c-40c1-9fa3-da2f1430ce63", - "59155008-fe71-45ec-8a8f-dc35497b6aa8", - "a728a36e-8bf1-4bb6-9a03-ef039a5233f0", - "b070486d-e161-459b-aa2b-ef442d973b92", - "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", - "9f935cce-0a9f-435f-8007-817732bfc0a5", - "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", - "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" - ] -} \ No newline at end of file diff --git a/monitor/.env b/monitor/.env index 62ba076..05618af 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,9 +2,9 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_fix_chrome.json +TASK_CONFIG_PATH=../evaluation_examples/test_all.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_chrome_operator +RESULTS_BASE_PATH=../results_all ACTION_SPACE=pyautogui OBSERVATION_TYPE=screenshot MODEL_NAME=computer-use-preview diff --git a/run_human_examine.sh b/run_human_examine.sh deleted file mode 100644 index c8e8447..0000000 --- a/run_human_examine.sh +++ /dev/null @@ -1,9 +0,0 @@ -python manual_examine.py \ - --headless \ - --observation_type screenshot \ - --result_dir ./results_human_examine_chrome_fix_1 \ - --test_all_meta_path evaluation_examples/test_fix_chrome.json \ - --region us-east-1 \ - --domain chrome \ - --example_id 030eeff7-b492-4218-b312-701ec99ee0cc \ - --max_steps 3 \ No newline at end of file diff --git a/run_operator.sh b/run_operator.sh deleted file mode 100644 index 9cb6ccc..0000000 --- a/run_operator.sh +++ /dev/null @@ -1,9 +0,0 @@ -python run_multienv_openaicua.py \ ---headless \ ---observation_type screenshot \ ---model computer-use-preview \ ---result_dir ./results_multiapps_operator \ ---test_all_meta_path evaluation_examples/test_multiapps.json \ ---region us-east-1 \ ---max_steps 150 \ ---num_envs 5 \ No newline at end of file