From ea51f5264a1bb4f616a7dbca63ce4c5412b303f3 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Mon, 30 Jun 2025 08:07:24 +0000 Subject: [PATCH] fix chrome --- desktop_env/evaluators/getters/__init__.py | 3 +- desktop_env/evaluators/getters/chrome.py | 169 +++++++++- desktop_env/evaluators/getters/misc.py | 40 ++- desktop_env/evaluators/metrics/__init__.py | 3 +- desktop_env/evaluators/metrics/chrome.py | 35 +- desktop_env/providers/aws/manager.py | 2 +- .../12086550-11c0-466b-b367-1d9e75b3910e.json | 2 +- .../1704f00f-79e6-43a7-961b-cedd3724d5fd.json | 28 +- .../2888b4e6-5b47-4b57-8bf5-c73827890774.json | 22 +- .../35253b65-1c19-4304-8aa4-6884b8218fc0.json | 4 +- .../6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json | 10 +- .../7f52cab9-535c-4835-ac8c-391ee64dc930.json | 60 +++- .../9f3f70fc-5afc-4958-a7b7-3bb4fcb01805.json | 60 ++-- .../b4f95342-463e-4179-8c3f-193cd7241fb2.json | 2 +- .../c1fa57f3-c3db-4596-8f09-020701085416.json | 4 +- .../cabb3bae-cccb-41bd-9f5d-0f3a9fecd825.json | 38 ++- .../da46d875-6b82-4681-9284-653b0c7ae241.json | 21 +- .../e1e75309-3ddb-4d09-92ec-de869c928143.json | 2 +- .../f0b971a1-6831-4b9b-a50e-22a6e47f45ba.json | 4 +- .../f5d96daf-83a8-4c86-9686-bada31fc66ab.json | 2 +- .../5990457f-2adb-467b-a4af-5c857c92d762.json | 2 +- .../settings/proxy/dataimpulse.json | 4 +- evaluation_examples/test_fix_chrome.json | 50 +++ manual_examine.py | 309 ++++++++++++++++++ monitor/.env | 4 +- run_human_examine.sh | 9 + run_operator.sh | 9 + 27 files changed, 781 insertions(+), 117 deletions(-) create mode 100644 evaluation_examples/test_fix_chrome.json create mode 100644 manual_examine.py create mode 100644 run_human_examine.sh create mode 100644 run_operator.sh diff --git a/desktop_env/evaluators/getters/__init__.py b/desktop_env/evaluators/getters/__init__.py index a035e27..26bf0d9 100644 --- a/desktop_env/evaluators/getters/__init__.py +++ b/desktop_env/evaluators/getters/__init__.py @@ -25,7 +25,8 @@ from .chrome import ( get_url_dashPart, 
get_active_url_from_accessTree, get_find_installed_extension_name, - get_info_from_website + get_info_from_website, + get_url_path_parse ) from .file import get_cloud_file, get_vm_file, get_cache_file, get_content_from_vm_file from .general import get_vm_command_line, get_vm_terminal_output, get_vm_command_error diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index 602aec5..02a9d99 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -1200,6 +1200,32 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]): elements = target_page.query_selector_all(selector) return [element.text_content().strip() for element in elements if element] + def safely_get_direct_text_nodes_playwright(selector): + """ + Extract all direct text node contents under the specified selector element (excluding text inside child div, span, etc.). + Returns a list of lists, each sublist contains the direct text nodes of one element. + Suitable for structures like:
SEA
NYC
+ """ + elements = target_page.query_selector_all(selector) + results = [] + for element in elements: + texts = element.evaluate(''' + (node) => Array.from(node.childNodes) + .filter(n => n.nodeType === Node.TEXT_NODE) + .map(n => n.textContent.trim()) + .filter(Boolean) + ''') + results.append(texts) + return results[0] + + def safely_get_direct_li_playwright(selector): + elements = target_page.query_selector_all(selector + " li.catAllProducts") + return [element.query_selector('span').inner_text().strip() for element in elements if element.query_selector('span')] + + def safely_get_only_child_text_content(selector): + elements = target_page.query_selector_all(selector) + return [element.query_selector('h3').text_content().strip() for element in elements if element.query_selector('h3')] + if config["category"] == "class": class_multiObject = config.get("class_multiObject", {}) for class_name, object_dict in class_multiObject.items(): @@ -1208,6 +1234,41 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]): index = int(order_key) if len(elements_texts) > index: return_json[key] = elements_texts[index] + + class_multiObject_child = config.get("class_multiObject_child", {}) + for class_name, object_dict in class_multiObject_child.items(): + elements_texts = safely_get_direct_text_nodes_playwright("." + class_name) + for order_key, key in object_dict.items(): + index = int(order_key) + if len(elements_texts) > index: + return_json[key] = elements_texts[index] + + class_multiObject_only_child = config.get("class_multiObject_only_child", {}) + for class_name, object_dict in class_multiObject_only_child.items(): + elements_texts = safely_get_only_child_text_content("." 
+ class_name) + for order_key, key in object_dict.items(): + index = int(order_key) + if len(elements_texts) > index: + return_json[key] = elements_texts[index] + + class_multiObject_search_exist = config.get("class_multiObject_search_exist", {}) + for class_name, object_list in class_multiObject_search_exist.items(): + elements_texts = safely_get_text_content("." + class_name) + for each_object in object_list: + if each_object == "is_other_exist": + continue + if each_object in elements_texts: + return_json[each_object] = True + else: + return_json[each_object] = False + if "is_other_exist" in object_list: + for each_element in elements_texts: + if each_element not in object_list: + return_json["is_other_exist"] = True + break + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + class_singleObject = config.get("class_singleObject", {}) for class_name, key in class_singleObject.items(): @@ -1236,6 +1297,55 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]): inputs = target_page.locator(f"xpath={xpath}") if inputs.count() > 0: return_json[key] = inputs.first.input_value().strip() + + elif config["category"] == "class&url": + class_multiObject = config.get("class_multiObject", {}) + for class_name, object_list in class_multiObject.items(): + elements_texts = safely_get_text_content("." + class_name) + for each_key in object_list: + if any(each_key.lower() == text.lower() for text in elements_texts): + return_json[each_key.lower()] = True + + for each_key in elements_texts: + # each_key.lower() not in object_list.lower(): + if all(each_key.lower() not in item.lower() for item in object_list): + return_json["is_other_exist"] = True + break + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + + class_multiObject_li = config.get("class_multiObject_li", {}) + for class_name, object_list in class_multiObject_li.items(): + elements_texts = safely_get_direct_li_playwright("." 
+ class_name) + for each_key in object_list: + if any(each_key.lower() == text.lower() for text in elements_texts): + return_json[each_key.lower()] = True + + for each_key in elements_texts: + # each_key.lower() not in object_list.lower(): + if all(each_key.lower() not in item.lower() for item in object_list): + return_json["is_other_exist"] = True + break + if "is_other_exist" not in return_json.keys(): + return_json["is_other_exist"] = False + + url_include_expected = config.get("url_include_expected", []) + for key in url_include_expected: + if key.lower() in target_page.url.lower(): + if key.lower() not in return_json.keys(): + return_json[key.lower()] = True + else: + if key.lower() not in return_json.keys(): + return_json[key.lower()] = False + + url_include_expected_multichoice = config.get("url_include_expected_multichoice", {}) + for key, value in url_include_expected_multichoice.items(): + if key.lower() in target_page.url.lower(): + if value.lower() not in return_json.keys(): + return_json[value.lower()] = True + else: + if value.lower() not in return_json.keys(): + return_json[value.lower()] = False browser.close() return return_json @@ -1274,13 +1384,14 @@ def get_gotoRecreationPage_and_get_html_content(env, config: Dict[str, Any]): browser = p.chromium.connect_over_cdp(remote_debugging_url) page = browser.new_page() page.goto("https://www.recreation.gov/") - page.fill("input#hero-search-input", "Albion Basin") + page.fill("input#hero-search-input", "Diamond") page.click("button.nav-search-button") print("after first click") - time.sleep(2) + time.sleep(10) # Assuming .search-result-highlight--success leads to a new page or requires page load with page.expect_popup() as popup_info: page.click(".search-result-highlight--success") + time.sleep(30) print("after second click") newpage = popup_info.value newpage.wait_for_load_state() @@ -1362,3 +1473,57 @@ def get_url_dashPart(env, config: Dict[str, str]): return dash_part elif config["returnType"] == 
"json": return {config["key"]: dash_part} + + +def get_url_path_parse(env, config: Dict[str, str]): + """ + Parse Macy's product url path, extract: + - mens_clothing: true if 'mens-clothing' in path, else None + - t_shirts: true if any key 'Top_style' or 'Product_department' value is 'T-shirts', else None + - Men_regular_size_t, Price_discount_range (as list), Sleeve_length: as before, None if not found + All fields are None if not found for robustness. + """ + from urllib.parse import urlparse, unquote + result = {} + # 1. Parse URL + active_tab_url = get_active_url_from_accessTree(env, config) + if active_tab_url is None: + return None + parsed = urlparse(active_tab_url) + path = unquote(parsed.path) + result = {} + # mens_clothing + result['mens_clothing'] = True if 'mens-clothing' in path else None + # key-value + path_parts = path.strip('/').split('/') + key_value_json = {} + tshirts_flag = False + if "mens-t-shirts" in path: + tshirts_flag = True + for i in range(len(path_parts)-1): + if ',' in path_parts[i] and ',' in path_parts[i+1]: + keys = [k.strip() for k in path_parts[i].split(',')] + values = [v.strip() for v in path_parts[i+1].split(',')] + for k, v in zip(keys, values): + if k == "Price_discount_range": + key_value_json[k] = [item.strip() for item in v.split('|')] if v else None + else: + key_value_json[k] = v if v else None + if (k == 'Top_style' or k == 'Product_department') and (v == 'T-shirts' or v == 'T-Shirts' or v == 'T-Shirt'): + tshirts_flag = True + break + for field in ['Men_regular_size_t', 'Price_discount_range', 'Sleeve_length']: + if field not in key_value_json: + key_value_json[field] = None + result['t_shirts'] = tshirts_flag if tshirts_flag else None + # parse_keys + for key in config["parse_keys"]: + if key in key_value_json: + if key == "Price_discount_range": + if '50_PERCENT_ off & more' in key_value_json[key] and not '30_PERCENT_ off & more' in key_value_json[key] and not '20_PERCENT_ off & more' in key_value_json[key]: + 
result[key] = '50_PERCENT_ off & more' + else: + result[key] = 'not_50_PERCENT_ off & more' + else: + result[key] = key_value_json[key] + return result diff --git a/desktop_env/evaluators/getters/misc.py b/desktop_env/evaluators/getters/misc.py index 8862438..8b7f81e 100644 --- a/desktop_env/evaluators/getters/misc.py +++ b/desktop_env/evaluators/getters/misc.py @@ -71,7 +71,10 @@ relativeTime_to_IntDay = { "this Sunday": "special", "next Monday": "special", "next Friday": "special", - "first monday four months later": "special" + "first monday four months later": "special", + "first monday eight months later": "special", + "next Monday split": "special", + "next Friday split": "special" } def get_rule(env, config: Dict[str, R]) -> R: @@ -125,6 +128,12 @@ def get_rule_relativeTime(env, config: Dict[str, R]) -> R: # get the first monday of the next_month temp_date = datetime(next_year, next_month, 1) absoluteDay = temp_date + timedelta(days=((6-temp_date.weekday())+1)%7) + elif start_relative_time == "first monday eight months later": + next_year = now.year + 1 if now.month >= 5 else now.year + next_month = (now.month + 8)%12 + # get the first monday of the next_month + temp_date = datetime(next_year, next_month, 1) + absoluteDay = temp_date + timedelta(days=((6-temp_date.weekday())+1)%7) regular_time = apply_rules_to_timeFormat(relativeRules["expected"]["time"], absoluteDay) config["rules"]["expected"]["time"] = regular_time @@ -144,12 +153,20 @@ def get_rule_relativeTime(env, config: Dict[str, R]) -> R: next_month = now.month + 1 if now.month < 12 else 1 next_day = 10 from_absoluteDay = datetime(next_year, next_month, next_day) - elif from_time == "next Monday": + elif from_time == "next Monday" or from_time == "next Monday split": from_absoluteDay = now + timedelta(days=((6-now.weekday())+1)) else: pass # more rules here - regular_from_time = apply_rules_to_timeFormat(relativeRules["expected"]["from"], from_absoluteDay) - config["rules"]["expected"]["from"] = 
regular_from_time + if from_time == "next Monday split": + puday = apply_rules_to_timeFormat(relativeRules["expected"]["puDay"], from_absoluteDay) + config["rules"]["expected"]["puDay"] = puday + pumonth = apply_rules_to_timeFormat(relativeRules["expected"]["puMonth"], from_absoluteDay) + config["rules"]["expected"]["puMonth"] = pumonth + puyear = apply_rules_to_timeFormat(relativeRules["expected"]["puYear"], from_absoluteDay) + config["rules"]["expected"]["puYear"] = puyear + else: + regular_from_time = apply_rules_to_timeFormat(relativeRules["expected"]["from"], from_absoluteDay) + config["rules"]["expected"]["from"] = regular_from_time # deal with to_time if relativeTime_to_IntDay[to_time] != "special": @@ -164,15 +181,23 @@ def get_rule_relativeTime(env, config: Dict[str, R]) -> R: next_month = now.month + 1 if now.month < 12 else 1 next_day = 11 to_absoluteDay = datetime(next_year, next_month, next_day) - elif to_time == "next Friday": + elif to_time == "next Friday" or to_time == "next Friday split": if now.weekday() < 4 and from_time in ["next Monday"]: to_absoluteDay = now + timedelta(days=((4-now.weekday())+7)) else: to_absoluteDay = now + timedelta(days=((4-now.weekday()) if now.weekday() < 4 else (6-now.weekday()) + 5)) else: pass # more rules here - regular_to_time = apply_rules_to_timeFormat(relativeRules["expected"]["to"], to_absoluteDay) - config["rules"]["expected"]["to"] = regular_to_time + if to_time == "next Friday split": + to_day = apply_rules_to_timeFormat(relativeRules["expected"]["doDay"], to_absoluteDay) + config["rules"]["expected"]["doDay"] = to_day + to_month = apply_rules_to_timeFormat(relativeRules["expected"]["doMonth"], to_absoluteDay) + config["rules"]["expected"]["doMonth"] = to_month + to_year = apply_rules_to_timeFormat(relativeRules["expected"]["doYear"], to_absoluteDay) + config["rules"]["expected"]["doYear"] = to_year + else: + regular_to_time = apply_rules_to_timeFormat(relativeRules["expected"]["to"], to_absoluteDay) + 
config["rules"]["expected"]["to"] = regular_to_time return config["rules"] @@ -186,6 +211,7 @@ def apply_rules_to_timeFormat(timeFormat: str, absoluteDay: datetime): timeFormat = timeFormat.replace("{month}", month_mapping_full[absoluteDay.month], 1) timeFormat = timeFormat.replace("{MonthFull}", Month_Mapping_Full[absoluteDay.month], 1) timeFormat = timeFormat.replace("{Day0D}", "0"+str(absoluteDay.day) if absoluteDay.day < 10 else str(absoluteDay.day), 1) + timeFormat = timeFormat.replace("{MonthD}", str(absoluteDay.month), 1) # you can add other replace rules here return timeFormat diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 19a450d..c628206 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -21,7 +21,8 @@ from .chrome import ( is_expected_url_pattern_match, is_added_to_steam_cart, is_expected_installed_extensions, - compare_pdf_images + compare_pdf_images, + is_expected_active_tab_approximate ) from .docs import ( compare_font_names, diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index cc12e29..8b04d31 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -36,6 +36,34 @@ def is_expected_active_tab(active_tab_info: Dict[str, str], rule: Dict[str, Any] return 0 +def is_expected_active_tab_approximate(active_tab_info: Dict[str, str], rule: Dict[str, Any]) -> float: + """ + Checks if the expected active tab is open in Chrome, ignoring query parameters in the URL. + """ + if not active_tab_info: + return 0. 
+ + match_type = rule['type'] + + if match_type == "url": + expected_url = rule['url'] + if isinstance(active_tab_info, Dict): + actual_url = active_tab_info.get('url', None) + else: + actual_url = active_tab_info + from urllib.parse import urlparse, urlunparse + def strip_query(url): + parsed = urlparse(url) + return urlunparse(parsed._replace(query="")) + if strip_query(expected_url) == strip_query(actual_url): + return 1 + else: + return 0 + else: + logger.error(f"Unknown type: {match_type}") + return 0 + + # rules[expected] is a string-formatted regex def is_expected_url_pattern_match(result, rules) -> float: """ @@ -335,7 +363,12 @@ def is_shortcut_on_desktop(shortcuts: Dict[str, str], rule): for shortcut_path, shortcut_content in shortcuts.items(): if "Name=" + rule['name'] + "\n" in shortcut_content: return 1. - return 0. + return 0.0 + elif rule['type'] == 'exec': + for shortcut_path, shortcut_content in shortcuts.items(): + if "Exec=" + rule['exec'] + "\n" in shortcut_content: + return 1. 
+ return 0.0 elif rule['type'] == 'url': raise TypeError(f"{rule['type']} not support yet!") elif rule['type'] == 'id': diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 6e6dafb..70e78ed 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -33,7 +33,7 @@ DEFAULT_REGION = "us-east-1" # todo: public the AMI images # ami-05e7d7bd279ea4f14 IMAGE_ID_MAP = { - "us-east-1": "ami-00674d875de9addc1", + "us-east-1": "ami-03a22c6e501415fb1", "ap-east-1": "ami-0c092a5b8be4116f5", } diff --git a/evaluation_examples/examples/chrome/12086550-11c0-466b-b367-1d9e75b3910e.json b/evaluation_examples/examples/chrome/12086550-11c0-466b-b367-1d9e75b3910e.json index 0f168e1..2ab898b 100644 --- a/evaluation_examples/examples/chrome/12086550-11c0-466b-b367-1d9e75b3910e.json +++ b/evaluation_examples/examples/chrome/12086550-11c0-466b-b367-1d9e75b3910e.json @@ -29,7 +29,7 @@ "chrome" ], "evaluator": { - "func": "is_expected_active_tab", + "func": "is_expected_active_tab_approximate", "result": { "type": "active_url_from_accessTree", "goto_prefix": "" diff --git a/evaluation_examples/examples/chrome/1704f00f-79e6-43a7-961b-cedd3724d5fd.json b/evaluation_examples/examples/chrome/1704f00f-79e6-43a7-961b-cedd3724d5fd.json index f486e25..ea61155 100644 --- a/evaluation_examples/examples/chrome/1704f00f-79e6-43a7-961b-cedd3724d5fd.json +++ b/evaluation_examples/examples/chrome/1704f00f-79e6-43a7-961b-cedd3724d5fd.json @@ -59,13 +59,16 @@ ] }, { - "type": "active_tab_html_parse", + "type": "active_tab_url_parse", "goto_prefix": "https://www.", - "category": "xpath", - "xpathObject": { - "/html/body/main/div/div/div/section/div/div/div/div[1]/div[1]/p": "from", - "/html/body/main/div/div/div/section/div/div/div/div[1]/div[3]/p": "to" - } + "parse_keys": [ + "puDay", + "puMonth", + "puYear", + "doDay", + "doMonth", + "doYear" + ] } ], "expected": [ @@ -77,6 +80,7 @@ "dropLocationName": "Zürich", 
"filterCriteria_carCategory": "large", "filterCriteria_sortBy": "PRICE" + } } }, @@ -84,12 +88,16 @@ "type": "rule_relativeTime", "rules": { "relativeTime": { - "from": "next Monday", - "to": "next Friday" + "from": "next Monday split", + "to": "next Friday split" }, "expected": { - "from": "{DoW}, {DayD} {Month} {Year}, 10:00", - "to": "{DoW}, {DayD} {Month} {Year}, 10:00" + "puDay": "{DayD}", + "puMonth": "{MonthD}", + "puYear": "{Year}", + "doDay": "{DayD}", + "doMonth": "{MonthD}", + "doYear":"{Year}" } } } diff --git a/evaluation_examples/examples/chrome/2888b4e6-5b47-4b57-8bf5-c73827890774.json b/evaluation_examples/examples/chrome/2888b4e6-5b47-4b57-8bf5-c73827890774.json index ad7842e..b33b9dd 100644 --- a/evaluation_examples/examples/chrome/2888b4e6-5b47-4b57-8bf5-c73827890774.json +++ b/evaluation_examples/examples/chrome/2888b4e6-5b47-4b57-8bf5-c73827890774.json @@ -43,18 +43,28 @@ "chrome" ], "evaluator": { - "func": "exact_match", + "func": "check_direct_json_object", "result": { - "type": "url_dashPart", + "type": "url_path_parse", "goto_prefix": "https://www.", - "partIndex": -1, - "needDeleteId": true, - "returnType": "string" + "parse_keys": [ + "mens_clothing", + "t_shirts", + "Men_regular_size_t", + "Price_discount_range", + "Sleeve_length" + ] }, "expected": { "type": "rule", "rules": { - "expected": "Stripe,Men,L,Short%20Sleeve,Sales%20%26%20Discounts" + "expected": { + "mens_clothing": true, + "t_shirts": true, + "Men_regular_size_t": "L", + "Price_discount_range": "50_PERCENT_ off & more", + "Sleeve_length": "Short Sleeve" + } } } }, diff --git a/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json b/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json index b549f24..79dd4f4 100644 --- a/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json +++ b/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json @@ -44,8 +44,8 @@ "expected": { "type": "rule", 
"rules": { - "type": "name", - "name": "Play Puzzle Game 2048" + "type": "exec", + "exec": "/opt/google/chrome/google-chrome --profile-directory=Default --app-id=poahllcmmahlafplfhgjomkjmeblpapf" } } }, diff --git a/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json b/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json index 6133680..d63ec3f 100644 --- a/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json +++ b/evaluation_examples/examples/chrome/6c4c23a1-42a4-43cc-9db1-2f86ff3738cc.json @@ -49,11 +49,11 @@ "goto_prefix": "https://www.", "category": "class", "class_singleObject": { - "search-date": "time", - "price-in-tabs__nav--selected": "category" + "mach-flight-context-info__wrapper--date": "time", + "mach-global-tabs-small__wrapper__tab--active": "category" }, - "class_multiObject": { - "search-segment-cities__city": { + "class_multiObject_child": { + "mach-flight-context-info__wrapper__info--separator": { "0": "start", "1": "end" } @@ -68,7 +68,7 @@ "expected": { "start": "SEA", "end": "NYC", - "time": "{DoW}, {Month} {DayD}, {Year}", + "time": "{DoW}, {Month} {Day0D}, {Year}", "category": "Miles" } } diff --git a/evaluation_examples/examples/chrome/7f52cab9-535c-4835-ac8c-391ee64dc930.json b/evaluation_examples/examples/chrome/7f52cab9-535c-4835-ac8c-391ee64dc930.json index 5f1288c..d2fe638 100644 --- a/evaluation_examples/examples/chrome/7f52cab9-535c-4835-ac8c-391ee64dc930.json +++ b/evaluation_examples/examples/chrome/7f52cab9-535c-4835-ac8c-391ee64dc930.json @@ -43,24 +43,52 @@ "chrome" ], "evaluator": { - "func": "check_direct_json_object", - "result": { - "type": "active_tab_url_parse", - "goto_prefix": "https://www.", - "parse_keys": [ - "q", - "tbs" - ] - }, - "expected": { - "type": "rule", - "rules": { - "expected": { - "q": "drip coffee maker", - "tbs": "mr:1,price:1,ppr_min:25,ppr_max:60,sales:1,pdtr0:1825161|1825162" + "func": [ + "check_direct_json_object", + 
"check_direct_json_object" + ], + "result": [ + { + "type": "active_tab_url_parse", + "goto_prefix": "https://www.", + "parse_keys": [ + "q" + ] + }, + { + "type": "active_tab_html_parse", + "goto_prefix": "https://www.", + "category": "class", + "class_multiObject_search_exist": { + "fT28tf":[ + "Black", + "$25 - $60", + "is_other_exist" + ] } } - } + ], + "expected": [ + { + "type": "rule", + "rules": { + "expected": { + "q": "drip coffee maker" + }, + "expect_in_result": true + } + }, + { + "type": "rule", + "rules": { + "expected": { + "Black": true, + "$25 - $60": true, + "is_other_exist": false + } + } + } + ] }, "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/9f3f70fc-5afc-4958-a7b7-3bb4fcb01805.json b/evaluation_examples/examples/chrome/9f3f70fc-5afc-4958-a7b7-3bb4fcb01805.json index 2fd8b0d..37443ae 100644 --- a/evaluation_examples/examples/chrome/9f3f70fc-5afc-4958-a7b7-3bb4fcb01805.json +++ b/evaluation_examples/examples/chrome/9f3f70fc-5afc-4958-a7b7-3bb4fcb01805.json @@ -43,41 +43,37 @@ "chrome" ], "evaluator": { - "func": [ - "is_expected_url_pattern_match", - "check_direct_json_object" - ], - "conj": "and", - "result": [ - { - "type": "active_tab_info" + "func": "check_direct_json_object", + "result": { + "type": "active_tab_html_parse", + "category": "class&url", + "class_multiObject": { + "filter-selector-link": [ + "over $60", + "women", + "jerseys", + "nike" + ] }, - { - "type": "active_tab_html_parse", - "category": "xpath", - "xpathObject": { - "/html/body/div[2]/div/div[6]/div[2]/div[2]/div/div[1]/div[4]/ul/li[2]": "money" + "url_include_expected": [ + "over $60", + "women", + "jerseys", + "nike" + ] + }, + "expected": { + "type": "rule", + "rules": { + "expected": { + "over $60": true, + "women": true, + "jerseys": true, + "nike": true, + "is_other_exist": false } } - ], - "expected": [ - { - "type": "rule", - "rules": { - "expected": [ - "/women-jerseys/" - ] - } - }, - { - "type": "rule", - 
"rules": { - "expected": { - "money": "over $60" - } - } - } - ] + } }, "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index 4c2fd05..6652ab7 100644 --- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "Find the next available date for Albion Basin.", + "instruction": "Find the next available date for Diamond.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/c1fa57f3-c3db-4596-8f09-020701085416.json b/evaluation_examples/examples/chrome/c1fa57f3-c3db-4596-8f09-020701085416.json index e20905f..f12ddf0 100644 --- a/evaluation_examples/examples/chrome/c1fa57f3-c3db-4596-8f09-020701085416.json +++ b/evaluation_examples/examples/chrome/c1fa57f3-c3db-4596-8f09-020701085416.json @@ -1,7 +1,7 @@ { "id": "c1fa57f3-c3db-4596-8f09-020701085416", "snapshot": "chrome", - "instruction": "Open the baggage fee calculator.", + "instruction": "Open the baggage fee calculator in United Airlines website.", "source": "test_task_1", "config": [ { @@ -62,7 +62,7 @@ "type": "rule", "rules": { "expected": [ - "checked-bag-fee-calculator" + "united.com/en/us/checked-bag-fee-calculator" ] } } diff --git a/evaluation_examples/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825.json b/evaluation_examples/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825.json index 567a89d..ec98f63 100644 --- a/evaluation_examples/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825.json +++ b/evaluation_examples/examples/chrome/cabb3bae-cccb-41bd-9f5d-0f3a9fecd825.json @@ -43,21 +43,41 @@ "chrome" ], "evaluator": { - "func": "is_expected_url_pattern_match", + "func": 
"check_direct_json_object", "result": { - "type": "active_url_from_accessTree", - "goto_prefix": "https://www." + "type": "active_tab_html_parse", + "goto_prefix": "https://www.", + "category": "class&url", + "class_multiObject_li": { + "pmpSearch_breadcrumb": [ + "Spider-Man", + "Toys", + "Kids" + ], + "sbSelector": [ + "Price Low-High" + ] + }, + "url_include_expected_multichoice": { + "Spider-Man": "Spider-Man", + "spiderman": "Spider-Man", + "Toys": "Toys", + "Kids": "Kids", + "S=4": "Price Low-High" + } }, "expected": { "type": "rule", "rules": { - "expected": [ - "AgeAppropriate:Kids", - "search=spider[-%20]?man%20toys", - "S=4" - ] + "expected": { + "spider-man": true, + "toys": true, + "kids": true, + "price low-high": true, + "is_other_exist": false + } } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/da46d875-6b82-4681-9284-653b0c7ae241.json b/evaluation_examples/examples/chrome/da46d875-6b82-4681-9284-653b0c7ae241.json index 48e7c15..5148fb7 100644 --- a/evaluation_examples/examples/chrome/da46d875-6b82-4681-9284-653b0c7ae241.json +++ b/evaluation_examples/examples/chrome/da46d875-6b82-4681-9284-653b0c7ae241.json @@ -1,7 +1,7 @@ { "id": "da46d875-6b82-4681-9284-653b0c7ae241", "snapshot": "chrome", - "instruction": "Schedule an appointment to apply for transportation access pass in the Charlie Card store on the first Monday four months later, 10:15 am, fill in my details (James Smith, james.smith@gmail.com). And don not click \"book\" directly. Let me review it.", + "instruction": "Book an appointment to apply for a transportation access pass at the Charlie Card store on the first Monday eight months later, 10:15 am, fill in my details (James Smith, james.smith@gmail.com). And don not click \"book\" directly. 
Let me review it.", "source": "test_task_2", "config": [ { @@ -56,11 +56,10 @@ { "type": "active_tab_html_parse", "category": "class", - "class_singleObject": {}, - "class_multiObject": { - "breakword": { - "1": "content", - "2": "time" + "class_multiObject_only_child": { + "HAZ16": { + "0": "content", + "1": "time" } } }, @@ -68,8 +67,8 @@ "type": "active_tab_html_parse", "category": "input", "inputObject": { - "/html/body/div/div/form/div[7]/div/div/div[1]/input[1]": "name", - "/html/body/div/div/form/div[7]/div/div/div[1]/input[2]": "mail" + "/html/body/div[2]/div/form/div[7]/div/div/div[1]/input[1]": "name", + "/html/body/div[2]/div/form/div[7]/div/div/div[1]/input[2]": "mail" } } ], @@ -78,7 +77,7 @@ "type": "rule", "rules": { "expected": [ - "CharlieCardStoreAppointments@mbta.com/bookings/" + "book/CharlieCardStoreAppointments@mbta.com/" ] } }, @@ -86,11 +85,11 @@ "type": "rule_relativeTime", "rules": { "relativeTime": { - "from": "first monday four months later" + "from": "first monday eight months later" }, "expected": { "content": "Apply for Transportation Access Pass (TAP) CharlieCard non-auto approval", - "time": "{MonthFull} {Day0D}, 10:15 am" + "time": "{MonthFull} {Day0D}, 10:15 AM" } } }, diff --git a/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json b/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json index c72a212..fdf471f 100644 --- a/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json +++ b/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json @@ -1,7 +1,7 @@ { "id": "e1e75309-3ddb-4d09-92ec-de869c928143", "snapshot": "chrome", - "instruction": "Computer, can you turn the webpage I'm looking at into a PDF file and put it on my main screen, you know, the Desktop?", + "instruction": "Computer, can you turn the webpage I'm looking at into a PDF file, save it to my Desktop with the default filename and set the margins to none?", "source": 
"https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php", "config": [ { diff --git a/evaluation_examples/examples/chrome/f0b971a1-6831-4b9b-a50e-22a6e47f45ba.json b/evaluation_examples/examples/chrome/f0b971a1-6831-4b9b-a50e-22a6e47f45ba.json index a767e34..38e2f65 100644 --- a/evaluation_examples/examples/chrome/f0b971a1-6831-4b9b-a50e-22a6e47f45ba.json +++ b/evaluation_examples/examples/chrome/f0b971a1-6831-4b9b-a50e-22a6e47f45ba.json @@ -1,7 +1,7 @@ { "id": "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", "snapshot": "chrome", - "instruction": "Show me the scores for the 2019 super bowl", + "instruction": "Please help me find the score record for the 2019 Super Bowl in the NFL website.", "source": "Mind2Web", "config": [ { @@ -62,7 +62,7 @@ "type": "rule", "rules": { "type": "url", - "url": "https://www.nfl.com/scores/2019/POST4" + "url": "https://www.nfl.com/scores/2019/post4" } } }, diff --git a/evaluation_examples/examples/chrome/f5d96daf-83a8-4c86-9686-bada31fc66ab.json b/evaluation_examples/examples/chrome/f5d96daf-83a8-4c86-9686-bada31fc66ab.json index 580cc8b..29c13b0 100644 --- a/evaluation_examples/examples/chrome/f5d96daf-83a8-4c86-9686-bada31fc66ab.json +++ b/evaluation_examples/examples/chrome/f5d96daf-83a8-4c86-9686-bada31fc66ab.json @@ -1,7 +1,7 @@ { "id": "f5d96daf-83a8-4c86-9686-bada31fc66ab", "snapshot": "chrome", - "instruction": "Compare iPhone 15 Pro Max with iPhone 13 Pro Max", + "instruction": "Compare iPhone 15 Pro Max with iPhone 14 Pro Max and iPhone 13 Pro Max", "source": "Mind2Web", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json b/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json index 7bf0404..695a9d2 100644 --- a/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json +++ b/evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json @@ -104,5 +104,5 @@ "ignore_case": true } }, - 
"proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/settings/proxy/dataimpulse.json b/evaluation_examples/settings/proxy/dataimpulse.json index 2e7e65a..34b7d59 100644 --- a/evaluation_examples/settings/proxy/dataimpulse.json +++ b/evaluation_examples/settings/proxy/dataimpulse.json @@ -2,8 +2,8 @@ { "host": "gw.dataimpulse.com", "port": 823, - "username": "your_username", - "password": "your_password", + "username": "fba5ac061fe18be70c6c", + "password": "3b5669b6640fc80c", "protocol": "http", "provider": "dataimpulse", "type": "residential", diff --git a/evaluation_examples/test_fix_chrome.json b/evaluation_examples/test_fix_chrome.json new file mode 100644 index 0000000..7f9ed93 --- /dev/null +++ b/evaluation_examples/test_fix_chrome.json @@ -0,0 +1,50 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "06fe7178-4491-4589-810f-2e2bc9502122", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "af630914-714e-4a24-a7bb-f9af687d3b91", + "3720f614-37fd-4d04-8a6b-76f54f8c222d", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "12086550-11c0-466b-b367-1d9e75b3910e", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", + "ae78f875-5b98-4907-bbb5-9c737fc68c03", + "3299584d-8f11-4457-bf4c-ce98f7600250", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "9656a811-9b5b-4ddf-99c7-5117bcef0626", + "fc6d8143-9452-4171-9459-7f515143419a", + "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", + "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", + "47543840-672a-467d-80df-8f7c3b9788c9", + "c1fa57f3-c3db-4596-8f09-020701085416", + 
"da46d875-6b82-4681-9284-653b0c7ae241",
+    "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc",
+    "f79439ad-3ee8-4f99-a518-0eb60e5652b0",
+    "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8",
+    "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805",
+    "7f52cab9-535c-4835-ac8c-391ee64dc930",
+    "82279c77-8fc6-46f6-9622-3ba96f61b477",
+    "2888b4e6-5b47-4b57-8bf5-c73827890774",
+    "b4f95342-463e-4179-8c3f-193cd7241fb2",
+    "f5d96daf-83a8-4c86-9686-bada31fc66ab",
+    "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba",
+    "368d9ba4-203c-40c1-9fa3-da2f1430ce63",
+    "59155008-fe71-45ec-8a8f-dc35497b6aa8",
+    "a728a36e-8bf1-4bb6-9a03-ef039a5233f0",
+    "b070486d-e161-459b-aa2b-ef442d973b92",
+    "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
+    "9f935cce-0a9f-435f-8007-817732bfc0a5",
+    "f0b971a1-6831-4b9b-a50e-22a6e47f45ba",
+    "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
+  ]
+}
\ No newline at end of file
diff --git a/manual_examine.py b/manual_examine.py
new file mode 100644
index 0000000..ddd69b7
--- /dev/null
+++ b/manual_examine.py
@@ -0,0 +1,362 @@
+from __future__ import annotations
+import argparse
+import datetime
+import json
+import logging
+import os
+import sys
+import signal
+import time
+from typing import List, Dict
+from tqdm import tqdm
+from desktop_env.desktop_env import DesktopEnv
+
+# Module-level state shared with signal_handler() so it can close the VM.
+active_environment = None
+is_terminating = False
+
+# Load environment variables from a local .env file when one is present.
+if os.path.exists(".env"):
+    from dotenv import load_dotenv
+    load_dotenv()
+
+# Logger Configs {{{ #
+def config() -> argparse.Namespace:
+    """Parse the command-line options for a manual examination run.
+
+    Returns:
+        The parsed arguments; ``--domain`` and ``--example_id`` are required,
+        every other option has a default.
+    """
+    parser = argparse.ArgumentParser(
+        description="Manual examination of benchmark tasks"
+    )
+
+    # environment config
+    parser.add_argument("--path_to_vm", type=str, default=None)
+    parser.add_argument(
+        "--headless", action="store_true", help="Run in headless machine"
+    )
+    parser.add_argument(
+        "--action_space", type=str, default="pyautogui", help="Action type"
+    )
+    parser.add_argument(
+        "--observation_type",
+        choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
+        default="screenshot",
+        help="Observation type",
+    )
+    parser.add_argument("--screen_width", type=int, default=1920)
+    parser.add_argument("--screen_height", type=int, default=1080)
+    parser.add_argument("--sleep_after_execution", type=float, default=0.0)
+    parser.add_argument("--max_steps", type=int, default=15)
+
+    # agent config
+    parser.add_argument("--max_trajectory_length", type=int, default=3)
+    parser.add_argument(
+        "--test_config_base_dir", type=str, default="evaluation_examples"
+    )
+
+    # example config
+    parser.add_argument("--domain", type=str, required=True, help="Specific domain to examine")
+    parser.add_argument("--example_id", type=str, required=True, help="Specific example ID to examine")
+    parser.add_argument(
+        "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
+    )
+
+    # logging related
+    parser.add_argument("--result_dir", type=str, default="./results_manual")
+    parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
+                        default='INFO', help="Set the logging level")
+
+    # aws config
+    parser.add_argument(
+        "--region", type=str, default="us-east-1", help="AWS region for the VM"
+    )
+    args = parser.parse_args()
+    return args
+
+args = config()  # Get command line arguments first
+
+logger = logging.getLogger()
+log_level = getattr(logging, args.log_level.upper())
+logger.setLevel(log_level)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+
+# logging.FileHandler does not create missing directories, so make sure the
+# "logs" directory exists before the handlers below open their files.
+os.makedirs("logs", exist_ok=True)
+file_handler = logging.FileHandler(
+    os.path.join("logs", "manual-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join("logs", "manual-debug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+stdout_handler = logging.StreamHandler(sys.stdout)
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(log_level)
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+
+# Only records from the "desktopenv" logger hierarchy are echoed to stdout.
+stdout_handler.addFilter(logging.Filter("desktopenv"))
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+# }}} Logger Configs #
+
+logger = logging.getLogger("desktopenv.experiment")
+
+
+def setup_example_logger(example, example_result_dir):
+    """Create the per-example logger that writes to ``runtime.log``.
+
+    Args:
+        example: Task configuration dict; its ``id`` names the logger.
+        example_result_dir: Directory in which ``runtime.log`` is created.
+
+    Returns:
+        The ``desktopenv.example.{id}`` logger with a file handler attached.
+    """
+    runtime_logger = logging.getLogger(f"desktopenv.example.{example['id']}")
+    runtime_logger.setLevel(logging.DEBUG)
+    runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log")))
+    return runtime_logger
+
+
+def run_manual_examination(env, example, instruction, args, example_result_dir):
+    """Let a human perform one task in the VM, then evaluate the outcome.
+
+    Args:
+        env: Environment object providing ``reset``, ``_get_obs``,
+            ``evaluate`` and ``controller`` recording methods.
+        example: Task configuration dict passed to ``env.reset``.
+        instruction: Task instruction shown to the operator.
+        args: Parsed command-line arguments (``domain`` and ``example_id``
+            are read here).
+        example_result_dir: Directory receiving screenshots, logs and results.
+
+    Returns:
+        The value returned by ``env.evaluate()``, or ``None`` when the
+        operator aborts at the prompt.
+    """
+    setup_example_logger(example, example_result_dir)
+
+    # Reset the environment and load the task configuration.
+    env.reset(task_config=example)
+    logger.info("环境正在初始化,请等待60秒...")
+    time.sleep(60)  # Wait for the environment to be ready
+
+    # Capture the initial observation.
+    obs = env._get_obs()
+
+    # Save a screenshot of the initial state.
+    initial_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+    with open(os.path.join(example_result_dir, f"initial_state_{initial_timestamp}.png"), "wb") as f:
+        f.write(obs['screenshot'])
+
+    # Record the task metadata.
+    with open(os.path.join(example_result_dir, "task_info.json"), "w", encoding="utf-8") as f:
+        json.dump({
+            "domain": args.domain,
+            "example_id": args.example_id,
+            "instruction": instruction,
+            "initial_timestamp": initial_timestamp,
+            "example_config": example
+        }, f, indent=2, ensure_ascii=False)
+
+    # Start the screen recording.
+    env.controller.start_recording()
+
+    logger.info("="*80)
+    logger.info(f"任务域: {args.domain}")
+    logger.info(f"样例ID: {args.example_id}")
+    logger.info(f"任务指令: {instruction}")
+    logger.info("="*80)
+    logger.info("环境已准备就绪!")
+    logger.info("请在虚拟机中手动执行任务...")
+    logger.info("完成后请按回车键继续进行评估...")
+    logger.info("="*80)
+
+    # Block until the operator has finished the task by hand. EOFError is
+    # raised when stdin is closed (e.g. a non-interactive run); treat it as
+    # an abort as well instead of letting it escape as an unexpected error.
+    try:
+        input("按回车键开始评估...")
+    except (KeyboardInterrupt, EOFError):
+        logger.info("用户中断操作")
+        return None
+
+    logger.info("开始评估...")
+
+    # Save a screenshot of the final state.
+    final_obs = env._get_obs()
+    final_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+    with open(os.path.join(example_result_dir, f"final_state_{final_timestamp}.png"), "wb") as f:
+        f.write(final_obs['screenshot'])
+
+    # Evaluate the outcome.
+    result = env.evaluate()
+    logger.info(f"评估结果: {result:.2f}")
+
+    # Persist the score.
+    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+        f.write(f"{result}\n")
+
+    # Persist the execution record.
+    with open(os.path.join(example_result_dir, "execution_log.jsonl"), "w", encoding="utf-8") as f:
+        f.write(json.dumps({
+            "type": "manual_execution",
+            "initial_timestamp": initial_timestamp,
+            "final_timestamp": final_timestamp,
+            "result": result,
+            "initial_screenshot": f"initial_state_{initial_timestamp}.png",
+            "final_screenshot": f"final_state_{final_timestamp}.png"
+        }, ensure_ascii=False))
+        f.write("\n")
+
+    # Stop the recording and save it next to the other artifacts.
+    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+
+    return result
+
+
+def signal_handler(signum, frame):
+    """Close the active environment and exit when SIGINT/SIGTERM arrives.
+
+    Args:
+        signum: Number of the received signal.
+        frame: Current stack frame (unused).
+    """
+    global is_terminating, active_environment
+
+    # Ignore repeated signals while a shutdown is already in progress.
+    if is_terminating:
+        return
+
+    is_terminating = True
+    logger.info(f"接收到信号 {signum}。正在优雅关闭...")
+
+    # Close the environment, then drop the reference so the ``finally``
+    # block in main() (which still runs on SystemExit) does not close it
+    # a second time.
+    if active_environment:
+        try:
+            logger.info("正在关闭环境...")
+            active_environment.close()
+            logger.info("环境已成功关闭")
+        except Exception as e:
+            logger.error(f"关闭环境时出错: {e}")
+        finally:
+            active_environment = None
+
+    logger.info("关闭完成。退出程序。")
+    sys.exit(0)
+
+
+def main():
+    """Run one manual examination for the task selected on the command line.
+
+    Exits with status 1 when an unexpected error occurs; the environment is
+    closed in every case.
+    """
+    global active_environment
+
+    # Register handlers so Ctrl+C / termination shut the VM down cleanly.
+    signal.signal(signal.SIGINT, signal_handler)  # Handle Ctrl+C
+    signal.signal(signal.SIGTERM, signal_handler)  # Handle termination signal
+
+    try:
+        # Reuse the arguments parsed at import time for the logger setup.
+        logger.info("参数: %s", args)
+
+        # Locate and load the requested task configuration.
+        config_file = os.path.join(
+            args.test_config_base_dir, f"examples/{args.domain}/{args.example_id}.json"
+        )
+
+        if not os.path.exists(config_file):
+            logger.error(f"配置文件不存在: {config_file}")
+            return
+
+        with open(config_file, "r", encoding="utf-8") as f:
+            example = json.load(f)
+
+        # Create the result directory.
+        example_result_dir = os.path.join(
+            args.result_dir,
+            args.action_space,
+            args.observation_type,
+            "manual_examination",
+            args.domain,
+            args.example_id,
+        )
+        os.makedirs(example_result_dir, exist_ok=True)
+
+        # Set up the environment in the region given by --region (default
+        # "us-east-1") instead of a hard-coded one.
+        from desktop_env.providers.aws.manager import IMAGE_ID_MAP
+        if args.region not in IMAGE_ID_MAP:
+            logger.error(f"不支持的AWS区域: {args.region}")
+            return
+        active_environment = DesktopEnv(
+            path_to_vm=args.path_to_vm,
+            action_space=args.action_space,
+            provider_name="aws",
+            region=args.region,
+            snapshot_name=IMAGE_ID_MAP[args.region],
+            screen_size=(args.screen_width, args.screen_height),
+            headless=args.headless,
+            os_type="Ubuntu",
+            require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
+            enable_proxy=True
+        )
+
+        # Run the manual examination.
+        result = run_manual_examination(
+            active_environment,
+            example,
+            example["instruction"],
+            args,
+            example_result_dir
+        )
+
+        if result is not None:
+            logger.info(f"手动检查完成。最终结果: {result:.2f}")
+        else:
+            logger.info("手动检查被中断")
+
+    except KeyboardInterrupt:
+        logger.info("主进程接收到KeyboardInterrupt")
+        # Cleanup happens in the ``finally`` block below.
+    except Exception as e:
+        logger.error(f"主进程中的意外错误: {e}", exc_info=True)
+        # Report failure to the caller; ``finally`` still closes the VM.
+        sys.exit(1)
+    finally:
+        # Final cleanup in case the environment is still alive.
+        logger.info("主进程最终清理...")
+        if active_environment is not None:
+            try:
+                logger.info("在最终清理中关闭环境...")
+                active_environment.close()
+                logger.info("在最终清理中环境已成功关闭")
+            except Exception as e:
+                logger.error(f"最终环境清理期间出错: {e}")
+            finally:
+                active_environment = None
+
+
+if __name__ == "__main__":
+    # Disable tokenizers parallelism to avoid warnings.
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    main()
diff --git a/monitor/.env b/monitor/.env
index 05618af..62ba076 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
# Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_all.json
+TASK_CONFIG_PATH=../evaluation_examples/test_fix_chrome.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all
+RESULTS_BASE_PATH=../results_chrome_operator
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
diff --git a/run_human_examine.sh b/run_human_examine.sh
new file mode 100644
index 0000000..c8e8447
--- /dev/null
+++ b/run_human_examine.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Launch manual_examine.py for a single chrome task so a human can perform it.
+# Change --example_id to examine a different task in the same domain.
+set -euo pipefail
+
+python manual_examine.py \
+    --headless \
+    --observation_type screenshot \
+    --result_dir ./results_human_examine_chrome_fix_1 \
+    --test_all_meta_path evaluation_examples/test_fix_chrome.json \
+    --region us-east-1 \
+    --domain chrome \
+    --example_id 030eeff7-b492-4218-b312-701ec99ee0cc \
+    --max_steps 3
diff --git a/run_operator.sh b/run_operator.sh
new file mode 100644
index 0000000..da52d1a
--- /dev/null
+++ b/run_operator.sh
@@ -0,0 +1,13 @@
+#!/usr/bin/env bash
+# Run run_multienv_openaicua.py on the task list in test_fix_chrome.json
+# with the computer-use-preview model and --num_envs 10.
+set -euo pipefail
+
+python run_multienv_openaicua.py \
+    --headless \
+    --observation_type screenshot \
+    --model computer-use-preview \
+    --result_dir ./results_chrome_operator \
+    --test_all_meta_path evaluation_examples/test_fix_chrome.json \
+    --region us-east-1 \
+    --max_steps 150 \
+    --num_envs 10