From 5d90faa548e88ce8883f0fa1ed3db4093952ebce Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Mon, 14 Jul 2025 07:13:17 +0000 Subject: [PATCH 1/7] run operagor --- .../chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json | 2 +- .../chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json | 2 +- evaluation_examples/settings/proxy/dataimpulse.json | 4 ++-- monitor/.env | 4 ++-- run_operator.sh | 9 +++++++++ show_result.py | 2 +- 6 files changed, 16 insertions(+), 7 deletions(-) create mode 100644 run_operator.sh diff --git a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json index 9b37187..a93c959 100644 --- a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json +++ b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json @@ -57,5 +57,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json index 9e5d730..6bdffe9 100644 --- a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json +++ b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json @@ -56,5 +56,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/settings/proxy/dataimpulse.json b/evaluation_examples/settings/proxy/dataimpulse.json index 4cd99ac..3c552a5 100644 --- a/evaluation_examples/settings/proxy/dataimpulse.json +++ b/evaluation_examples/settings/proxy/dataimpulse.json @@ -2,8 +2,8 @@ { "host": "gw.dataimpulse.com", "port": 823, - "username": "your_username", - "password": "your_password", + "username": "e750e5abb74376d28361", + "password": "e5ec245537e1e76a", "protocol": "http", "provider": "dataimpulse", "type": "residential", diff --git a/monitor/.env b/monitor/.env index 
05618af..26de7b2 100644 --- a/monitor/.env +++ b/monitor/.env @@ -4,11 +4,11 @@ # Monitor configuration TASK_CONFIG_PATH=../evaluation_examples/test_all.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_all +RESULTS_BASE_PATH=../results_operator_full_test_0713 ACTION_SPACE=pyautogui OBSERVATION_TYPE=screenshot MODEL_NAME=computer-use-preview -MAX_STEPS=150 +MAX_STEPS=100 FLASK_PORT=80 FLASK_HOST=0.0.0.0 FLASK_DEBUG=true \ No newline at end of file diff --git a/run_operator.sh b/run_operator.sh new file mode 100644 index 0000000..154df38 --- /dev/null +++ b/run_operator.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ +--headless \ +--observation_type screenshot \ +--model computer-use-preview \ +--result_dir ./results_operator_full_test_0713 \ +--test_all_meta_path evaluation_examples/test_all.json \ +--max_steps 100 \ +--num_envs 15 \ +--provider_name aws \ No newline at end of file diff --git a/show_result.py b/show_result.py index c6bbbc5..623833d 100644 --- a/show_result.py +++ b/show_result.py @@ -68,4 +68,4 @@ def get_result(action_space, use_model, observation_type, result_dir): if __name__ == '__main__': - get_result("pyautogui", "gpt-4o", "a11y_tree", "./results") + get_result("pyautogui", "computer-use-preview", "screenshot", "./results_operator_full_test_0713") From 08b4cf2c2fa671de88fc8537491b813daa26eeb4 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Tue, 15 Jul 2025 02:09:40 +0000 Subject: [PATCH 2/7] fix infeasible&chome tasks --- .../47543840-672a-467d-80df-8f7c3b9788c9.json | 2 +- .../b4f95342-463e-4179-8c3f-193cd7241fb2.json | 2 +- .../b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json | 16 ++++++++-------- .../fc6d8143-9452-4171-9459-7f515143419a.json | 2 +- mm_agents/openai_cua_agent.py | 9 ++++++++- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json 
b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json index c74fdcf..4829d2d 100644 --- a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json +++ b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json @@ -1,7 +1,7 @@ { "id": "47543840-672a-467d-80df-8f7c3b9788c9", "snapshot": "chrome", - "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", + "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index 7773484..e6fe04f 100644 --- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "List as many of the next available dates for Diamond Campground as possible.", + "instruction": "Find the Next Available dates for Diamond.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json index e84af23..48bf735 100644 --- a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json +++ b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json @@ -66,10 +66,10 @@ "goto_prefix": "https://www.", "category": "xpath", "xpathObject": { - 
"/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to", - "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to", + "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult", "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank" } } @@ -101,10 +101,10 @@ }, "timezone": "America/New_York", "expected": { - "from": "{DoW}, {Month} {Day0D}", - "to": "{DoW}, {Month} {Day0D}", + "from": "Check In{DoW}, {Month} {Day0D}", + "to": "Check Out{DoW}, {Month} {Day0D}", "city": "New York City Hotels", - "adult": "2 guests", + "adult": "Rooms/Guests1 Room, 2 Guests", "rank": "Price (low to high)" } } @@ -112,5 +112,5 @@ ] }, "proxy": true, - "possibility_of_env_change": "medium" + "possibility_of_env_change": "high" } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json index 7fea695..5844e21 100644 --- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json +++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json @@ -1,7 +1,7 @@ { "id": "fc6d8143-9452-4171-9459-7f515143419a", "snapshot": "chrome", - "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.", + "instruction": "Find flights from New 
York–Kennedy Airport to Chicago O'Hare Airport for tomorrow.", "source": "test_task_0", "config": [ { diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index f653a62..315432e 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -671,8 +671,14 @@ class OpenAICUAAgent: action_exit = False thought_exit = False message_exit = False + infeasible_message = False + infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"] for item in response.output: parsed_item = self._handle_item(item) + if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list): + actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""}) + infeasible_message = True + break if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui": actions.append(parsed_item) else: @@ -693,7 +699,7 @@ class OpenAICUAAgent: # state_correct = True # if action_exit and not message_exit: # state_correct = True - if action_exit: + if action_exit and not infeasible_message: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit) @@ -747,6 +753,7 @@ class OpenAICUAAgent: # Convert the action to an Action object step_action = Action(action.get("action", ""), self.action_space) # Execute the action in the environment + print(f"Executing action: {step_action.get_action()}") obs, reward, terminated, info = self.env.step(step_action.get_action()) screenshot_base64 = encode_image(obs["screenshot"]) From 68a9f647f48688a86752493a1904bb633e079254 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Tue, 15 Jul 2025 04:17:34 +0000 Subject: [PATCH 3/7] fix: address https://github.com/xlang-ai/OSWorld/issues/257 by implement fix for PyAutoGUI '<' character bug in command execution. 
Introduced a new function to handle typewrite and press calls, ensuring correct behavior when using '<' in commands. Updated command execution logic to apply this fix before executing user commands. --- desktop_env/desktop_env.py | 91 +++++++++++++++++++++++++++++++++++++- 1 file changed, 89 insertions(+), 2 deletions(-) diff --git a/desktop_env/desktop_env.py b/desktop_env/desktop_env.py index 833e54e..488d4ef 100644 --- a/desktop_env/desktop_env.py +++ b/desktop_env/desktop_env.py @@ -3,6 +3,7 @@ from __future__ import annotations import logging import os import time +import re from typing import Callable, Any, Optional, Tuple from typing import List, Dict, Union @@ -22,6 +23,88 @@ MAX_RETRIES = 5 # Maximum retries for environment setup +def _fix_pyautogui_less_than_bug(command: str) -> str: + """ + Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls. + + This fixes the known PyAutoGUI issue where typing '<' produces '>' instead. + References: + - https://github.com/asweigart/pyautogui/issues/198 + - https://github.com/xlang-ai/OSWorld/issues/257 + + Args: + command (str): The original pyautogui command + + Returns: + str: The fixed command with '<' characters handled properly + """ + # Handle typewrite with '<' characters + def replace_typewrite_less_than(match): + content = match.group(1) + # Split the content by '<' and rebuild with hotkey calls + parts = content.split('<') + if len(parts) == 1: + # No '<' found, return original + return match.group(0) + + # Rebuild the command + result_parts = [] + for i, part in enumerate(parts): + if i == 0: + # First part, just add typewrite if not empty + if part: + result_parts.append(f"pyautogui.typewrite({repr(part)})") + else: + # Add hotkey for '<' and then typewrite for the rest if not empty + result_parts.append('pyautogui.hotkey("shift", ",")') + if part: + result_parts.append(f"pyautogui.typewrite({repr(part)})") + + return '; '.join(result_parts) + + # Handle press('<') calls + def 
replace_press_less_than(match): + return 'pyautogui.hotkey("shift", ",")' + + # Pattern to match typewrite calls with quoted strings + typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)' + # Pattern to match press('<') calls + press_pattern = r'pyautogui\.press\(["\']<["\']\)' + + # First handle press('<') calls + command = re.sub(press_pattern, replace_press_less_than, command) + + # Then handle typewrite calls + def process_typewrite_match(match): + quote_char = match.group(1) + content = match.group(2) + + # Check if content contains '<' + if '<' not in content: + return match.group(0) + + # Split by '<' and rebuild + parts = content.split('<') + result_parts = [] + + for i, part in enumerate(parts): + if i == 0: + # First part + if part: + result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})") + else: + # Add hotkey for '<' and then typewrite for the rest + result_parts.append('pyautogui.hotkey("shift", ",")') + if part: + result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})") + + return '; '.join(result_parts) + + command = re.sub(typewrite_pattern, process_typewrite_match, command) + + return command + + class DesktopEnv(gym.Env): """ DesktopEnv with OpenAI Gym interface. It provides a desktop environment for setting and evaluating desktop automation tasks. 
@@ -341,9 +424,13 @@ class DesktopEnv(gym.Env): else: # the set of all possible python commands insides `pyautogui` if type(action) == str: - self.controller.execute_python_command(action) + # Fix PyAutoGUI '<' character bug before execution + fixed_command = _fix_pyautogui_less_than_bug(action) + self.controller.execute_python_command(fixed_command) elif type(action) == dict: - self.controller.execute_python_command(action['command']) + # Fix PyAutoGUI '<' character bug before execution + fixed_command = _fix_pyautogui_less_than_bug(action['command']) + self.controller.execute_python_command(fixed_command) time.sleep(pause) observation = self._get_obs() From af47ed8fb1d505109431d4c57513304027609063 Mon Sep 17 00:00:00 2001 From: Yuan Mengqi <100453613+yuanmengqi@users.noreply.github.com> Date: Tue, 15 Jul 2025 13:02:42 +0800 Subject: [PATCH 4/7] fix infeasible&chrome tasks (#258) * fix chrome * fix: fix proxy setup * feat&fix: add proxy support in setup and remove hardcoded proxy from example * fix tasks * fix chrome finished * fix * clean chrome_fix code * clean chrome_fix code * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix multiapps * fix chrome 2888b4e6-5b47-4b57-8bf5-c73827890774 * fix some multi_apps tasks * fix some multi_apps tasks * fix password&resolution * fix password&resolution * Improve code logic for password & resolution * edit * Merge branch 'main' into fix_chrome * fix chrome tasks * Merge branch 'fix_chrome' * fix insensible&chrome tasks --------- Co-authored-by: adlsdztony --- .../47543840-672a-467d-80df-8f7c3b9788c9.json | 2 +- .../9f935cce-0a9f-435f-8007-817732bfc0a5.json | 2 +- .../a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json | 2 +- .../b4f95342-463e-4179-8c3f-193cd7241fb2.json | 2 +- .../b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json | 16 ++++++++-------- .../fc6d8143-9452-4171-9459-7f515143419a.json | 2 +- mm_agents/openai_cua_agent.py | 8 +++++++- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git 
a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json index c74fdcf..4829d2d 100644 --- a/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json +++ b/evaluation_examples/examples/chrome/47543840-672a-467d-80df-8f7c3b9788c9.json @@ -1,7 +1,7 @@ { "id": "47543840-672a-467d-80df-8f7c3b9788c9", "snapshot": "chrome", - "instruction": "Show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", + "instruction": "On the current website, show me the cars available for pickup at Boston Logan Intl Airport from the 10th to the 11th of next month, sorted by the number of seats to find the largest capacity.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json index 9b37187..a93c959 100644 --- a/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json +++ b/evaluation_examples/examples/chrome/9f935cce-0a9f-435f-8007-817732bfc0a5.json @@ -57,5 +57,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json index 9e5d730..6bdffe9 100644 --- a/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json +++ b/evaluation_examples/examples/chrome/a728a36e-8bf1-4bb6-9a03-ef039a5233f0.json @@ -56,5 +56,5 @@ } } }, - "proxy": true + "proxy": false } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json index 7773484..e6fe04f 100644 
--- a/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json +++ b/evaluation_examples/examples/chrome/b4f95342-463e-4179-8c3f-193cd7241fb2.json @@ -1,7 +1,7 @@ { "id": "b4f95342-463e-4179-8c3f-193cd7241fb2", "snapshot": "chrome", - "instruction": "List as many of the next available dates for Diamond Campground as possible.", + "instruction": "Find the Next Available dates for Diamond.", "source": "test_task_1", "config": [ { diff --git a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json index e84af23..48bf735 100644 --- a/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json +++ b/evaluation_examples/examples/chrome/b7895e80-f4d1-4648-bee0-4eb45a6f1fa8.json @@ -66,10 +66,10 @@ "goto_prefix": "https://www.", "category": "xpath", "xpathObject": { - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/div/button/div[3]": "from", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/div[3]": "to", - "/html/body/div[1]/main/div[3]/div[2]/div/div[1]/div/h2": "city", - "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/div[3]/span/span[2]": "adult", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[1]/button/span/div/div": "from", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[2]/button/span/div/div": "to", + "/html/body/div[1]/main/div[3]/div[2]/div/div/div/h2": "city", + "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[1]/div/div/div/div[3]/button/span/div/div": "adult", "/html/body/div[1]/main/div[3]/div[5]/div[2]/div/div[3]/div/div[2]/div/div/div[2]/div/button/div/div": "rank" } } @@ -101,10 +101,10 @@ }, "timezone": "America/New_York", "expected": { - "from": "{DoW}, {Month} {Day0D}", - "to": "{DoW}, {Month} {Day0D}", + "from": "Check In{DoW}, {Month} {Day0D}", + "to": "Check 
Out{DoW}, {Month} {Day0D}", "city": "New York City Hotels", - "adult": "2 guests", + "adult": "Rooms/Guests1 Room, 2 Guests", "rank": "Price (low to high)" } } @@ -112,5 +112,5 @@ ] }, "proxy": true, - "possibility_of_env_change": "medium" + "possibility_of_env_change": "high" } \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json index 7fea695..5844e21 100644 --- a/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json +++ b/evaluation_examples/examples/chrome/fc6d8143-9452-4171-9459-7f515143419a.json @@ -1,7 +1,7 @@ { "id": "fc6d8143-9452-4171-9459-7f515143419a", "snapshot": "chrome", - "instruction": "Find the status of tomorrow flights from New York-Kennedy airport to Chicago-O'Hare airport.", + "instruction": "Find flights from New York–Kennedy Airport to Chicago O'Hare Airport for tomorrow.", "source": "test_task_0", "config": [ { diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index f653a62..e615308 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -671,8 +671,14 @@ class OpenAICUAAgent: action_exit = False thought_exit = False message_exit = False + infeasible_message = False + infeasible_word_list = ["infeasible", "unfeasible", "impossible", "not feasible", "cannot be done"] for item in response.output: parsed_item = self._handle_item(item) + if item.type == "message" and any(word in parsed_item.lower() for word in infeasible_word_list): + actions.append({"action_space": "pyautogui", "action": "FAIL", "pending_checks": [], "call_id": ""}) + infeasible_message = True + break if isinstance(parsed_item, dict) and parsed_item.get("action_space", None) == "pyautogui": actions.append(parsed_item) else: @@ -693,7 +699,7 @@ class OpenAICUAAgent: # state_correct = True # if action_exit and not message_exit: # state_correct = True - if 
action_exit: + if action_exit and not infeasible_message: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit) From 451bbf5fc2d75e555ca28109f3c1f04fd6a0f18a Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Tue, 15 Jul 2025 07:24:33 +0000 Subject: [PATCH 5/7] Update multi_apps JSON examples: refined instructions for image processing in GIMP, replaced an open command with a launch command for VLC, and corrected assignment modification instruction in LibreOffice Calc example. --- .../3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json | 2 +- .../68a25bd4-59c7-4f4d-975e-da0c8509c848.json | 10 ++-------- .../778efd0a-153f-4842-9214-f05fc176b877.json | 2 +- .../bc2b57f3-686d-4ec9-87ce-edf850b7e442.json | 2 +- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json index 060aaac..a66980b 100644 --- a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json +++ b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json @@ -1,7 +1,7 @@ { "id": "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", "snapshot": "gimp", - "instruction": "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f/kingbird.jpeg", + "instruction": "Download the image from \"https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f/kingbird.jpeg\", and then use GIMP to compress it to under 600KB as \"compressed.jpeg\" on the Desktop. 
Resize if needed.", "source": "", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json b/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json index 349848d..7ec1346 100644 --- a/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json +++ b/evaluation_examples/examples/multi_apps/68a25bd4-59c7-4f4d-975e-da0c8509c848.json @@ -15,12 +15,6 @@ ] } }, - { - "type": "open", - "parameters": { - "path": "/home/user/Desktop/rsc-ebook-collection-2023.xlsx" - } - }, { "type": "launch", "parameters": { @@ -41,9 +35,9 @@ } }, { - "type": "activate_window", + "type": "open", "parameters": { - "window_name": "Google Chrome" + "path": "/home/user/Desktop/rsc-ebook-collection-2023.xlsx" } } ], diff --git a/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json b/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json index b74ed3d..063397a 100644 --- a/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json +++ b/evaluation_examples/examples/multi_apps/778efd0a-153f-4842-9214-f05fc176b877.json @@ -28,7 +28,7 @@ { "type": "launch", "parameters": { - "command": "vlc", + "command": "VLC_VERBOSE=-1 vlc --no-audio --no-video-title-show /home/user/Desktop/planet.mp4", "shell": true } } diff --git a/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json b/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json index 654b704..f4fc314 100644 --- a/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json +++ b/evaluation_examples/examples/multi_apps/bc2b57f3-686d-4ec9-87ce-edf850b7e442.json @@ -1,7 +1,7 @@ { "id": "bc2b57f3-686d-4ec9-87ce-edf850b7e442", "snapshot": "libreoffice_calc", - "instruction": "The requirements of my data analysis assignment are listed in \"reminder.docx\" on the desktop. 
Help me modify my assignment \"asm.xlsx\" saved on the desktop accordingly.", + "instruction": "The requirements of my data analysis assignment are listed in \"reminder.docx\" on the desktop. Help me modify my assignment opended accordingly.", "source": "authors", "config": [ { From cb070307eeaa887304f60d6242f0da2dbff5d79b Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Tue, 15 Jul 2025 14:57:14 +0000 Subject: [PATCH 6/7] merge code --- .../settings/proxy/dataimpulse.json | 4 +- evaluation_examples/test.json | 12 + evaluation_examples/test_all_nogdrive.json | 383 ++++++++++++++++++ monitor/.env | 6 +- run_operator_fix.sh | 9 + 5 files changed, 409 insertions(+), 5 deletions(-) create mode 100644 evaluation_examples/test.json create mode 100644 evaluation_examples/test_all_nogdrive.json create mode 100644 run_operator_fix.sh diff --git a/evaluation_examples/settings/proxy/dataimpulse.json b/evaluation_examples/settings/proxy/dataimpulse.json index 3c552a5..2f08efe 100644 --- a/evaluation_examples/settings/proxy/dataimpulse.json +++ b/evaluation_examples/settings/proxy/dataimpulse.json @@ -2,8 +2,8 @@ { "host": "gw.dataimpulse.com", "port": 823, - "username": "e750e5abb74376d28361", - "password": "e5ec245537e1e76a", + "username": "fba5ac061fe18be70c6c", + "password": "e225c50bf56bdd6c", "protocol": "http", "provider": "dataimpulse", "type": "residential", diff --git a/evaluation_examples/test.json b/evaluation_examples/test.json new file mode 100644 index 0000000..92a9f36 --- /dev/null +++ b/evaluation_examples/test.json @@ -0,0 +1,12 @@ +{ + "multi_apps": [ + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "22a4636f-8179-4357-8e87-d1743ece1f81", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "0c825995-5b70-4526-b663-113f4c999dd2", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb" + ] +} \ No newline at end of file diff --git 
a/evaluation_examples/test_all_nogdrive.json b/evaluation_examples/test_all_nogdrive.json new file mode 100644 index 0000000..1d06660 --- /dev/null +++ b/evaluation_examples/test_all_nogdrive.json @@ -0,0 +1,383 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "06fe7178-4491-4589-810f-2e2bc9502122", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "af630914-714e-4a24-a7bb-f9af687d3b91", + "3720f614-37fd-4d04-8a6b-76f54f8c222d", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "12086550-11c0-466b-b367-1d9e75b3910e", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", + "ae78f875-5b98-4907-bbb5-9c737fc68c03", + "3299584d-8f11-4457-bf4c-ce98f7600250", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "9656a811-9b5b-4ddf-99c7-5117bcef0626", + "fc6d8143-9452-4171-9459-7f515143419a", + "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", + "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", + "47543840-672a-467d-80df-8f7c3b9788c9", + "c1fa57f3-c3db-4596-8f09-020701085416", + "da46d875-6b82-4681-9284-653b0c7ae241", + "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc", + "f79439ad-3ee8-4f99-a518-0eb60e5652b0", + "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8", + "9f3f70fc-5afc-4958-a7b7-3bb4fcb01805", + "7f52cab9-535c-4835-ac8c-391ee64dc930", + "82279c77-8fc6-46f6-9622-3ba96f61b477", + "2888b4e6-5b47-4b57-8bf5-c73827890774", + "b4f95342-463e-4179-8c3f-193cd7241fb2", + "f5d96daf-83a8-4c86-9686-bada31fc66ab", + "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba", + "368d9ba4-203c-40c1-9fa3-da2f1430ce63", + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "a728a36e-8bf1-4bb6-9a03-ef039a5233f0", + "b070486d-e161-459b-aa2b-ef442d973b92", + 
"0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", + "9f935cce-0a9f-435f-8007-817732bfc0a5", + "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "gimp": [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a", + "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + "b148e375-fe0b-4bec-90e7-38632b0d73c2", + "a746add2-cab0-4740-ac36-c3769d9bfb46", + "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + "d16c99dc-2a1e-46f2-b350-d97c86c85c15", + "06ca5602-62ca-47f6-ad4f-da151cde54cc", + "e2dd0213-26db-4349-abe5-d5667bfd725c", + "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + "7767eef2-56a3-4cea-8c9f-48c070c7d65b", + "734d6579-c07d-47a8-9ae2-13339795476b", + "e19bd559-633b-4b02-940f-d946248f088e", + "38f48d40-764e-4e77-a7cf-51dfce880291", + "fbb548ca-c2a6-4601-9204-e39a2efc507b", + "5ca86c6f-f317-49d8-b6a7-b527541caae8", + "62f7fd55-0687-4a43-b6e1-3eda16fc6252", + "8ea73f6f-9689-42ad-8c60-195bbf06a7ba", + "58d3eeeb-e9d0-499f-962e-fd0db2a744d8", + "2e6f678f-472d-4c55-99cc-8e7c5c402a71", + "045bf3ff-9077-4b86-b483-a1040a949cff", + "dbbf4b99-2253-4b10-9274-45f246af2466" + ], + "libreoffice_calc": [ + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "42e0a640-4f19-4b28-973d-729602b5a4a7", + "51719eea-10bc-4246-a428-ac7c433dd4b3", + "1954cced-e748-45c4-9c26-9855b97fbc5e", + "2bd59342-0664-4ccb-ba87-79379096cc08", + "3aaa4e37-dc91-482e-99af-132a612d40f3", + "1273e544-688f-496b-8d89-3e0f40aa0606", + "12382c62-0cd1-4bf2-bdc8-1d20bf9b2371", + "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "535364ea-05bd-46ea-9937-9f55c68507e8", + "7e429b8d-a3f0-4ed0-9b58-08957d00b127", + "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", + "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f", + "0bf05a7d-b28b-44d2-955a-50b41e24012a", + "6054afcb-5bab-4702-90a0-b259b5d3217c", + "abed40dc-063f-4598-8ba5-9fe749c0615d", + 
"37608790-6147-45d0-9f20-1137bb35703d", + "26a8440e-c166-4c50-aef4-bfb77314b46b", + "d681960f-7bc3-4286-9913-a8812ba3261a", + "035f41ba-6653-43ab-aa63-c86d449d62e5", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "1de60575-bb6e-4c3d-9e6a-2fa699f9f197", + "aa3a8974-2e85-438b-b29e-a64df44deb4b", + "51b11269-2ca8-4b2a-9163-f21758420e78", + "1e8df695-bd1b-45b3-b557-e7d599cf7597", + "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + "a01fbce3-2793-461f-ab86-43680ccbae25", + "0326d92d-d218-48a8-9ca1-981cd6d064c7", + "0a2e43bf-b26c-4631-a966-af9dfa12c9e5", + "4188d3a4-077d-46b7-9c86-23e1a036f6c1", + "347ef137-7eeb-4c80-a3bb-0951f26a8aff", + "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", + "1d17d234-e39d-4ed7-b46f-4417922a4e7c", + "4e6fcf72-daf3-439f-a232-c434ce416af6", + "01b269ae-2111-4a07-81fd-3fcd711993b0", + "21df9241-f8d7-4509-b7f1-37e501a823f7", + "a9f325aa-8c05-4e4f-8341-9e4358565f4f", + "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", + "7a4e4bc8-922c-4c84-865c-25ba34136be1", + "4de54231-e4b5-49e3-b2ba-61a0bec721c0", + "30e3e107-1cfb-46ee-a755-2cd080d7ba6a", + "4172ea6e-6b77-4edb-a9cc-c0014bd1603b", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "3a7c8185-25c1-4941-bd7b-96e823c9f21f", + "21ab7b40-77c2-4ae6-8321-e00d3a086c73" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54", + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + "c59742c0-4323-4b9d-8a02-723c251deaa0", + "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + "9ec204e4-f0a3-42f8-8458-b772a6797cab", + "0f84bef9-9790-432e-92b7-eece357603fb", + "ce88f674-ab7a-43da-9201-468d38539e4a", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "a097acff-6266-4291-9fbd-137af7ecd439", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72", + "ac9bb6cb-1888-43ab-81e4-a98a547918cd", + "2cd43775-7085-45d8-89fa-9e35c0a915cf", + 
"358aa0a7-6677-453f-ae35-e440f004c31e", + "a669ef01-ded5-4099-9ea9-25e99b569840", + "73c99fb9-f828-43ce-b87a-01dc07faa224", + "15aece23-a215-4579-91b4-69eec72e18da", + "986fc832-6af2-417c-8845-9272b3a1528b", + "a434992a-89df-4577-925c-0c58b747f0f4", + "7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8", + "841b50aa-df53-47bd-a73a-22d3a9f73160", + "8979838c-54a5-4454-a2b8-3d135a1a5c8f", + "b8adbc24-cef2-4b15-99d5-ecbe7ff445eb", + "2b94c692-6abb-48ae-ab0b-b3e8a19cb340", + "9cf05d24-6bd9-4dae-8967-f67d88f5d38a", + "08aced46-45a2-48d7-993b-ed3fb5b32302", + "edb61b14-a854-4bf5-a075-c8075c11293a", + "c82632a4-56b6-4db4-9dd1-3820ee3388e4", + "39be0d19-634d-4475-8768-09c130f5425d", + "ac1b39ff-ee4d-4483-abce-c117e98942f0", + "f23acfd2-c485-4b7c-a1e7-d4303ddfe864", + "70bca0cc-c117-427e-b0be-4df7299ebeb6", + "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", + "57667013-ea97-417c-9dce-2713091e6e2a", + "0a211154-fda0-48d0-9274-eaac4ce5486d", + "a53f80cd-4a90-4490-8310-097b011433f6", + "7ae48c60-f143-4119-b659-15b8f485eb9a", + "5cfb9197-e72b-454b-900e-c06b0c802b40", + "05dd4c1d-c489-4c85-8389-a7836c4f0567", + "5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1", + "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7", + "ed43c15f-00cb-4054-9c95-62c880865d68", + "3161d64e-3120-47b4-aaad-6a764a92493b", + "04578141-1d42-4146-b9cf-6fab4ce5fd74" + ], + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c", + "0b17a146-2934-46c7-8727-73ff6b6483e8", + "0e47de2a-32e0-456c-a366-8c607ef7a9d2", + "0e763496-b6bb-4508-a427-fad0b6c3e195", + "3ef2b351-8a84-4ff2-8724-d86eae9b842e", + "4bcb1253-a636-4df4-8cb0-a35c04dfef31", + "66399b0d-8fda-4618-95c4-bfc6191617e9", + "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2", + "6ada715d-3aae-4a32-a6a7-429b2e43fb93", + "6f81754e-285d-4ce0-b59e-af7edb02d108", + "72b810ef-4156-4d09-8f08-a0cf57e7cefe", + "8472fece-c7dd-4241-8d65-9b3cd1a0b568", + "88fe4b2d-3040-4c70-9a70-546a47764b48", + 
"936321ce-5236-426a-9a20-e0e3c5dc536f", + "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", + "b21acd93-60fd-4127-8a43-2f5178f4a830", + "d53ff5ee-3b1a-431e-b2be-30ed2673079b", + "e246f6d8-78d7-44ac-b668-fcf47946cb50", + "e528b65e-1107-4b8c-8988-490e4fece599", + "ecc2413d-8a48-416e-a3a2-d30106ca36cb", + "f178a4a9-d090-4b56-bc4c-4b72a61a035d", + "bb8ccc78-479f-4a2f-a71e-d565e439436b" + ], + "multi_apps": [ + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "2fe4b718-3bd7-46ec-bdce-b184f5653624", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "510f64c8-9bcc-4be1-8d30-638705850618", + "51f5801c-18b3-4f25-b0c3-02f85507a078", + "58565672-7bfe-48ab-b828-db349231de6b", + "937087b6-f668-4ba6-9110-60682ee33441", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "d9b7c649-c975-4f53-88f5-940b29c47247", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "f7dfbef3-7697-431c-883a-db8583a4e4f9", + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "6d72aad6-187a-4392-a4c4-ed87269c51cf", + "f918266a-b3e0-4914-865d-4faa564f1aef", + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "bc2b57f3-686d-4ec9-87ce-edf850b7e442", + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "00fa164e-2612-4439-992e-157d019a8436", + "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", + "69acbb55-d945-4927-a87b-8480e1a5bb7e", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "68a25bd4-59c7-4f4d-975e-da0c8509c848", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "c7c1e4c3-9e92-4eba-a4b8-689953975ea4", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "82e3c869-49f6-4305-a7ce-f3e64a0618e7", + "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", + "869de13e-bef9-4b91-ba51-f6708c40b096", + "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", + "26150609-0da3-4a7d-8868-0faf9c5f01bb", + 
"9219480b-3aed-47fc-8bac-d2cffc5849f7", + "881deb30-9549-4583-a841-8270c65f2a17", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "5bc63fb9-276a-4439-a7c1-9dc76401737f", + "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", + "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", + "36037439-2044-4b50-b9d1-875b5a332143", + "716a6079-22da-47f1-ba73-c9d58f986a38", + "873cafdd-a581-47f6-8b33-b9696ddb7b05", + "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", + "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", + "da922383-bfa4-4cd3-bbad-6bebab3d7742", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", + "81c425f5-78f3-4771-afd6-3d2973825947", + "bb83cab4-e5c7-42c7-a67b-e46068032b86", + "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", + "b337d106-053f-4d37-8da0-7f9c4043a66b", + "20236825-b5df-46e7-89bf-62e1d640a897", + "8df7e444-8e06-4f93-8a1a-c5c974269d82", + "aad10cd7-9337-4b62-b704-a857848cedf2", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", + "a503b07f-9119-456b-b75d-f5146737d24f", + "09a37c51-e625-49f4-a514-20a773797a8a", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "f5c13cdd-205c-4719-a562-348ae5cd1d91", + "5990457f-2adb-467b-a4af-5c857c92d762", + "415ef462-bed3-493a-ac36-ca8c6d23bf1b", + "7ff48d5b-2df2-49da-b500-a5150ffc7f18", + "9f3bb592-209d-43bc-bb47-d77d9df56504", + "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", + "ce2b64a2-ddc1-4f91-8c7d-a88be7121aac", + "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", + "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56", + "f8369178-fafe-40c2-adc4-b9b08a125456", + "778efd0a-153f-4842-9214-f05fc176b877", + "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", + "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", + "788b3701-3ec9-4b67-b679-418bfa726c22", + "48c46dc7-fe04-4505-ade7-723cba1aa6f6", + "42d25c08-fb87-4927-8b65-93631280a26f", + "e8172110-ec08-421b-a6f5-842e6451911f", + "42f4d1c7-4521-4161-b646-0a8934e36081", + "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", + "d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "91190194-f406-4cd6-b3f9-c43fac942b22", + 
"7f35355e-02a6-45b5-b140-f0be698bcf85", + "98e8e339-5f91-4ed2-b2b2-12647cb134f4", + "0e5303d4-8820-42f6-b18d-daf7e633de21", + "df67aebb-fb3a-44fd-b75b-51b6012df509", + "5df7b33a-9f77-4101-823e-02f863e1c1ae", + "aceb0368-56b8-4073-b70e-3dc9aee184e0", + "236833a3-5704-47fc-888c-4f298f09f799", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ], + "os": [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "a462a795-fdc7-4b23-b689-e8b6df786b78", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "b6781586-6346-41cd-935a-a6b1487918fc", + "b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "fe41f596-a71b-4c2f-9b2f-9dcd40b568c3", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c", + "5812b315-e7bd-4265-b51f-863c02174c28", + "c288e301-e626-4b98-a1ab-159dcb162af5", + "4783cc41-c03c-4e1b-89b4-50658f642bd5", + "5c1075ca-bb34-46a3-a7a0-029bd7463e79", + "5ced85fc-fa1a-4217-95fd-0fb530545ce2", + "37887e8c-da15-4192-923c-08fa390a176d", + "4127319a-8b79-4410-b58a-7a151e15f3d7", + "4d117223-a354-47fb-8b45-62ab1390a95f", + "6f56bf42-85b8-4fbb-8e06-6c44960184ba" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e", + "7b1e1ff9-bb85-49be-b01d-d6424be18cd0", + "9bc3cc16-074a-45ac-9bdc-b2a362e1daf3", + "3f28fe4f-5d9d-4994-a456-efd78cfae1a3", + "5203d847-2572-4150-912a-03f062254390", + "dd84e895-72fd-4023-a336-97689ded257c", + "9b7bc335-06b5-4cd3-9119-1a649c478509", + "d38192b0-17dc-4e1d-99c3-786d0117de77", + "a10b69e1-6034-4a2b-93e1-571d45194f75", + "3f49d2cc-f400-4e7d-90cc-9b18e401cc31", + "f201fbc3-44e6-46fc-bcaa-432f9815454c", + "10a730d5-d414-4b40-b479-684bed1ae522", + "a1af9f1c-50d5-4bc3-a51e-4d9b425ff638", + 
"08c73485-7c6d-4681-999d-919f5c32dcfa" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "fba2c100-79e8-42df-ae74-b592418d54f4", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "5ac2891a-eacd-4954-b339-98abba077adb", + "f3977615-2b45-4ac5-8bba-80c17dbe2a37", + "215dfd39-f493-4bc3-a027-8a97d72c61bf", + "cb130f0d-d36f-4302-9838-b3baf46139b6", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", + "9d425400-e9b2-4424-9a4b-d4c7abac4140", + "5e2d93d8-8ad0-4435-b150-1692aacaa994", + "6ed0a554-cbee-4b44-84ea-fd6c042f4fe1", + "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5", + "70745df8-f2f5-42bd-8074-fbc10334fcc5", + "57242fad-77ca-454f-b71b-f187181a9f23", + "c6bf789c-ba3a-4209-971d-b63abf0ab733", + "0512bb38-d531-4acf-9e7e-0add90816068", + "847a96b6-df94-4927-97e6-8cc9ea66ced7", + "7aeae0e2-70ee-4705-821d-1bba5d5b2ddd", + "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", + "7c4cc09e-7a92-40dd-8338-b2286535c4ed", + "971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6" + ] +} \ No newline at end of file diff --git a/monitor/.env b/monitor/.env index 2d71a24..78fb5e8 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,13 +2,13 @@ # Do not write any secret keys or sensitive information here. 
# Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_all.json +TASK_CONFIG_PATH=../evaluation_examples/test.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_operator_full_test_0713 +RESULTS_BASE_PATH=../results_operator_full_test_0713_gdrive2 ACTION_SPACE=pyautogui OBSERVATION_TYPE=screenshot MODEL_NAME=computer-use-preview MAX_STEPS=100 FLASK_PORT=80 FLASK_HOST=0.0.0.0 -FLASK_DEBUG=false \ No newline at end of file +FLASK_DEBUG=false diff --git a/run_operator_fix.sh b/run_operator_fix.sh new file mode 100644 index 0000000..e666803 --- /dev/null +++ b/run_operator_fix.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ +--headless \ +--observation_type screenshot \ +--model computer-use-preview \ +--result_dir ./results_operator_full_test_0713_gdrive2 \ +--test_all_meta_path evaluation_examples/test.json \ +--max_steps 100 \ +--num_envs 10 \ +--provider_name aws \ No newline at end of file From 9eeabfc52d10eea1b41001412cc81c65945d7527 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Thu, 17 Jul 2025 04:14:20 +0000 Subject: [PATCH 7/7] Improve the parallel logic --- mm_agents/openai_cua_agent.py | 2 +- run_multienv_openaicua.py | 197 +++++++------ run_multienv_openaicua_old.py | 533 ++++++++++++++++++++++++++++++++++ 3 files changed, 631 insertions(+), 101 deletions(-) create mode 100644 run_multienv_openaicua_old.py diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 315432e..0afe61e 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -33,7 +33,7 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union -OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and 
typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - For the thunderbird account \"anonym-x2024@outlook.com\", the password is \"gTCI\";=@y7|QJ0nDa_kN3Sb&>\".\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ +OPERATOR_PROMPT = """\n\n Here are some helpful tips:\n - computer.clipboard, computer.sync_file, computer.sync_shared_folder, computer.computer_output_citation are disabled.\n - If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.\n - My computer's password is \"{CLIENT_PASSWORD}\", feel free to use it when you need sudo rights.\n - If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.\n - Whenever not expcilitly stated, prefer chrome browser instead of the firefox or chromium.\n - You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.\n - You must initialize the computer to solve the task. 
Do not try to answer the question without initializing the computer.\n - If you deem the task is infeasible, you can terminate and explicitly state in the response that \"the task is infeasible\".\n """ class Action: """Action class for the agent.""" diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index c4eb18c..34db923 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -11,6 +11,7 @@ from typing import List, Dict import math from tqdm import tqdm from multiprocessing import Process, Manager +from multiprocessing import current_process import lib_run_single from desktop_env.desktop_env import DesktopEnv from mm_agents.openai_cua_agent import OpenAICUAAgent @@ -130,32 +131,12 @@ logger.addHandler(stdout_handler) logger = logging.getLogger("desktopenv.experiment") -def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]: - """Distribute tasks evenly across environments.""" - # Flatten the tasks into a single list +def distribute_tasks(test_all_meta: dict) -> List[tuple]: all_tasks = [] for domain, examples in test_all_meta.items(): for example_id in examples: all_tasks.append((domain, example_id)) - - # Calculate tasks per environment - tasks_per_env = math.ceil(len(all_tasks) / num_envs) - - # Distribute tasks - distributed_tasks = [] - for i in range(num_envs): - env_tasks = {} - start_idx = i * tasks_per_env - end_idx = min((i + 1) * tasks_per_env, len(all_tasks)) - - for domain, example_id in all_tasks[start_idx:end_idx]: - if domain not in env_tasks: - env_tasks[domain] = [] - env_tasks[domain].append(example_id) - - distributed_tasks.append(env_tasks) - - return distributed_tasks + return all_tasks def process_signal_handler(signum, frame, env_idx): @@ -180,63 +161,58 @@ def process_signal_handler(signum, frame, env_idx): sys.exit(0) -def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, shared_scores: list): - """Run tasks for a single environment.""" - # Each process has its own 
list of active environments +def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: list): active_environments = [] env = None - - # Setup signal handlers for this process too - signal.signal(signal.SIGINT, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) - signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx)) - - from desktop_env.providers.aws.manager import IMAGE_ID_MAP - REGION = args.region - screen_size = (args.screen_width, args.screen_height) - ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) - env = DesktopEnv( - path_to_vm=args.path_to_vm, - action_space=args.action_space, - provider_name=args.provider_name, - region=REGION, - snapshot_name=ami_id, - screen_size=screen_size, - headless=args.headless, - os_type="Ubuntu", - require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], - enable_proxy=True, - client_password=args.client_password - ) - active_environments.append(env) - agent = OpenAICUAAgent( - env=env, - model=args.model, - max_tokens=args.max_tokens, - top_p=args.top_p, - temperature=args.temperature, - action_space=args.action_space, - observation_type=args.observation_type, - max_trajectory_length=args.max_trajectory_length, - client_password=args.client_password, - provider_name=args.provider_name, - screen_width=args.screen_width, - screen_height=args.screen_height - ) - logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}") - try: - for domain in tqdm(env_tasks, desc=f"Env{env_idx+1}-Domain"): - for example_id in tqdm(env_tasks[domain], desc="Example", leave=False): + from desktop_env.providers.aws.manager import IMAGE_ID_MAP + REGION = args.region + screen_size = (args.screen_width, args.screen_height) + ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)]) + env = DesktopEnv( + path_to_vm=args.path_to_vm, + 
action_space=args.action_space, + provider_name=args.provider_name, + region=REGION, + snapshot_name=ami_id, + screen_size=screen_size, + headless=args.headless, + os_type="Ubuntu", + require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"], + enable_proxy=True, + client_password=args.client_password + ) + active_environments.append(env) + agent = OpenAICUAAgent( + env=env, + model=args.model, + max_tokens=args.max_tokens, + top_p=args.top_p, + temperature=args.temperature, + action_space=args.action_space, + observation_type=args.observation_type, + max_trajectory_length=args.max_trajectory_length, + client_password=args.client_password, + provider_name=args.provider_name, + screen_width=args.screen_width, + screen_height=args.screen_height + ) + logger.info(f"Process {current_process().name} started.") + while True: + try: + item = task_queue.get(timeout=5) + except Exception: + break + domain, example_id = item + try: config_file = os.path.join( args.test_config_base_dir, f"examples/{domain}/{example_id}.json" ) with open(config_file, "r", encoding="utf-8") as f: example = json.load(f) - - logger.info(f"[Env {env_idx+1}][Domain]: {domain}") - logger.info(f"[Env {env_idx+1}][Example ID]: {example_id}") - logger.info(f"[Env {env_idx+1}][Instruction]: {example['instruction']}") - + logger.info(f"[{current_process().name}][Domain]: {domain}") + logger.info(f"[{current_process().name}][Example ID]: {example_id}") + logger.info(f"[{current_process().name}][Instruction]: {example['instruction']}") example_result_dir = os.path.join( args.result_dir, args.action_space, @@ -246,7 +222,6 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share example_id, ) os.makedirs(example_result_dir, exist_ok=True) - try: lib_run_single.run_single_example_openaicua( agent, @@ -260,7 +235,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) except Exception as e: import traceback - 
logger.error(f"Exception in Env{env_idx+1} {domain}/{example_id}: {e}") + logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}") logger.error(traceback.format_exc()) try: env.controller.end_recording( @@ -268,7 +243,6 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) except Exception as rec_e: logger.error(f"Failed to end recording: {rec_e}") - with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write( json.dumps( @@ -276,14 +250,22 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share ) ) f.write("\n") + except Exception as e: + logger.error(f"Task-level error in {current_process().name}: {e}") + import traceback + logger.error(traceback.format_exc()) + except Exception as e: + logger.error(f"Process-level error in {current_process().name}: {e}") + import traceback + logger.error(traceback.format_exc()) finally: - # This ensures the environment is closed even if there's an exception - logger.info(f"Process {env_idx + 1} cleaning up environment...") + logger.info(f"{current_process().name} cleaning up environment...") try: - env.close() - logger.info(f"Process {env_idx + 1} environment closed successfully") + if env: + env.close() + logger.info(f"{current_process().name} environment closed successfully") except Exception as e: - logger.error(f"Process {env_idx + 1} error during environment cleanup: {e}") + logger.error(f"{current_process().name} error during environment cleanup: {e}") def signal_handler(signum, frame): @@ -323,8 +305,8 @@ def signal_handler(signum, frame): if p.is_alive(): try: logger.info(f"Forcefully terminating process {p.name}...") - import signal - os.kill(p.pid, signal.SIGKILL) + import signal as sig + os.kill(p.pid, sig.SIGKILL) except Exception as e: logger.error(f"Error forcefully terminating process: {e}") @@ -335,38 +317,56 @@ def signal_handler(signum, frame): def test(args: argparse.Namespace, test_all_meta: dict) -> None: 
global processes logger.info("Args: %s", args) - - distributed_tasks = distribute_tasks(test_all_meta, args.num_envs) - - logger.info("All environments are ready. Starting parallel task execution...") - - # Create a shared list for scores across processes + all_tasks = distribute_tasks(test_all_meta) + logger.info(f"Total tasks: {len(all_tasks)}") with Manager() as manager: shared_scores = manager.list() - - # Create and start processes for each environment + task_queue = manager.Queue() + for item in all_tasks: + task_queue.put(item) + num_envs = args.num_envs processes = [] - for env_idx, env_tasks in enumerate(distributed_tasks): + for i in range(num_envs): p = Process( target=run_env_tasks, - args=(env_idx, env_tasks, args, shared_scores) + args=(task_queue, args, shared_scores), + name=f"EnvProcess-{i+1}" ) - processes.append(p) + p.daemon = True p.start() + processes.append(p) logger.info(f"Started process {p.name} with PID {p.pid}") - try: - # Wait for all processes to complete + while True: + alive_count = 0 + for idx, p in enumerate(processes): + if not p.is_alive(): + logger.warning(f"Process {p.name} died, restarting...") + new_p = Process( + target=run_env_tasks, + args=(task_queue, args, shared_scores), + name=f"EnvProcess-Restart-{idx+1}" + ) + new_p.daemon = True + new_p.start() + processes[idx] = new_p + logger.info(f"Restarted process {new_p.name} with PID {new_p.pid}") + else: + alive_count += 1 + if task_queue.empty(): + logger.info("All tasks finished.") + break + if alive_count == 0: + logger.error("All processes died, exiting.") + break + time.sleep(5) for p in processes: p.join() - logger.info(f"Process {p.name} completed") except KeyboardInterrupt: logger.info("Main process received KeyboardInterrupt. 
Initiating graceful shutdown...") - # Let the signal handler do the cleanup raise except Exception as e: logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True) - # Ensure cleanup happens for p in processes: if p.is_alive(): try: @@ -375,10 +375,7 @@ def test(args: argparse.Namespace, test_all_meta: dict) -> None: except Exception as term_e: logger.error(f"Error terminating process {p.name}: {term_e}") raise - - # Convert shared list to regular list scores = list(shared_scores) - logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}") diff --git a/run_multienv_openaicua_old.py b/run_multienv_openaicua_old.py new file mode 100644 index 0000000..c4eb18c --- /dev/null +++ b/run_multienv_openaicua_old.py @@ -0,0 +1,533 @@ +from __future__ import annotations +import argparse +import datetime +import json +import logging +import os +import sys +import signal +import time +from typing import List, Dict +import math +from tqdm import tqdm +from multiprocessing import Process, Manager +import lib_run_single +from desktop_env.desktop_env import DesktopEnv +from mm_agents.openai_cua_agent import OpenAICUAAgent + +# Global variables for signal handling +active_environments = [] +processes = [] +is_terminating = False + +# import wandb + +# load the environment variables from .env file +if os.path.exists(".env"): + from dotenv import load_dotenv + load_dotenv() + +# Logger Configs {{{ # +def config() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Run end-to-end evaluation on the benchmark" + ) + + # environment config + parser.add_argument("--path_to_vm", type=str, default=None) + parser.add_argument( + "--headless", action="store_true", help="Run in headless machine" + ) + parser.add_argument( + "--action_space", type=str, default="pyautogui", help="Action type" + ) + parser.add_argument( + "--observation_type", + choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"], + default="screenshot", 
+ help="Observation type", + ) + parser.add_argument("--sleep_after_execution", type=float, default=0.0) + parser.add_argument("--max_steps", type=int, default=15) + + # agent config + parser.add_argument("--max_trajectory_length", type=int, default=3) + parser.add_argument( + "--test_config_base_dir", type=str, default="evaluation_examples" + ) + + # lm config + parser.add_argument("--model", type=str, default="gpt-4o") + parser.add_argument("--temperature", type=float, default=1.0) + parser.add_argument("--top_p", type=float, default=0.9) + parser.add_argument("--max_tokens", type=int, default=1500) + parser.add_argument("--stop_token", type=str, default=None) + + # example config + parser.add_argument("--domain", type=str, default="all") + parser.add_argument( + "--test_all_meta_path", type=str, default="evaluation_examples/test_all.json" + ) + + # logging related + parser.add_argument("--result_dir", type=str, default="./results") + parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel") + parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + default='INFO', help="Set the logging level") + # aws config + parser.add_argument( + "--region", type=str, default="us-east-1", help="AWS region for the VM" + ) + parser.add_argument( + "--provider_name", type=str, default="aws", choices=["aws", "virtualbox", "vmware", "docker", "azure"], help="Provider name" + ) + parser.add_argument( + "--client_password", type=str, default="", help="Client password" + ) + parser.add_argument( + "--screen_width", type=int, default=1920, help="Screen width" + ) + parser.add_argument( + "--screen_height", type=int, default=1080, help="Screen height" + ) + args = parser.parse_args() + return args + +args = config() # Get command line arguments first + +logger = logging.getLogger() +log_level = getattr(logging, args.log_level.upper()) +logger.setLevel(log_level) + +datetime_str: str = 
datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler( + os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" +) +debug_handler = logging.FileHandler( + os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" +) +stdout_handler = logging.StreamHandler(sys.stdout) + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(log_level) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" +) +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + + +def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]: + """Distribute tasks evenly across environments.""" + # Flatten the tasks into a single list + all_tasks = [] + for domain, examples in test_all_meta.items(): + for example_id in examples: + all_tasks.append((domain, example_id)) + + # Calculate tasks per environment + tasks_per_env = math.ceil(len(all_tasks) / num_envs) + + # Distribute tasks + distributed_tasks = [] + for i in range(num_envs): + env_tasks = {} + start_idx = i * tasks_per_env + end_idx = min((i + 1) * tasks_per_env, len(all_tasks)) + + for domain, example_id in all_tasks[start_idx:end_idx]: + if domain not in env_tasks: + env_tasks[domain] = [] + env_tasks[domain].append(example_id) + + distributed_tasks.append(env_tasks) + + return distributed_tasks + + +def process_signal_handler(signum, frame, env_idx): + """Signal handler for child processes to gracefully shut down their environments.""" + logger.info(f"Process {env_idx + 1} received signal {signum}. 
def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, shared_scores: list):
    """Run every task assigned to one environment inside a worker process.

    The function installs SIGINT/SIGTERM handlers that delegate to
    ``process_signal_handler``, provisions one ``DesktopEnv``, builds an
    ``OpenAICUAAgent`` on top of it and then executes each example through
    ``lib_run_single.run_single_example_openaicua``.  A failing example is
    logged, its recording is ended on a best-effort basis and an ``Error``
    record is appended to the example's ``traj.jsonl``; the loop then moves
    on to the next example.  The environment is always closed on the way out.

    Args:
        env_idx: Zero-based index of this worker (logged as ``env_idx + 1``).
        env_tasks: Mapping of domain name to an iterable of example ids.
        args: Parsed command-line options (region, screen size, model, ...).
        shared_scores: Container handed through to
            ``run_single_example_openaicua``; ``test`` averages its content
            afterwards, so the runner presumably appends one score per
            example -- TODO confirm against lib_run_single.
    """
    # Keep this exact local name: process_signal_handler looks up
    # 'active_environments' in the f_locals of the frame it receives.
    active_environments = []

    # Setup signal handlers for this process too
    signal.signal(signal.SIGINT, lambda signum, frame: process_signal_handler(signum, frame, env_idx))
    signal.signal(signal.SIGTERM, lambda signum, frame: process_signal_handler(signum, frame, env_idx))

    from desktop_env.providers.aws.manager import IMAGE_ID_MAP
    region = args.region
    screen_size = (args.screen_width, args.screen_height)
    # Fall back to the 1920x1080 image when no image exists for this size.
    ami_id = IMAGE_ID_MAP[region].get(screen_size, IMAGE_ID_MAP[region][(1920, 1080)])
    env = DesktopEnv(
        path_to_vm=args.path_to_vm,
        action_space=args.action_space,
        provider_name=args.provider_name,
        region=region,
        snapshot_name=ami_id,
        screen_size=screen_size,
        headless=args.headless,
        os_type="Ubuntu",
        require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
        enable_proxy=True,
        client_password=args.client_password
    )
    active_environments.append(env)

    try:
        # The agent is built inside the try block so that a failure here
        # still reaches the finally clause and releases the environment.
        agent = OpenAICUAAgent(
            env=env,
            model=args.model,
            max_tokens=args.max_tokens,
            top_p=args.top_p,
            temperature=args.temperature,
            action_space=args.action_space,
            observation_type=args.observation_type,
            max_trajectory_length=args.max_trajectory_length,
            client_password=args.client_password,
            provider_name=args.provider_name,
            screen_width=args.screen_width,
            screen_height=args.screen_height
        )
        logger.info(f"Executing tasks in environment {env_idx + 1}/{args.num_envs}")

        for domain in tqdm(env_tasks, desc=f"Env{env_idx+1}-Domain"):
            for example_id in tqdm(env_tasks[domain], desc="Example", leave=False):
                config_file = os.path.join(
                    args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
                )
                with open(config_file, "r", encoding="utf-8") as f:
                    example = json.load(f)

                logger.info(f"[Env {env_idx+1}][Domain]: {domain}")
                logger.info(f"[Env {env_idx+1}][Example ID]: {example_id}")
                logger.info(f"[Env {env_idx+1}][Instruction]: {example['instruction']}")

                example_result_dir = os.path.join(
                    args.result_dir,
                    args.action_space,
                    args.observation_type,
                    args.model,
                    domain,
                    example_id,
                )
                os.makedirs(example_result_dir, exist_ok=True)

                try:
                    lib_run_single.run_single_example_openaicua(
                        agent,
                        env,
                        example,
                        args.max_steps,
                        example["instruction"],
                        args,
                        example_result_dir,
                        shared_scores,
                    )
                except Exception as e:
                    import traceback
                    logger.error(f"Exception in Env{env_idx+1} {domain}/{example_id}: {e}")
                    logger.error(traceback.format_exc())
                    # Best effort: a recording may or may not be in progress.
                    try:
                        env.controller.end_recording(
                            os.path.join(example_result_dir, "recording.mp4")
                        )
                    except Exception as rec_e:
                        logger.error(f"Failed to end recording: {rec_e}")

                    # Leave a trace of the failure next to the trajectory.
                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                        f.write(
                            json.dumps(
                                {"Error": f"{domain}/{example_id} - {e}"}
                            )
                        )
                        f.write("\n")
    finally:
        # This ensures the environment is closed even if there's an exception
        logger.info(f"Process {env_idx + 1} cleaning up environment...")
        try:
            env.close()
            logger.info(f"Process {env_idx + 1} environment closed successfully")
        except Exception as e:
            logger.error(f"Process {env_idx + 1} error during environment cleanup: {e}")


def signal_handler(signum, frame):
    """Handle SIGINT/SIGTERM in the main process and shut everything down.

    Closes the environments registered in the module-level
    ``active_environments``, asks every live child in ``processes`` to
    terminate, waits one second, SIGKILLs whatever is still alive and finally
    exits with status 0.  Re-entrant calls are ignored through the
    module-level ``is_terminating`` flag.

    Args:
        signum: Number of the received signal (only logged).
        frame: Interrupted stack frame; unused, may be ``None``.

    Raises:
        SystemExit: Always, unless a shutdown is already in progress.
    """
    global is_terminating, active_environments, processes

    # Avoid duplicate handling
    if is_terminating:
        return

    is_terminating = True
    logger.info(f"Received signal {signum}. Gracefully shutting down...")

    # Close all registered environments in the main process
    for env in active_environments:
        if env is None:
            # Same guard as the final cleanup of the __main__ section.
            continue
        try:
            logger.info("Closing environment...")
            env.close()
            logger.info("Environment closed successfully")
        except Exception as e:
            logger.error(f"Error closing environment: {e}")

    # Send termination signal to all child processes first
    for p in processes:
        if p.is_alive():
            try:
                logger.info(f"Sending termination signal to process {p.name}...")
                p.terminate()
            except Exception as e:
                logger.error(f"Error sending termination signal to process: {e}")

    # Allow a short time for processes to handle their own cleanup
    time.sleep(1)

    # Forcefully terminate any processes that didn't exit
    for p in processes:
        if p.is_alive():
            try:
                logger.info(f"Forcefully terminating process {p.name}...")
                os.kill(p.pid, signal.SIGKILL)
            except Exception as e:
                logger.error(f"Error forcefully terminating process: {e}")

    logger.info("Shutdown complete. Exiting.")
    sys.exit(0)


def test(args: argparse.Namespace, test_all_meta: dict) -> None:
    """Distribute the tasks over worker processes and log the average score.

    One ``Process`` running ``run_env_tasks`` is started per entry returned
    by ``distribute_tasks``; the started processes are stored in the
    module-level ``processes`` list so that the signal handler can reach
    them.  The call blocks until every worker has been joined.

    Args:
        args: Parsed command-line options; ``num_envs`` controls the split.
        test_all_meta: Mapping of domain name to the example ids to run.

    Raises:
        KeyboardInterrupt: Re-raised after logging when the wait is
            interrupted.
        Exception: Any other error raised while joining is re-raised after
            the live workers have been asked to terminate.
    """
    global processes
    logger.info("Args: %s", args)

    distributed_tasks = distribute_tasks(test_all_meta, args.num_envs)

    logger.info("All environments are ready. Starting parallel task execution...")

    # Create a shared list for scores across processes
    with Manager() as manager:
        shared_scores = manager.list()

        # Create and start processes for each environment
        processes = []
        for env_idx, env_tasks in enumerate(distributed_tasks):
            p = Process(
                target=run_env_tasks,
                args=(env_idx, env_tasks, args, shared_scores)
            )
            processes.append(p)
            p.start()
            logger.info(f"Started process {p.name} with PID {p.pid}")

        try:
            # Wait for all processes to complete
            for p in processes:
                p.join()
                logger.info(f"Process {p.name} completed")
        except KeyboardInterrupt:
            logger.info("Main process received KeyboardInterrupt. Initiating graceful shutdown...")
            # Let the signal handler do the cleanup
            raise
        except Exception as e:
            logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True)
            # Ensure cleanup happens
            for p in processes:
                if p.is_alive():
                    try:
                        logger.info(f"Terminating process {p.name} due to error...")
                        p.terminate()
                    except Exception as term_e:
                        logger.error(f"Error terminating process {p.name}: {term_e}")
            raise

        # Copy while the manager is still running: the proxy list cannot be
        # read once the ``with`` block has shut the manager down.
        scores = list(shared_scores)

    logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")


# ``test`` is the experiment entry point, not a unit test.  Opt out of pytest
# collection in case this module is ever star-imported into a test module.
test.__test__ = False


def get_unfinished(
    action_space, use_model, observation_type, result_dir, total_file_json
):
    """Drop already finished examples from ``total_file_json``.

    Results live under
    ``<result_dir>/<action_space>/<observation_type>/<use_model>/<domain>/<example_id>``.
    An example counts as finished when its directory contains
    ``result.txt``.  Directories of unfinished examples are emptied (files
    and sub-directories alike) so that a re-run starts from a clean slate.
    The entry named ``onboard`` is skipped.

    Args:
        action_space: Action-space component of the result path.
        use_model: Model-name component of the result path.
        observation_type: Observation-type component of the result path.
        result_dir: Root directory of all results.
        total_file_json: Mapping of domain name to a list of example ids.
            It is updated in place.

    Returns:
        The same ``total_file_json`` object, with finished examples removed.
    """
    import shutil

    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)

    if not os.path.exists(target_dir):
        return total_file_json

    finished = {}
    for domain in os.listdir(target_dir):
        domain_path = os.path.join(target_dir, domain)
        if not os.path.isdir(domain_path):
            continue

        done = set()
        for example_id in os.listdir(domain_path):
            if example_id == "onboard":
                continue
            example_path = os.path.join(domain_path, example_id)
            if not os.path.isdir(example_path):
                continue

            entries = os.listdir(example_path)
            if "result.txt" in entries:
                done.add(example_id)
                continue

            # Unfinished run: wipe the leftovers.  os.remove cannot delete a
            # directory, so nested directories go through shutil.rmtree.
            for entry in entries:
                entry_path = os.path.join(example_path, entry)
                if os.path.isdir(entry_path) and not os.path.islink(entry_path):
                    shutil.rmtree(entry_path)
                else:
                    os.remove(entry_path)
        finished[domain] = done

    for domain, examples in finished.items():
        if domain in total_file_json:
            total_file_json[domain] = [
                x for x in total_file_json[domain] if x not in examples
            ]

    return total_file_json


def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
    """Collect the scores written so far and print the current success rate.

    Every ``result.txt`` found under
    ``<result_dir>/<action_space>/<observation_type>/<use_model>/<domain>/<example_id>``
    is parsed as a float; a file that cannot be read or parsed counts as 0.0.

    Args:
        action_space: Action-space component of the result path.
        use_model: Model-name component of the result path.
        observation_type: Observation-type component of the result path.
        result_dir: Root directory of all results.
        total_file_json: Unused; kept so existing callers keep working.

    Returns:
        The list of scores, or ``None`` when no result exists yet.
    """
    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
    if not os.path.exists(target_dir):
        print("New experiment, no result yet.")
        return None

    all_result = []

    for domain in os.listdir(target_dir):
        domain_path = os.path.join(target_dir, domain)
        if not os.path.isdir(domain_path):
            continue
        for example_id in os.listdir(domain_path):
            example_path = os.path.join(domain_path, example_id)
            if not os.path.isdir(example_path):
                continue
            if "result.txt" not in os.listdir(example_path):
                continue
            try:
                with open(os.path.join(example_path, "result.txt"), "r") as f:
                    all_result.append(float(f.read()))
            except (OSError, ValueError):
                # Unreadable or malformed score: count the example as failed.
                all_result.append(0.0)

    if not all_result:
        print("New experiment, no result yet.")
        return None

    print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
    return all_result


if __name__ == "__main__":
    ####### The complete version of the list of examples #######
    os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Register signal handlers for graceful termination
    signal.signal(signal.SIGINT, signal_handler)  # Handle Ctrl+C
    signal.signal(signal.SIGTERM, signal_handler)  # Handle termination signal

    try:
        args = config()

        with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
            test_all_meta = json.load(f)

        if args.domain != "all":
            test_all_meta = {args.domain: test_all_meta[args.domain]}

        # get_unfinished filters test_all_meta in place and returns it.
        test_file_list = get_unfinished(
            args.action_space,
            args.model,
            args.observation_type,
            args.result_dir,
            test_all_meta,
        )
        left_info = ""
        for domain in test_file_list:
            left_info += f"{domain}: {len(test_file_list[domain])}\n"
        logger.info(f"Left tasks:\n{left_info}")

        # Print the success rate of the results already on disk.
        get_result(
            args.action_space,
            args.model,
            args.observation_type,
            args.result_dir,
            test_all_meta,
        )
        test(args, test_file_list)
    except KeyboardInterrupt:
        logger.info("Main process received KeyboardInterrupt.")
        # Signal handler will take care of cleanup
    except Exception as e:
        logger.error(f"Unexpected error in main process: {e}", exc_info=True)
        # Also trigger cleanup for unhandled exceptions
        signal_handler(signal.SIGTERM, None)
    finally:
        # Final cleanup in case any environments or processes remain.
        # active_environments / processes are the module-level registries
        # that signal_handler also uses.
        logger.info("Main process final cleanup...")
        for env in active_environments:
            if env is not None:
                try:
                    logger.info("Closing environment in final cleanup...")
                    env.close()
                    logger.info("Environment closed successfully in final cleanup")
                except Exception as e:
                    logger.error(f"Error during final environment cleanup: {e}")

        # First try gentle termination
        for p in processes:
            if p is not None and p.is_alive():
                try:
                    logger.info(f"Terminating process {p.name}...")
                    p.terminate()
                except Exception as e:
                    logger.error(f"Error terminating process: {e}")

        # Wait a moment for processes to terminate
        time.sleep(1)

        # Then force kill if needed
        for p in processes:
            if p is not None and p.is_alive():
                try:
                    logger.info(f"Force killing process {p.name}...")
                    os.kill(p.pid, signal.SIGKILL)
                    logger.info(f"Process {p.name} force killed")
                except Exception as e:
                    logger.error(f"Error force killing process: {e}")