From b87cbe69e5d81e4bdb32a0e9ef0edd53cf1c9485 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Mon, 2 Jun 2025 13:34:20 +0000 Subject: [PATCH 1/7] add monitor --- monitor/.env | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/monitor/.env b/monitor/.env index d28ba42..70eb212 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,9 +2,9 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_small.json +TASK_CONFIG_PATH=../evaluation_examples/test_small_debug.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_operator_aws/pyautogui/screenshot/computer-use-preview +RESULTS_BASE_PATH=../results_operator_aws2/pyautogui/screenshot/computer-use-preview MAX_STEPS=50 FLASK_PORT=80 FLASK_HOST=0.0.0.0 From b211df33856ac43591b4acbfc69849ecbc0e9fd1 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Wed, 4 Jun 2025 10:23:45 +0000 Subject: [PATCH 2/7] fix timeout --- evaluation_examples/test_bug.json | 5 +++++ evaluation_examples/test_small_test.json | 25 +++++++++++++++++++++++ evaluation_examples/test_small_test2.json | 18 ++++++++++++++++ mm_agents/openai_cua_agent.py | 10 ++------- monitor/.env | 6 +++--- run_operator.sh | 7 +++---- 6 files changed, 56 insertions(+), 15 deletions(-) create mode 100644 evaluation_examples/test_bug.json create mode 100644 evaluation_examples/test_small_test.json create mode 100644 evaluation_examples/test_small_test2.json diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json new file mode 100644 index 0000000..8f2e521 --- /dev/null +++ b/evaluation_examples/test_bug.json @@ -0,0 +1,5 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test.json b/evaluation_examples/test_small_test.json new file mode 100644 index 0000000..644ea9a --- /dev/null +++ b/evaluation_examples/test_small_test.json @@ -0,0 +1,25 @@ +{ + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c" + ], + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "716a6079-22da-47f1-ba73-c9d58f986a38" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json new file mode 100644 index 0000000..d7ea04b --- /dev/null +++ b/evaluation_examples/test_small_test2.json @@ -0,0 +1,18 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index e930091..34098dc 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -301,8 +301,7 @@ class OpenAICUAAgent: Raises: requests.exceptions.RequestException: If the API request fails """ - retry_count = 0 - while retry_count < 3: + while True: try: from openai import OpenAI client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA")) @@ -319,13 +318,8 @@ class OpenAICUAAgent: logger.info(f"Response: {response}") return response except Exception as e: - logger.error(f"OpenAI API error: {str(e)}") - new_screenshot = self.env._get_obs() - new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8') - self.cua_messages[-1]["output"]["image_url"] = f"data:image/png;base64,{new_screenshot_base64}" - retry_count += 1 + logger.error(f"OpenAI API error: {str(e)},will retry in 1s...") time.sleep(1) - raise Exception("Failed to make OpenAI API call after 3 retries") def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]: """Parse a response item from the OpenAI API. diff --git a/monitor/.env b/monitor/.env index 70eb212..a55e5c7 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,10 +2,10 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_small_debug.json +TASK_CONFIG_PATH=../evaluation_examples/test_small_test2.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_operator_aws2/pyautogui/screenshot/computer-use-preview -MAX_STEPS=50 +RESULTS_BASE_PATH=../results_operator_timeoutcheck3/pyautogui/screenshot/computer-use-preview +MAX_STEPS=150 FLASK_PORT=80 FLASK_HOST=0.0.0.0 FLASK_DEBUG=true \ No newline at end of file diff --git a/run_operator.sh b/run_operator.sh index 9a84e92..72d187b 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,8 +2,7 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_operator_aws_new \ ---test_all_meta_path evaluation_examples/test_small_debug.json \ +--result_dir ./results_operator_timeoutcheck3 \ +--test_all_meta_path evaluation_examples/test_small_test2.json \ --region us-east-1 \ ---max_steps 150 \ ---num_envs 5 +--max_steps 150 From 71578d994e21b67e586110db4573e3b304fb1936 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Thu, 5 Jun 2025 13:29:16 +0000 Subject: [PATCH 3/7] edit --- evaluation_examples/test_bug.json | 4 ++-- evaluation_examples/test_small_test2 copy.json | 18 ++++++++++++++++++ evaluation_examples/test_small_test2.json | 13 ------------- 3 files changed, 20 insertions(+), 15 deletions(-) create mode 100644 evaluation_examples/test_small_test2 copy.json diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json index 8f2e521..5f5d9ed 100644 --- a/evaluation_examples/test_bug.json +++ b/evaluation_examples/test_bug.json @@ -1,5 +1,5 @@ { - "multi_apps": [ - "46407397-a7d5-4c6b-92c6-dbe038b1457b" + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" ] } \ No newline at end of file diff --git a/evaluation_examples/test_small_test2 copy.json b/evaluation_examples/test_small_test2 copy.json new file mode 100644 index 0000000..d7ea04b --- /dev/null +++ b/evaluation_examples/test_small_test2 copy.json @@ -0,0 +1,18 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json index d7ea04b..b08581a 100644 --- a/evaluation_examples/test_small_test2.json +++ b/evaluation_examples/test_small_test2.json @@ -1,17 +1,4 @@ { - "multi_apps": [ - "46407397-a7d5-4c6b-92c6-dbe038b1457b", - "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", - "897e3b53-5d4d-444b-85cb-2cdc8a97d903", - "c867c42d-a52d-4a24-8ae3-f75d256b5618" - ], - "os": [ - "5812b315-e7bd-4265-b51f-863c02174c28" - ], - "thunderbird": [ - "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", - "15c3b339-88f7-4a86-ab16-e71c58dcb01e" - ], "vlc": [ "59f21cfb-0120-4326-b255-a5b827b38967" ] From 4ea24ddfd31fd74ef3ee194179a533b80c96a0b4 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Fri, 6 Jun 2025 09:41:22 +0000 Subject: [PATCH 4/7] add proxy --- dataimpulse_proxy_config.json | 13 ++++ .../providers/aws/provider_with_proxy.py | 76 +++++++++---------- .../d1acdb87-bb67-4f30-84aa-990e56a09c92.json | 2 +- evaluation_examples/test_all_error.json | 38 ++++++++++ evaluation_examples/test_all_error2.json | 14 ++++ evaluation_examples/test_bug.json | 5 +- monitor/.env | 4 +- run_operator.sh | 4 +- 8 files changed, 112 insertions(+), 44 deletions(-) create mode 100644 dataimpulse_proxy_config.json create mode 100644 evaluation_examples/test_all_error.json create mode 100644 evaluation_examples/test_all_error2.json diff --git a/dataimpulse_proxy_config.json b/dataimpulse_proxy_config.json new file mode 100644 index 0000000..596e171 --- /dev/null +++ b/dataimpulse_proxy_config.json @@ -0,0 +1,13 @@ +[ + { + "host": "gw.dataimpulse.com", + "port": 823, + "username": "fba5ac061fe18be70c6c", + "password": "3b5669b6640fc80c", + "protocol": "http", + "provider": "dataimpulse", + "type": "residential", + "country": "US", + "note": "Dataimpulse Residential Proxy --- Tianbao Xie" + } +] \ No newline at end of file diff --git a/desktop_env/providers/aws/provider_with_proxy.py b/desktop_env/providers/aws/provider_with_proxy.py index 309e71b..0f5860a 100644 --- a/desktop_env/providers/aws/provider_with_proxy.py +++ b/desktop_env/providers/aws/provider_with_proxy.py @@ -47,48 +47,48 @@ class AWSProviderWithProxy(Provider): proxy_url = self._format_proxy_url(self.current_proxy) user_data_script = f"""#!/bin/bash -# 配置系统代理 -echo 'export http_proxy={proxy_url}' >> /etc/environment -echo 'export https_proxy={proxy_url}' >> /etc/environment -echo 'export HTTP_PROXY={proxy_url}' >> /etc/environment -echo 'export HTTPS_PROXY={proxy_url}' >> /etc/environment + # 配置系统代理 + echo 'export http_proxy={proxy_url}' >> /etc/environment + echo 'export https_proxy={proxy_url}' >> /etc/environment + echo 'export HTTP_PROXY={proxy_url}' >> /etc/environment + echo 'export HTTPS_PROXY={proxy_url}' >> /etc/environment -# 配置apt代理 -cat > /etc/apt/apt.conf.d/95proxy << EOF -Acquire::http::Proxy "{proxy_url}"; -Acquire::https::Proxy "{proxy_url}"; -EOF + # 配置apt代理 + cat > /etc/apt/apt.conf.d/95proxy << EOF + Acquire::http::Proxy "{proxy_url}"; + Acquire::https::Proxy "{proxy_url}"; + EOF -# 配置chrome/chromium代理 -mkdir -p /etc/opt/chrome/policies/managed -cat > /etc/opt/chrome/policies/managed/proxy.json << EOF -{{ - "ProxyMode": "fixed_servers", - "ProxyServer": "{self.current_proxy.host}:{self.current_proxy.port}" -}} -EOF + # 配置chrome/chromium代理 + mkdir -p /etc/opt/chrome/policies/managed + cat > /etc/opt/chrome/policies/managed/proxy.json << EOF + {{ + "ProxyMode": "fixed_servers", + "ProxyServer": "{self.current_proxy.host}:{self.current_proxy.port}" + }} + EOF -# 配置firefox代理 -mkdir -p /etc/firefox/policies -cat > /etc/firefox/policies/policies.json << EOF -{{ - "policies": {{ - "Proxy": {{ - "Mode": "manual", - "HTTPProxy": "{self.current_proxy.host}:{self.current_proxy.port}", - "HTTPSProxy": "{self.current_proxy.host}:{self.current_proxy.port}", - "UseHTTPProxyForAllProtocols": true - }} - }} -}} -EOF + # 配置firefox代理 + mkdir -p /etc/firefox/policies + cat > /etc/firefox/policies/policies.json << EOF + {{ + "policies": {{ + "Proxy": {{ + "Mode": "manual", + "HTTPProxy": "{self.current_proxy.host}:{self.current_proxy.port}", + "HTTPSProxy": "{self.current_proxy.host}:{self.current_proxy.port}", + "UseHTTPProxyForAllProtocols": true + }} + }} + }} + EOF -# 重新加载环境变量 -source /etc/environment + # 重新加载环境变量 + source /etc/environment -# 记录代理配置日志 -echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.port}" >> /var/log/proxy-setup.log -""" + # 记录代理配置日志 + echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.port}" >> /var/log/proxy-setup.log + """ return base64.b64encode(user_data_script.encode()).decode() @@ -99,7 +99,7 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po else: return f"{proxy.protocol}://{proxy.host}:{proxy.port}" - def start_emulator(self, path_to_vm: str, headless: bool): + def start_emulator(self, path_to_vm: str, headless: bool, os_type: str): logger.info("Starting AWS VM with proxy configuration...") ec2_client = boto3.client('ec2', region_name=self.region) diff --git a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json index 8e42989..91c7530 100644 --- a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json +++ b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json @@ -139,5 +139,5 @@ ] } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/test_all_error.json b/evaluation_examples/test_all_error.json new file mode 100644 index 0000000..fe33a13 --- /dev/null +++ b/evaluation_examples/test_all_error.json @@ -0,0 +1,38 @@ +{ + "chrome": [ + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "libreoffice_calc": [ + "2bd59342-0664-4ccb-ba87-79379096cc08", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14" + ], + "libreoffice_impress": [ + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7" + ], + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "0c825995-5b70-4526-b663-113f4c999dd2", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "e2392362-125e-4f76-a2ee-524b183a3412", + "22a4636f-8179-4357-8e87-d1743ece1f81" + ], + "os": [ + "a462a795-fdc7-4b23-b689-e8b6df786b78" + ], + "vlc": [ + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_all_error2.json b/evaluation_examples/test_all_error2.json new file mode 100644 index 0000000..4156b91 --- /dev/null +++ b/evaluation_examples/test_all_error2.json @@ -0,0 +1,14 @@ +{ + "libreoffice_calc": [ + "7efeb4b1-3d19-4762-b163-63328d66303b" + ], + "libreoffice_impress": [ + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb" + ], + "multi_apps": [ + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "e2392362-125e-4f76-a2ee-524b183a3412" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json index 5f5d9ed..da94c93 100644 --- a/evaluation_examples/test_bug.json +++ b/evaluation_examples/test_bug.json @@ -1,5 +1,8 @@ { "chrome": [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" + "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc" + ], + "multi_apps": [ + "d1acdb87-bb67-4f30-84aa-990e56a09c92" ] } \ No newline at end of file diff --git a/monitor/.env b/monitor/.env index 0333301..3851fc6 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,9 +2,9 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_small_test2.json +TASK_CONFIG_PATH=../evaluation_examples/test_all_error.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_operator_timeoutcheck3/pyautogui/screenshot/computer-use-preview +RESULTS_BASE_PATH=../results_all_error/pyautogui/screenshot/computer-use-preview MAX_STEPS=150 FLASK_PORT=80 FLASK_HOST=0.0.0.0 diff --git a/run_operator.sh b/run_operator.sh index 72d187b..b6a3750 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,7 +2,7 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_operator_timeoutcheck3 \ ---test_all_meta_path evaluation_examples/test_small_test2.json \ +--result_dir ./results_all_error \ +--test_all_meta_path evaluation_examples/test_all_error.json \ --region us-east-1 \ --max_steps 150 From a146c1e0b7789a2d8f03cb2d10eb23789a1908dc Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 7 Jun 2025 05:21:04 +0000 Subject: [PATCH 5/7] edit prompt --- .../chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json | 2 +- .../da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json | 2 +- evaluation_examples/test_prompt_message.json | 11 +++++++++++ evaluation_examples/test_proxy.json | 11 +++++++++++ mm_agents/openai_cua_agent.py | 9 ++++++--- run_operator.sh | 4 ++-- 6 files changed, 32 insertions(+), 7 deletions(-) create mode 100644 evaluation_examples/test_prompt_message.json create mode 100644 evaluation_examples/test_proxy.json diff --git a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json index a569c9d..254244b 100644 --- a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json +++ b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json @@ -271,5 +271,5 @@ } } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json index d56f64f..7b511db 100644 --- a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json +++ b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json @@ -70,5 +70,5 @@ "dest": "book_list_result.docx" } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/test_prompt_message.json b/evaluation_examples/test_prompt_message.json new file mode 100644 index 0000000..5131b1d --- /dev/null +++ b/evaluation_examples/test_prompt_message.json @@ -0,0 +1,11 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "fc6d8143-9452-4171-9459-7f515143419a", + "47543840-672a-467d-80df-8f7c3b9788c9" + ], + "multi_apps": [ + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_proxy.json b/evaluation_examples/test_proxy.json new file mode 100644 index 0000000..96415e3 --- /dev/null +++ b/evaluation_examples/test_proxy.json @@ -0,0 +1,11 @@ +{ + "chrome": [ + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "fc6d8143-9452-4171-9459-7f515143419a", + "47543840-672a-467d-80df-8f7c3b9788c9" + ], + "multi_apps": [ + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index f3b8a65..09e9657 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -38,8 +38,9 @@ OPERATOR_PROMPT = """Here are some helpful tips: (3) My computer's password is “password”, feel free to use it when you need sudo rights. (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”. (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one. -(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation. -(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.""" +(6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”. +(7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. Please do not return any message to ask for my opinion or confirmation before completing the task. +""" class Action: @@ -679,7 +680,9 @@ class OpenAICUAAgent: state_correct = False # if action_exit and thought_exit: # state_correct = True - if action_exit and not message_exit: + #if action_exit and not message_exit: + # state_correct = True + if action_exit: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit) diff --git a/run_operator.sh b/run_operator.sh index b6a3750..f46d17f 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,7 +2,7 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_all_error \ ---test_all_meta_path evaluation_examples/test_all_error.json \ +--result_dir ./results_proxy \ +--test_all_meta_path evaluation_examples/test_proxy.json \ --region us-east-1 \ --max_steps 150 From e61acece8444c717918e6733586c32d7be6c2f2b Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 7 Jun 2025 05:30:40 +0000 Subject: [PATCH 6/7] problems from the community --- evaluation_examples/test_bug.json | 8 -------- evaluation_examples/test_bug_from_community.json | 5 +++++ 2 files changed, 5 insertions(+), 8 deletions(-) delete mode 100644 evaluation_examples/test_bug.json create mode 100644 evaluation_examples/test_bug_from_community.json diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json deleted file mode 100644 index da94c93..0000000 --- a/evaluation_examples/test_bug.json +++ /dev/null @@ -1,8 +0,0 @@ -{ - "chrome": [ - "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc" - ], - "multi_apps": [ - "d1acdb87-bb67-4f30-84aa-990e56a09c92" - ] -} \ No newline at end of file diff --git a/evaluation_examples/test_bug_from_community.json b/evaluation_examples/test_bug_from_community.json new file mode 100644 index 0000000..d6e5531 --- /dev/null +++ b/evaluation_examples/test_bug_from_community.json @@ -0,0 +1,5 @@ +{ + "vlc": [ + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f" + ] +} \ No newline at end of file From 4ade4114da95e7f4fffdee99b553695970696804 Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Sat, 7 Jun 2025 06:50:15 +0000 Subject: [PATCH 7/7] add problems from the community --- run_operator.sh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/run_operator.sh b/run_operator.sh index f46d17f..24bc064 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,7 +2,8 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_proxy \ ---test_all_meta_path evaluation_examples/test_proxy.json \ +--result_dir ./results_small_retest \ +--test_all_meta_path evaluation_examples/test_small.json \ --region us-east-1 \ ---max_steps 150 +--max_steps 150 \ +--num_envs 10