diff --git a/dataimpulse_proxy_config.json b/dataimpulse_proxy_config.json new file mode 100644 index 0000000..596e171 --- /dev/null +++ b/dataimpulse_proxy_config.json @@ -0,0 +1,13 @@ +[ + { + "host": "gw.dataimpulse.com", + "port": 823, + "username": "fba5ac061fe18be70c6c", + "password": "3b5669b6640fc80c", + "protocol": "http", + "provider": "dataimpulse", + "type": "residential", + "country": "US", + "note": "Dataimpulse Residential Proxy --- Tianbao Xie" + } +] \ No newline at end of file diff --git a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json index a569c9d..254244b 100644 --- a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json +++ b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json @@ -271,5 +271,5 @@ } } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json index 8e42989..91c7530 100644 --- a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json +++ b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json @@ -139,5 +139,5 @@ ] } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json index d56f64f..7b511db 100644 --- a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json +++ b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json @@ -70,5 +70,5 @@ "dest": "book_list_result.docx" } }, - "proxy": false + "proxy": true } \ No newline at end of file diff --git a/evaluation_examples/test_all_error.json b/evaluation_examples/test_all_error.json new file mode 100644 index 0000000..fe33a13 --- /dev/null +++ b/evaluation_examples/test_all_error.json @@ -0,0 +1,38 @@ +{ + "chrome": [ + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "libreoffice_calc": [ + "2bd59342-0664-4ccb-ba87-79379096cc08", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14" + ], + "libreoffice_impress": [ + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7" + ], + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "0c825995-5b70-4526-b663-113f4c999dd2", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "e2392362-125e-4f76-a2ee-524b183a3412", + "22a4636f-8179-4357-8e87-d1743ece1f81" + ], + "os": [ + "a462a795-fdc7-4b23-b689-e8b6df786b78" + ], + "vlc": [ + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_all_error2.json b/evaluation_examples/test_all_error2.json new file mode 100644 index 0000000..4156b91 --- /dev/null +++ b/evaluation_examples/test_all_error2.json @@ -0,0 +1,14 @@ +{ + "libreoffice_calc": [ + "7efeb4b1-3d19-4762-b163-63328d66303b" + ], + "libreoffice_impress": [ + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb" + ], + "multi_apps": [ + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "e2392362-125e-4f76-a2ee-524b183a3412" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_bug_from_community.json b/evaluation_examples/test_bug_from_community.json new file mode 100644 index 0000000..d6e5531 --- /dev/null +++ b/evaluation_examples/test_bug_from_community.json @@ -0,0 +1,5 @@ +{ + "vlc": [ + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_prompt_message.json b/evaluation_examples/test_prompt_message.json new file mode 100644 index 0000000..5131b1d --- /dev/null +++ b/evaluation_examples/test_prompt_message.json @@ -0,0 +1,11 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "fc6d8143-9452-4171-9459-7f515143419a", + "47543840-672a-467d-80df-8f7c3b9788c9" + ], + "multi_apps": [ + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_proxy.json b/evaluation_examples/test_proxy.json new file mode 100644 index 0000000..96415e3 --- /dev/null +++ b/evaluation_examples/test_proxy.json @@ -0,0 +1,11 @@ +{ + "chrome": [ + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "fc6d8143-9452-4171-9459-7f515143419a", + "47543840-672a-467d-80df-8f7c3b9788c9" + ], + "multi_apps": [ + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test.json b/evaluation_examples/test_small_test.json new file mode 100644 index 0000000..644ea9a --- /dev/null +++ b/evaluation_examples/test_small_test.json @@ -0,0 +1,25 @@ +{ + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c" + ], + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "716a6079-22da-47f1-ba73-c9d58f986a38" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test2 copy.json b/evaluation_examples/test_small_test2 copy.json new file mode 100644 index 0000000..d7ea04b --- /dev/null +++ b/evaluation_examples/test_small_test2 copy.json @@ -0,0 +1,18 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json new file mode 100644 index 0000000..b08581a --- /dev/null +++ b/evaluation_examples/test_small_test2.json @@ -0,0 +1,5 @@ +{ + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index d3fefbd..09e9657 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -38,8 +38,9 @@ OPERATOR_PROMPT = """Here are some helpful tips: (3) My computer's password is “password”, feel free to use it when you need sudo rights. (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”. (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one. -(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation. -(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.""" +(6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”. +(7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. Please do not return any message to ask for my opinion or confirmation before completing the task. +""" class Action: @@ -288,8 +289,7 @@ class OpenAICUAAgent: Raises: requests.exceptions.RequestException: If the API request fails """ - retry_count = 0 - while retry_count < 3: + while True: try: from openai import OpenAI client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA")) @@ -326,7 +326,6 @@ class OpenAICUAAgent: retry_count += 1 time.sleep(1) - raise Exception("Failed to make OpenAI API call after 3 retries") def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]: """Parse a response item from the OpenAI API. @@ -681,7 +680,9 @@ class OpenAICUAAgent: state_correct = False # if action_exit and thought_exit: # state_correct = True - if action_exit and not message_exit: + #if action_exit and not message_exit: + # state_correct = True + if action_exit: state_correct = True if not state_correct: logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit) diff --git a/monitor/.env b/monitor/.env index a6bfc16..dee5ca8 100644 --- a/monitor/.env +++ b/monitor/.env @@ -11,4 +11,4 @@ MODEL_NAME=computer-use-preview MAX_STEPS=150 FLASK_PORT=80 FLASK_HOST=0.0.0.0 -FLASK_DEBUG=true +FLASK_DEBUG=true \ No newline at end of file diff --git a/run_operator.sh b/run_operator.sh index 9a84e92..24bc064 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,8 +2,8 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_operator_aws_new \ ---test_all_meta_path evaluation_examples/test_small_debug.json \ +--result_dir ./results_small_retest \ +--test_all_meta_path evaluation_examples/test_small.json \ --region us-east-1 \ --max_steps 150 \ ---num_envs 5 +--num_envs 10