diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json new file mode 100644 index 0000000..8f2e521 --- /dev/null +++ b/evaluation_examples/test_bug.json @@ -0,0 +1,5 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test.json b/evaluation_examples/test_small_test.json new file mode 100644 index 0000000..644ea9a --- /dev/null +++ b/evaluation_examples/test_small_test.json @@ -0,0 +1,25 @@ +{ + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c" + ], + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "716a6079-22da-47f1-ba73-c9d58f986a38" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json new file mode 100644 index 0000000..d7ea04b --- /dev/null +++ b/evaluation_examples/test_small_test2.json @@ -0,0 +1,18 @@ +{ + "multi_apps": [ + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index e930091..34098dc 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -301,8 +301,7 @@ class OpenAICUAAgent: Raises: requests.exceptions.RequestException: If the API request fails """ - retry_count = 0 - while retry_count < 3: + while True: try: from openai import OpenAI client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA")) @@ -319,13 +318,8 @@ class OpenAICUAAgent: logger.info(f"Response: {response}") return response except Exception as e: - logger.error(f"OpenAI API error: {str(e)}") - new_screenshot = self.env._get_obs() - new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8') - self.cua_messages[-1]["output"]["image_url"] = f"data:image/png;base64,{new_screenshot_base64}" - retry_count += 1 + logger.error(f"OpenAI API error: {str(e)},will retry in 1s...") time.sleep(1) - raise Exception("Failed to make OpenAI API call after 3 retries") def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]: """Parse a response item from the OpenAI API. diff --git a/monitor/.env b/monitor/.env index 70eb212..a55e5c7 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,10 +2,10 @@ # Do not write any secret keys or sensitive information here. # Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_small_debug.json +TASK_CONFIG_PATH=../evaluation_examples/test_small_test2.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_operator_aws2/pyautogui/screenshot/computer-use-preview -MAX_STEPS=50 +RESULTS_BASE_PATH=../results_operator_timeoutcheck3/pyautogui/screenshot/computer-use-preview +MAX_STEPS=150 FLASK_PORT=80 FLASK_HOST=0.0.0.0 FLASK_DEBUG=true \ No newline at end of file diff --git a/run_operator.sh b/run_operator.sh index 9a84e92..72d187b 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,8 +2,7 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_operator_aws_new \ ---test_all_meta_path evaluation_examples/test_small_debug.json \ +--result_dir ./results_operator_timeoutcheck3 \ +--test_all_meta_path evaluation_examples/test_small_test2.json \ --region us-east-1 \ ---max_steps 150 \ ---num_envs 5 +--max_steps 150