diff --git a/evaluation_examples/test_bug_0608.json b/evaluation_examples/test_bug_0608.json new file mode 100644 index 0000000..baf870b --- /dev/null +++ b/evaluation_examples/test_bug_0608.json @@ -0,0 +1,11 @@ +{ + "libreoffice_writer": [ + "0b17a146-2934-46c7-8727-73ff6b6483e8" + ], + "libreoffice_calc": [ + "1954cced-e748-45c4-9c26-9855b97fbc5e" + ], + "vlc": [ + "fba2c100-79e8-42df-ae74-b592418d54f4" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_impress.json b/evaluation_examples/test_impress.json new file mode 100644 index 0000000..75d58b0 --- /dev/null +++ b/evaluation_examples/test_impress.json @@ -0,0 +1,5 @@ +{ + "libreoffice_impress": [ + "0a211154-fda0-48d0-9274-eaac4ce5486d" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_rest0608.json b/evaluation_examples/test_rest0608.json new file mode 100644 index 0000000..fe70f88 --- /dev/null +++ b/evaluation_examples/test_rest0608.json @@ -0,0 +1,66 @@ +{ + "chrome": [ + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "gimp": [ + "62f7fd55-0687-4a43-b6e1-3eda16fc6252" + ], + "libreoffice_calc": [ + "1954cced-e748-45c4-9c26-9855b97fbc5e", + "035f41ba-6653-43ab-aa63-c86d449d62e5", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "21ab7b40-77c2-4ae6-8321-e00d3a086c73" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "15aece23-a215-4579-91b4-69eec72e18da", + "a434992a-89df-4577-925c-0c58b747f0f4", + "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", + "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", + "04578141-1d42-4146-b9cf-6fab4ce5fd74" + ], + "libreoffice_writer": [ + "0b17a146-2934-46c7-8727-73ff6b6483e8", + "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", + "ecc2413d-8a48-416e-a3a2-d30106ca36cb" + ], + "multi_apps": [ + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", 
+ "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "869de13e-bef9-4b91-ba51-f6708c40b096", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "873cafdd-a581-47f6-8b33-b9696ddb7b05", + "b337d106-053f-4d37-8da0-7f9c4043a66b", + "20236825-b5df-46e7-89bf-62e1d640a897", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "f5c13cdd-205c-4719-a562-348ae5cd1d91", + "7ff48d5b-2df2-49da-b500-a5150ffc7f18", + "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56", + "788b3701-3ec9-4b67-b679-418bfa726c22", + "d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "aceb0368-56b8-4073-b70e-3dc9aee184e0", + "22a4636f-8179-4357-8e87-d1743ece1f81" + ], + "thunderbird": [ + "08c73485-7c6d-4681-999d-919f5c32dcfa" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "215dfd39-f493-4bc3-a027-8a97d72c61bf" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae" + ] +} \ No newline at end of file diff --git a/lib_run_single.py b/lib_run_single.py index fd69583..4256bdc 100644 --- a/lib_run_single.py +++ b/lib_run_single.py @@ -63,6 +63,43 @@ def setup_logger(example, example_result_dir): runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log"))) return runtime_logger + +def run_single_example_human(env, example, max_steps, instruction, args, example_result_dir, scores): + """Record a task's initial state and score it without running an agent. + + Resets ``env`` to ``example``, sleeps 60 seconds, saves the first + screenshot and a one-line trajectory record under ``example_result_dir``, + then calls ``env.evaluate()`` and appends the result to ``scores``. + This function itself sends no actions to the environment. + + ``max_steps`` and ``args`` are accepted but not used here. + """ + # Called only for its side effect of attaching the runtime.log handler. + setup_logger(example, example_result_dir) + env.reset(task_config=example) + time.sleep(60) # Wait for the environment to be ready + obs = env._get_obs() # Get the initial observation + + # Save initial screenshot + with open(os.path.join(example_result_dir, "initial_state.png"), "wb") as _f: + _f.write(obs['screenshot']) + + # Save trajectory information + with open(os.path.join(example_result_dir, "traj.jsonl"), "a", encoding="utf-8") 
as f: + f.write(json.dumps({ + "instruction": instruction, + "initial_state": "initial_state.png" + })) + f.write("\n") + + # Evaluate the result + result = env.evaluate() + logger.info("Result: %.2f", result) + scores.append(result) + with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f: + f.write(f"{result}\n") + + def run_single_example_openaicua(agent, env, example, max_steps, instruction, args, example_result_dir, scores): runtime_logger = setup_logger(example, example_result_dir) agent.reset(runtime_logger) diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 4e95cef..e22c858 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -8,6 +8,7 @@ from io import BytesIO from typing import Dict, List from PIL import Image +from openai import OpenAI, APIError, RateLimitError, Timeout from typing import Any, Optional, Union, Tuple from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \ @@ -35,7 +36,7 @@ from typing import Dict, Any, Optional, Union OPERATOR_PROMPT = """Here are some helpful tips: (1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled. (2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing. -(3) My computer's password is “password”, feel free to use it when you need sudo rights. +(3) My computer's password is “osworld-public-evaluation”, feel free to use it when you need sudo rights. (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”. (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one. (6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation. 
@@ -299,9 +300,10 @@ class OpenAICUAAgent: Raises: requests.exceptions.RequestException: If the API request fails """ - while True: + MAX_RETRIES = 100 + retry_count = 0 + while retry_count < MAX_RETRIES: try: - from openai import OpenAI client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA")) response = client.responses.create( model=self.model, @@ -317,6 +319,7 @@ class OpenAICUAAgent: return response except Exception as e: logger.error(f"OpenAI API error: {str(e)}") + print(f"OpenAI API error: {str(e)}") new_screenshot = self.env._get_obs() new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8') @@ -335,7 +338,9 @@ class OpenAICUAAgent: logger.warning("Unknown message structure, cannot update screenshot") retry_count += 1 - time.sleep(1) + time.sleep(min(30, 2 ** retry_count)) + logger.critical("Max retries exceeded for OpenAI API") + raise RuntimeError("OpenAI API failed too many times") def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]: """Parse a response item from the OpenAI API. diff --git a/monitor/.env b/monitor/.env index 3984b1b..9b22450 100644 --- a/monitor/.env +++ b/monitor/.env @@ -2,9 +2,9 @@ # Do not write any secret keys or sensitive information here. 
# Monitor configuration -TASK_CONFIG_PATH=../evaluation_examples/test_all.json +TASK_CONFIG_PATH=../evaluation_examples/test_rest0608.json EXAMPLES_BASE_PATH=../evaluation_examples/examples -RESULTS_BASE_PATH=../results_all_ifmessage_promptnochange +RESULTS_BASE_PATH=../results_all_ifnoaction_promptnochange_adderror_error2 ACTION_SPACE=pyautogui OBSERVATION_TYPE=screenshot MODEL_NAME=computer-use-preview diff --git a/run_human.sh b/run_human.sh new file mode 100644 index 0000000..90a7564 --- /dev/null +++ b/run_human.sh @@ -0,0 +1,8 @@ +python run_multienv_human.py \ + --headless \ + --observation_type screenshot \ + --result_dir ./results_human_impress \ + --test_all_meta_path evaluation_examples/test_impress.json \ + --region us-east-1 \ + --max_steps 3 \ + --num_envs 1 diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index 195ea6c..0bed267 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -257,7 +257,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: f.write( json.dumps( - {"Error": f"Time limit exceeded in {domain}/{example_id}"} + {"Error": f"{domain}/{example_id} - {e}"} ) ) f.write("\n") @@ -442,7 +442,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file if __name__ == "__main__": ####### The complete version of the list of examples ####### os.environ["TOKENIZERS_PARALLELISM"] = "false" - + # Register signal handlers for graceful termination signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal diff --git a/run_operator.sh b/run_operator.sh index b054a78..668d7c7 100644 --- a/run_operator.sh +++ b/run_operator.sh @@ -2,8 +2,8 @@ python run_multienv_openaicua.py \ --headless \ --observation_type screenshot \ --model computer-use-preview \ ---result_dir ./results_all_ifmessage_promptnochange \ 
---test_all_meta_path evaluation_examples/test_all.json \ +--result_dir ./results_all_ifnoaction_promptnochange_adderror_error2 \ +--test_all_meta_path evaluation_examples/test_rest0608.json \ --region us-east-1 \ --max_steps 150 \ ---num_envs 25 +--num_envs 1 diff --git a/run_operator_impress.sh b/run_operator_impress.sh new file mode 100644 index 0000000..468fd50 --- /dev/null +++ b/run_operator_impress.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ + --headless \ + --observation_type screenshot \ + --model computer-use-preview \ + --result_dir ./results_vlc_retest \ + --test_all_meta_path evaluation_examples/test_impress.json \ + --region us-east-1 \ + --max_steps 3 \ + --num_envs 1