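Fix OpenAI CUA retry loop (initialize retry_count, cap retries, add backoff); add human-eval runner and 0608 test subsets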

2025-06-09 04:20:59 +00:00
parent 3e541bb393
commit aee1207fff
10 changed files with 142 additions and 11 deletions
--- a/evaluation_examples/test_bug_0608.json
+++ b/evaluation_examples/test_bug_0608.json
@@ -0,0 +1,11 @@
+{
+  "libreoffice_writer": [
+    "0b17a146-2934-46c7-8727-73ff6b6483e8"
+  ],
+  "libreoffice_calc": [
+    "1954cced-e748-45c4-9c26-9855b97fbc5e"
+  ],
+  "vlc": [
+    "fba2c100-79e8-42df-ae74-b592418d54f4"
+  ]
+}
--- a/evaluation_examples/test_impress.json
+++ b/evaluation_examples/test_impress.json
@@ -0,0 +1,5 @@
+{
+    "libreoffice_impress": [
+        "0a211154-fda0-48d0-9274-eaac4ce5486d"
+    ]
+}
--- a/evaluation_examples/test_rest0608.json
+++ b/evaluation_examples/test_rest0608.json
@@ -0,0 +1,66 @@
+{
+  "chrome": [
+    "1704f00f-79e6-43a7-961b-cedd3724d5fd",
+    "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
+    "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
+  ],
+  "gimp": [
+    "62f7fd55-0687-4a43-b6e1-3eda16fc6252"
+  ],
+  "libreoffice_calc": [
+    "1954cced-e748-45c4-9c26-9855b97fbc5e",
+    "035f41ba-6653-43ab-aa63-c86d449d62e5",
+    "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
+    "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
+    "21ab7b40-77c2-4ae6-8321-e00d3a086c73"
+  ],
+  "libreoffice_impress": [
+    "5d901039-a89c-4bfb-967b-bf66f4df075e",
+    "15aece23-a215-4579-91b4-69eec72e18da",
+    "a434992a-89df-4577-925c-0c58b747f0f4",
+    "af2d657a-e6b3-4c6a-9f67-9e3ed015974c",
+    "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a",
+    "04578141-1d42-4146-b9cf-6fab4ce5fd74"
+  ],
+  "libreoffice_writer": [
+    "0b17a146-2934-46c7-8727-73ff6b6483e8",
+    "adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
+    "ecc2413d-8a48-416e-a3a2-d30106ca36cb"
+  ],
+  "multi_apps": [
+    "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
+    "b52b40a5-ad70-4c53-b5b0-5650a8387052",
+    "f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
+    "337d318b-aa07-4f4f-b763-89d9a2dd013f",
+    "869de13e-bef9-4b91-ba51-f6708c40b096",
+    "3a93cae4-ad3e-403e-8c12-65303b271818",
+    "9219480b-3aed-47fc-8bac-d2cffc5849f7",
+    "7e287123-70ca-47b9-8521-47db09b69b14",
+    "e2392362-125e-4f76-a2ee-524b183a3412",
+    "873cafdd-a581-47f6-8b33-b9696ddb7b05",
+    "b337d106-053f-4d37-8da0-7f9c4043a66b",
+    "20236825-b5df-46e7-89bf-62e1d640a897",
+    "02ce9a50-7af2-47ed-8596-af0c230501f8",
+    "3e3fc409-bff3-4905-bf16-c968eee3f807",
+    "f5c13cdd-205c-4719-a562-348ae5cd1d91",
+    "7ff48d5b-2df2-49da-b500-a5150ffc7f18",
+    "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56",
+    "788b3701-3ec9-4b67-b679-418bfa726c22",
+    "d68204bf-11c1-4b13-b48b-d303c73d4bf6",
+    "aceb0368-56b8-4073-b70e-3dc9aee184e0",
+    "22a4636f-8179-4357-8e87-d1743ece1f81"
+  ],
+  "thunderbird": [
+    "08c73485-7c6d-4681-999d-919f5c32dcfa"
+  ],
+  "vlc": [
+    "59f21cfb-0120-4326-b255-a5b827b38967",
+    "efcf0d81-0835-4880-b2fd-d866e8bc2294",
+    "9195653c-f4aa-453d-aa95-787f6ccfaae9",
+    "215dfd39-f493-4bc3-a027-8a97d72c61bf"
+  ],
+  "vs_code": [
+    "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
+    "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae"
+  ]
+}
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -63,6 +63,33 @@ def setup_logger(example, example_result_dir):
    runtime_logger.addHandler(logging.FileHandler(os.path.join(example_result_dir, "runtime.log")))
    return runtime_logger

+def run_single_example_human(env, example, max_steps, instruction, args, example_result_dir, scores):  # Agent-free run: reset, snapshot, evaluate. max_steps/args are unused here.
+    runtime_logger = setup_logger(example, example_result_dir)  # Not referenced again in this function
+    env.reset(task_config=example)
+    time.sleep(60) # Fixed 60 s pause so the environment has time to become ready
+    obs = env._get_obs() # Get the initial observation
+    
+    # Save the initial screenshot (written in binary mode; presumably raw PNG bytes — verify)
+    with open(os.path.join(example_result_dir, "initial_state.png"), "wb") as _f:
+        _f.write(obs['screenshot'])
+    
+    # Append one JSON line holding the instruction and the screenshot's file name
+    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+        f.write(json.dumps({
+            "instruction": instruction,
+            "initial_state": "initial_state.png"
+        }))
+        f.write("\n")
+    
+    # Evaluate the environment as it stands (no agent actions are issued in this function)
+    result = env.evaluate()
+    logger.info("Result: %.2f", result)  # `logger` is not defined in this block — presumably module-level; verify
+    scores.append(result)  # Caller-supplied list is mutated in place
+    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+        f.write(f"{result}\n")
+
+
+
 def run_single_example_openaicua(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    runtime_logger = setup_logger(example, example_result_dir)
    agent.reset(runtime_logger)
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -8,6 +8,7 @@ from io import BytesIO
 from typing import Dict, List

 from PIL import Image
+from openai import OpenAI, APIError, RateLimitError, Timeout
 from typing import Any, Optional, Union, Tuple

 from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
@@ -35,7 +36,7 @@ from typing import Dict, Any, Optional, Union
 OPERATOR_PROMPT = """Here are some helpful tips:
 (1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled.
 (2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing.
-(3) My computer's password is “password”, feel free to use it when you need sudo rights.
+(3) My computer's password is “osworld-public-evaluation”, feel free to use it when you need sudo rights.
 (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
 (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
 (6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
@@ -299,9 +300,10 @@ class OpenAICUAAgent:
        Raises:
            requests.exceptions.RequestException: If the API request fails
        """
-        while True:
+        MAX_RETRIES = 100
+        retry_count = 0
+        while retry_count < MAX_RETRIES:
            try:
-                from openai import OpenAI
                client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA"))
                response = client.responses.create(
                    model=self.model,
@@ -317,6 +319,7 @@ class OpenAICUAAgent:
                return response
            except Exception as e:
                logger.error(f"OpenAI API error: {str(e)}")
+                print(f"OpenAI API error: {str(e)}")
                new_screenshot = self.env._get_obs()
                new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8')
                
@@ -335,7 +338,9 @@ class OpenAICUAAgent:
                    logger.warning("Unknown message structure, cannot update screenshot")
                
                retry_count += 1
-                time.sleep(1)
+                time.sleep(min(30, 2 ** retry_count))
+        logger.critical("Max retries exceeded for OpenAI API")
+        raise RuntimeError("OpenAI API failed too many times")
    
    def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]:
        """Parse a response item from the OpenAI API.
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.

 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_all.json
+TASK_CONFIG_PATH=../evaluation_examples/test_rest0608.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_all_ifmessage_promptnochange
+RESULTS_BASE_PATH=../results_all_ifnoaction_promptnochange_adderror_error2
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
--- a/run_human.sh
+++ b/run_human.sh
@@ -0,0 +1,8 @@
+python run_multienv_human.py \
+    --headless \
+    --observation_type screenshot \
+    --result_dir ./results_human_impress \
+    --test_all_meta_path evaluation_examples/test_impress.json \
+    --region us-east-1 \
+    --max_steps 3 \
+    --num_envs 1
--- a/run_multienv_openaicua.py
+++ b/run_multienv_openaicua.py
@@ -257,7 +257,7 @@ def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, share
                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                        f.write(
                            json.dumps(
-                                {"Error": f"Time limit exceeded in {domain}/{example_id}"}
+                                {"Error": f"{domain}/{example_id} - {e}"}
                            )
                        )
                        f.write("\n")
@@ -442,7 +442,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file
 if __name__ == "__main__":
    ####### The complete version of the list of examples #######
    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-   
+    
    # Register signal handlers for graceful termination
    signal.signal(signal.SIGINT, signal_handler)  # Handle Ctrl+C
    signal.signal(signal.SIGTERM, signal_handler)  # Handle termination signal
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,8 +2,8 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
--result_dir ./results_all_ifmessage_promptnochange \
--test_all_meta_path evaluation_examples/test_all.json \
+--result_dir ./results_all_ifnoaction_promptnochange_adderror_error2 \
+--test_all_meta_path evaluation_examples/test_rest0608.json \
 --region us-east-1 \
 --max_steps 150 \
--num_envs 25 
+--num_envs 1
--- a/run_operator_impress.sh
+++ b/run_operator_impress.sh
@@ -0,0 +1,9 @@
+python run_multienv_openaicua.py \
+    --headless \
+    --observation_type screenshot \
+    --model computer-use-preview \
+    --result_dir ./results_vlc_retest \
+    --test_all_meta_path evaluation_examples/test_impress.json \
+    --region us-east-1 \
+    --max_steps 3 \
+    --num_envs 1