From 98a810d31eb6eeaada7d5cd42031e8dc1a5de8aa Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Mon, 2 Jun 2025 12:11:25 +0000 Subject: [PATCH] edit operator --- desktop_env/providers/aws/manager.py | 3 ++- .../test_small_debug copy.json | 27 +++++++++++++++++++ evaluation_examples/test_small_debug.json | 27 +++++++++++++++++++ mm_agents/openai_cua_agent.py | 12 ++++++++- run_multienv_openaicua.py | 6 ++--- run_operator.sh | 9 +++++++ 6 files changed, 79 insertions(+), 5 deletions(-) create mode 100644 evaluation_examples/test_small_debug copy.json create mode 100644 evaluation_examples/test_small_debug.json create mode 100644 run_operator.sh diff --git a/desktop_env/providers/aws/manager.py b/desktop_env/providers/aws/manager.py index 6e8d6de..632c1b7 100644 --- a/desktop_env/providers/aws/manager.py +++ b/desktop_env/providers/aws/manager.py @@ -25,8 +25,9 @@ REGISTRY_PATH = '.aws_vms' DEFAULT_REGION = "us-east-1" # todo: Add doc for the configuration of image, security group and network interface # todo: public the AMI images +# ami-05e7d7bd279ea4f14 IMAGE_ID_MAP = { - "us-east-1": "ami-05e7d7bd279ea4f14", + "us-east-1": "ami-02fea2e5b77c79c17", "ap-east-1": "ami-0c092a5b8be4116f5", } diff --git a/evaluation_examples/test_small_debug copy.json b/evaluation_examples/test_small_debug copy.json new file mode 100644 index 0000000..eeb2555 --- /dev/null +++ b/evaluation_examples/test_small_debug copy.json @@ -0,0 +1,27 @@ +{ + "multi_apps": [ + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ], + "vs_code": [ + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "276cc624-87ea-4f08-ab93-f770e3790175" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small_debug.json b/evaluation_examples/test_small_debug.json new file mode 100644 index 0000000..eeb2555 --- /dev/null +++ b/evaluation_examples/test_small_debug.json @@ -0,0 +1,27 @@ +{ + "multi_apps": [ + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d" + ], + "os": [ + "5812b315-e7bd-4265-b51f-863c02174c28" + ], + "thunderbird": [ + "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397", + "15c3b339-88f7-4a86-ab16-e71c58dcb01e" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ], + "vs_code": [ + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "276cc624-87ea-4f08-ab93-f770e3790175" + ] +} \ No newline at end of file diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py index 24670bc..e930091 100644 --- a/mm_agents/openai_cua_agent.py +++ b/mm_agents/openai_cua_agent.py @@ -45,6 +45,16 @@ class_ns_windows = "https://accessibility.windows.example.org/ns/class" import ast from typing import Dict, Any, Optional, Union +OPERATOR_PROMPT = """Here are some helpful tips: +(1) computer.clipboard, computer.sync_file, computer.sync.shared_folder, computer.computer_output_citation are disabled. +(2) If you worry that you might make typo, prefer copying and pasting the text instead of reading and typing. +(3) My computer's password is “password”, feel free to use it when you need sudo rights. +(4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”. +(5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one. +(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation. +(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.""" + + class Action: """Action class for the agent.""" def __init__(self, raw_action: Union[Dict, str], action_space: str): @@ -639,7 +649,7 @@ class OpenAICUAAgent: }, { "type": "input_text", - "text": instruction + "text": instruction + "\n" + OPERATOR_PROMPT, } ] }) diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py index 342ce91..d717b3a 100644 --- a/run_multienv_openaicua.py +++ b/run_multienv_openaicua.py @@ -145,14 +145,14 @@ def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]: def run_env_tasks(env_idx: int, env_tasks: dict, args: argparse.Namespace, shared_scores: list): """Run tasks for a single environment.""" - + # ami-05e7d7bd279ea4f14 env = DesktopEnv( path_to_vm=args.path_to_vm, action_space=args.action_space, provider_name="aws", region="us-east-1", - snapshot_name="ami-05e7d7bd279ea4f14", + snapshot_name="ami-02fea2e5b77c79c17", screen_size=(args.screen_width, args.screen_height), headless=args.headless, @@ -326,7 +326,7 @@ def get_result(action_space, use_model, observation_type, result_dir, total_file if __name__ == "__main__": ####### The complete version of the list of examples ####### os.environ["TOKENIZERS_PARALLELISM"] = "false" - + args = config() with open(args.test_all_meta_path, "r", encoding="utf-8") as f: diff --git a/run_operator.sh b/run_operator.sh new file mode 100644 index 0000000..9a84e92 --- /dev/null +++ b/run_operator.sh @@ -0,0 +1,9 @@ +python run_multienv_openaicua.py \ +--headless \ +--observation_type screenshot \ +--model computer-use-preview \ +--result_dir ./results_operator_aws_new \ +--test_all_meta_path evaluation_examples/test_small_debug.json \ +--region us-east-1 \ +--max_steps 150 \ +--num_envs 5