Edit operator prompt, relax agent state check, and add proxy test configuration

2025-06-07 05:21:04 +00:00
parent 64177045b5
commit a146c1e0b7
6 changed files with 32 additions and 7 deletions
--- a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json
+++ b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json
@@ -271,5 +271,5 @@
      }
    }
  },
-  "proxy": false
+  "proxy": true
 }
--- a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json
+++ b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json
@@ -70,5 +70,5 @@
      "dest": "book_list_result.docx"
    }
  },
-  "proxy": false
+  "proxy": true
 }
--- a/evaluation_examples/test_prompt_message.json
+++ b/evaluation_examples/test_prompt_message.json
@@ -0,0 +1,11 @@
+{
+  "chrome": [
+    "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
+    "fc6d8143-9452-4171-9459-7f515143419a",
+    "47543840-672a-467d-80df-8f7c3b9788c9"
+    ],
+    "multi_apps": [
+    "da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
+    "67890eb6-6ce5-4c00-9e3d-fb4972699b06"
+    ]
+}
--- a/evaluation_examples/test_proxy.json
+++ b/evaluation_examples/test_proxy.json
@@ -0,0 +1,11 @@
+{
+  "chrome": [
+    "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
+    "fc6d8143-9452-4171-9459-7f515143419a",
+    "47543840-672a-467d-80df-8f7c3b9788c9"
+    ],
+    "multi_apps": [
+    "da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
+    "67890eb6-6ce5-4c00-9e3d-fb4972699b06"
+    ]
+}
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -38,8 +38,9 @@ OPERATOR_PROMPT = """Here are some helpful tips:
 (3) My computer's password is “password”, feel free to use it when you need sudo rights.
 (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
 (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
-(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
-(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”."""
+(6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.
+(7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. Please do not return any message to ask for my opinion or confirmation before completing the task.
+"""


 class Action:
@@ -679,7 +680,9 @@ class OpenAICUAAgent:
        state_correct = False
        # if action_exit and thought_exit:
        #     state_correct = True
-        if action_exit and not message_exit:   
+        #if action_exit and not message_exit:   
+        #    state_correct = True
+        if action_exit:
            state_correct = True
        if not state_correct:
            logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,7 +2,7 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
--result_dir ./results_all_error \
--test_all_meta_path evaluation_examples/test_all_error.json \
+--result_dir ./results_proxy \
+--test_all_meta_path evaluation_examples/test_proxy.json \
 --region us-east-1 \
 --max_steps 150