From b87cbe69e5d81e4bdb32a0e9ef0edd53cf1c9485 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Mon, 2 Jun 2025 13:34:20 +0000
Subject: [PATCH 1/7] monitor: point .env at test_small_debug.json and results_operator_aws2

---
 monitor/.env | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/monitor/.env b/monitor/.env
index d28ba42..70eb212 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_small.json
+TASK_CONFIG_PATH=../evaluation_examples/test_small_debug.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws/pyautogui/screenshot/computer-use-preview
+RESULTS_BASE_PATH=../results_operator_aws2/pyautogui/screenshot/computer-use-preview
 MAX_STEPS=50
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0

From b211df33856ac43591b4acbfc69849ecbc0e9fd1 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Wed, 4 Jun 2025 10:23:45 +0000
Subject: [PATCH 2/7] openai_cua_agent: retry OpenAI API calls indefinitely; add timeout test lists

---
 evaluation_examples/test_bug.json         |  5 +++++
 evaluation_examples/test_small_test.json  | 25 +++++++++++++++++++++++
 evaluation_examples/test_small_test2.json | 18 ++++++++++++++++
 mm_agents/openai_cua_agent.py             | 10 ++-------
 monitor/.env                              |  6 +++---
 run_operator.sh                           |  7 +++----
 6 files changed, 56 insertions(+), 15 deletions(-)
 create mode 100644 evaluation_examples/test_bug.json
 create mode 100644 evaluation_examples/test_small_test.json
 create mode 100644 evaluation_examples/test_small_test2.json

diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json
new file mode 100644
index 0000000..8f2e521
--- /dev/null
+++ b/evaluation_examples/test_bug.json
@@ -0,0 +1,5 @@
+{
+  "multi_apps": [
+    "46407397-a7d5-4c6b-92c6-dbe038b1457b"
+  ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_small_test.json b/evaluation_examples/test_small_test.json
new file mode 100644
index 0000000..644ea9a
--- /dev/null
+++ b/evaluation_examples/test_small_test.json
@@ -0,0 +1,25 @@
+{
+  "libreoffice_writer": [
+    "0810415c-bde4-4443-9047-d5f70165a697",
+    "0a0faba3-5580-44df-965d-f562a99b291c"
+  ],
+  "multi_apps": [
+    "46407397-a7d5-4c6b-92c6-dbe038b1457b",
+    "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
+    "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
+    "c867c42d-a52d-4a24-8ae3-f75d256b5618",
+    "b5062e3e-641c-4e3a-907b-ac864d2e7652",
+    "716a6079-22da-47f1-ba73-c9d58f986a38"
+  ],
+  "os": [
+    "5812b315-e7bd-4265-b51f-863c02174c28"
+  ],
+  "thunderbird": [
+    "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397",
+    "15c3b339-88f7-4a86-ab16-e71c58dcb01e"
+  ],
+  "vlc": [
+    "59f21cfb-0120-4326-b255-a5b827b38967",
+    "8f080098-ddb1-424c-b438-4e96e5e4786e"
+  ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json
new file mode 100644
index 0000000..d7ea04b
--- /dev/null
+++ b/evaluation_examples/test_small_test2.json
@@ -0,0 +1,18 @@
+{
+  "multi_apps": [
+    "46407397-a7d5-4c6b-92c6-dbe038b1457b",
+    "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
+    "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
+    "c867c42d-a52d-4a24-8ae3-f75d256b5618"
+  ],
+  "os": [
+    "5812b315-e7bd-4265-b51f-863c02174c28"
+  ],
+  "thunderbird": [
+    "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397",
+    "15c3b339-88f7-4a86-ab16-e71c58dcb01e"
+  ],
+  "vlc": [
+    "59f21cfb-0120-4326-b255-a5b827b38967"
+  ]
+}
\ No newline at end of file
diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py
index e930091..34098dc 100644
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -301,8 +301,7 @@ class OpenAICUAAgent:
         Raises:
             requests.exceptions.RequestException: If the API request fails
         """
-        retry_count = 0
-        while retry_count < 3:
+        while True:
             try:
                 from openai import OpenAI
                 client = OpenAI(api_key=os.getenv("OPENAI_API_KEY_CUA"))
@@ -319,13 +318,8 @@ class OpenAICUAAgent:
                 logger.info(f"Response: {response}")
                 return response
             except Exception as e:
-                logger.error(f"OpenAI API error: {str(e)}")
-                new_screenshot = self.env._get_obs()
-                new_screenshot_base64 = base64.b64encode(new_screenshot["screenshot"]).decode('utf-8')
-                self.cua_messages[-1]["output"]["image_url"] = f"data:image/png;base64,{new_screenshot_base64}"
-                retry_count += 1
+                logger.error(f"OpenAI API error: {str(e)}，will retry in 1s...")
                 time.sleep(1)
-        raise Exception("Failed to make OpenAI API call after 3 retries")
     
     def _handle_item(self, item: Dict[str, Any]) -> Optional[Union[str, Dict[str, Any]]]:
         """Parse a response item from the OpenAI API.
diff --git a/monitor/.env b/monitor/.env
index 70eb212..a55e5c7 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,10 +2,10 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_small_debug.json
+TASK_CONFIG_PATH=../evaluation_examples/test_small_test2.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_aws2/pyautogui/screenshot/computer-use-preview
-MAX_STEPS=50
+RESULTS_BASE_PATH=../results_operator_timeoutcheck3/pyautogui/screenshot/computer-use-preview
+MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
 FLASK_DEBUG=true
\ No newline at end of file
diff --git a/run_operator.sh b/run_operator.sh
index 9a84e92..72d187b 100644
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,8 +2,7 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
---result_dir ./results_operator_aws_new \
---test_all_meta_path evaluation_examples/test_small_debug.json \
+--result_dir ./results_operator_timeoutcheck3 \
+--test_all_meta_path evaluation_examples/test_small_test2.json \
 --region us-east-1 \
---max_steps 150 \
---num_envs 5
+--max_steps 150

From 71578d994e21b67e586110db4573e3b304fb1936 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Thu, 5 Jun 2025 13:29:16 +0000
Subject: [PATCH 3/7] trim test_small_test2.json to the vlc task and retarget test_bug.json

---
 evaluation_examples/test_bug.json              |  4 ++--
 evaluation_examples/test_small_test2 copy.json | 18 ++++++++++++++++++
 evaluation_examples/test_small_test2.json      | 13 -------------
 3 files changed, 20 insertions(+), 15 deletions(-)
 create mode 100644 evaluation_examples/test_small_test2 copy.json

diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json
index 8f2e521..5f5d9ed 100644
--- a/evaluation_examples/test_bug.json
+++ b/evaluation_examples/test_bug.json
@@ -1,5 +1,5 @@
 {
-  "multi_apps": [
-    "46407397-a7d5-4c6b-92c6-dbe038b1457b"
+  "chrome": [
+    "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
   ]
 }
\ No newline at end of file
diff --git a/evaluation_examples/test_small_test2 copy.json b/evaluation_examples/test_small_test2 copy.json
new file mode 100644
index 0000000..d7ea04b
--- /dev/null
+++ b/evaluation_examples/test_small_test2 copy.json	
@@ -0,0 +1,18 @@
+{
+  "multi_apps": [
+    "46407397-a7d5-4c6b-92c6-dbe038b1457b",
+    "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
+    "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
+    "c867c42d-a52d-4a24-8ae3-f75d256b5618"
+  ],
+  "os": [
+    "5812b315-e7bd-4265-b51f-863c02174c28"
+  ],
+  "thunderbird": [
+    "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397",
+    "15c3b339-88f7-4a86-ab16-e71c58dcb01e"
+  ],
+  "vlc": [
+    "59f21cfb-0120-4326-b255-a5b827b38967"
+  ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_small_test2.json b/evaluation_examples/test_small_test2.json
index d7ea04b..b08581a 100644
--- a/evaluation_examples/test_small_test2.json
+++ b/evaluation_examples/test_small_test2.json
@@ -1,17 +1,4 @@
 {
-  "multi_apps": [
-    "46407397-a7d5-4c6b-92c6-dbe038b1457b",
-    "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
-    "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
-    "c867c42d-a52d-4a24-8ae3-f75d256b5618"
-  ],
-  "os": [
-    "5812b315-e7bd-4265-b51f-863c02174c28"
-  ],
-  "thunderbird": [
-    "dfac9ee8-9bc4-4cdc-b465-4a4bfcd2f397",
-    "15c3b339-88f7-4a86-ab16-e71c58dcb01e"
-  ],
   "vlc": [
     "59f21cfb-0120-4326-b255-a5b827b38967"
   ]

From 4ea24ddfd31fd74ef3ee194179a533b80c96a0b4 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Fri, 6 Jun 2025 09:41:22 +0000
Subject: [PATCH 4/7] add proxy

---
 dataimpulse_proxy_config.json                 | 13 ++++
 .../providers/aws/provider_with_proxy.py      | 76 +++++++++----------
 .../d1acdb87-bb67-4f30-84aa-990e56a09c92.json |  2 +-
 evaluation_examples/test_all_error.json       | 38 ++++++++++
 evaluation_examples/test_all_error2.json      | 14 ++++
 evaluation_examples/test_bug.json             |  5 +-
 monitor/.env                                  |  4 +-
 run_operator.sh                               |  4 +-
 8 files changed, 112 insertions(+), 44 deletions(-)
 create mode 100644 dataimpulse_proxy_config.json
 create mode 100644 evaluation_examples/test_all_error.json
 create mode 100644 evaluation_examples/test_all_error2.json

diff --git a/dataimpulse_proxy_config.json b/dataimpulse_proxy_config.json
new file mode 100644
index 0000000..596e171
--- /dev/null
+++ b/dataimpulse_proxy_config.json
@@ -0,0 +1,13 @@
+[
+    {
+        "host": "gw.dataimpulse.com",
+        "port": 823,
+        "username": "fba5ac061fe18be70c6c",
+        "password": "3b5669b6640fc80c",
+        "protocol": "http",
+        "provider": "dataimpulse",
+        "type": "residential",
+        "country": "US",
+        "note": "Dataimpulse Residential Proxy --- Tianbao Xie"
+    }
+] 
\ No newline at end of file
diff --git a/desktop_env/providers/aws/provider_with_proxy.py b/desktop_env/providers/aws/provider_with_proxy.py
index 309e71b..0f5860a 100644
--- a/desktop_env/providers/aws/provider_with_proxy.py
+++ b/desktop_env/providers/aws/provider_with_proxy.py
@@ -47,48 +47,48 @@ class AWSProviderWithProxy(Provider):
         proxy_url = self._format_proxy_url(self.current_proxy)
         
         user_data_script = f"""#!/bin/bash
-# 配置系统代理
-echo 'export http_proxy={proxy_url}' >> /etc/environment
-echo 'export https_proxy={proxy_url}' >> /etc/environment
-echo 'export HTTP_PROXY={proxy_url}' >> /etc/environment  
-echo 'export HTTPS_PROXY={proxy_url}' >> /etc/environment
+            # 配置系统代理
+            echo 'export http_proxy={proxy_url}' >> /etc/environment
+            echo 'export https_proxy={proxy_url}' >> /etc/environment
+            echo 'export HTTP_PROXY={proxy_url}' >> /etc/environment  
+            echo 'export HTTPS_PROXY={proxy_url}' >> /etc/environment
 
-# 配置apt代理
-cat > /etc/apt/apt.conf.d/95proxy << EOF
-Acquire::http::Proxy "{proxy_url}";
-Acquire::https::Proxy "{proxy_url}";
-EOF
+            # 配置apt代理
+            cat > /etc/apt/apt.conf.d/95proxy << EOF
+            Acquire::http::Proxy "{proxy_url}";
+            Acquire::https::Proxy "{proxy_url}";
+            EOF
 
-# 配置chrome/chromium代理
-mkdir -p /etc/opt/chrome/policies/managed
-cat > /etc/opt/chrome/policies/managed/proxy.json << EOF
-{{
-    "ProxyMode": "fixed_servers",
-    "ProxyServer": "{self.current_proxy.host}:{self.current_proxy.port}"
-}}
-EOF
+            # 配置chrome/chromium代理
+            mkdir -p /etc/opt/chrome/policies/managed
+            cat > /etc/opt/chrome/policies/managed/proxy.json << EOF
+            {{
+                "ProxyMode": "fixed_servers",
+                "ProxyServer": "{self.current_proxy.host}:{self.current_proxy.port}"
+            }}
+            EOF
 
-# 配置firefox代理
-mkdir -p /etc/firefox/policies
-cat > /etc/firefox/policies/policies.json << EOF
-{{
-    "policies": {{
-        "Proxy": {{
-            "Mode": "manual",
-            "HTTPProxy": "{self.current_proxy.host}:{self.current_proxy.port}",
-            "HTTPSProxy": "{self.current_proxy.host}:{self.current_proxy.port}",
-            "UseHTTPProxyForAllProtocols": true
-        }}
-    }}
-}}
-EOF
+            # 配置firefox代理
+            mkdir -p /etc/firefox/policies
+            cat > /etc/firefox/policies/policies.json << EOF
+            {{
+                "policies": {{
+                    "Proxy": {{
+                        "Mode": "manual",
+                        "HTTPProxy": "{self.current_proxy.host}:{self.current_proxy.port}",
+                        "HTTPSProxy": "{self.current_proxy.host}:{self.current_proxy.port}",
+                        "UseHTTPProxyForAllProtocols": true
+                    }}
+                }}
+            }}
+            EOF
 
-# 重新加载环境变量
-source /etc/environment
+            # 重新加载环境变量
+            source /etc/environment
 
-# 记录代理配置日志
-echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.port}" >> /var/log/proxy-setup.log
-"""
+            # 记录代理配置日志
+            echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.port}" >> /var/log/proxy-setup.log
+            """
         
         return base64.b64encode(user_data_script.encode()).decode()
 
@@ -99,7 +99,7 @@ echo "$(date): Configured proxy {self.current_proxy.host}:{self.current_proxy.po
         else:
             return f"{proxy.protocol}://{proxy.host}:{proxy.port}"
 
-    def start_emulator(self, path_to_vm: str, headless: bool):
+    def start_emulator(self, path_to_vm: str, headless: bool, os_type: str):
         logger.info("Starting AWS VM with proxy configuration...")
         ec2_client = boto3.client('ec2', region_name=self.region)
 
diff --git a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json
index 8e42989..91c7530 100644
--- a/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json
+++ b/evaluation_examples/examples/multi_apps/d1acdb87-bb67-4f30-84aa-990e56a09c92.json
@@ -139,5 +139,5 @@
       ]
     }
   },
-  "proxy": false
+  "proxy": true
 }
\ No newline at end of file
diff --git a/evaluation_examples/test_all_error.json b/evaluation_examples/test_all_error.json
new file mode 100644
index 0000000..fe33a13
--- /dev/null
+++ b/evaluation_examples/test_all_error.json
@@ -0,0 +1,38 @@
+{
+  "chrome": [
+    "59155008-fe71-45ec-8a8f-dc35497b6aa8",
+    "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
+  ],
+  "libreoffice_calc": [
+    "2bd59342-0664-4ccb-ba87-79379096cc08",
+    "7efeb4b1-3d19-4762-b163-63328d66303b",
+    "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14"
+  ],
+  "libreoffice_impress": [
+    "455d3c66-7dc6-4537-a39a-36d3e9119df7",
+    "3b27600c-3668-4abd-8f84-7bcdebbccbdb",
+    "bf4e9888-f10f-47af-8dba-76413038b73c",
+    "e4ef0baf-4b52-4590-a47e-d4d464cca2d7"
+  ],
+  "multi_apps": [
+    "46407397-a7d5-4c6b-92c6-dbe038b1457b",
+    "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
+    "78aed49a-a710-4321-a793-b611a7c5b56b",
+    "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
+    "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
+    "b52b40a5-ad70-4c53-b5b0-5650a8387052",
+    "0c825995-5b70-4526-b663-113f4c999dd2",
+    "337d318b-aa07-4f4f-b763-89d9a2dd013f",
+    "9219480b-3aed-47fc-8bac-d2cffc5849f7",
+    "e2392362-125e-4f76-a2ee-524b183a3412",
+    "22a4636f-8179-4357-8e87-d1743ece1f81"
+  ],
+  "os": [
+    "a462a795-fdc7-4b23-b689-e8b6df786b78"
+  ],
+  "vlc": [
+    "8f080098-ddb1-424c-b438-4e96e5e4786e",
+    "bba3381f-b5eb-4439-bd9e-80c22218d5a7",
+    "7882ed6e-bece-4bf0-bada-c32dc1ddae72"
+  ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_all_error2.json b/evaluation_examples/test_all_error2.json
new file mode 100644
index 0000000..4156b91
--- /dev/null
+++ b/evaluation_examples/test_all_error2.json
@@ -0,0 +1,14 @@
+{
+  "libreoffice_calc": [
+    "7efeb4b1-3d19-4762-b163-63328d66303b"
+  ],
+  "libreoffice_impress": [
+    "455d3c66-7dc6-4537-a39a-36d3e9119df7",
+    "3b27600c-3668-4abd-8f84-7bcdebbccbdb"
+  ],
+  "multi_apps": [
+    "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
+    "337d318b-aa07-4f4f-b763-89d9a2dd013f",
+    "e2392362-125e-4f76-a2ee-524b183a3412"
+  ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json
index 5f5d9ed..da94c93 100644
--- a/evaluation_examples/test_bug.json
+++ b/evaluation_examples/test_bug.json
@@ -1,5 +1,8 @@
 {
   "chrome": [
-    "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
+    "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc"
+  ],
+  "multi_apps": [
+    "d1acdb87-bb67-4f30-84aa-990e56a09c92"
   ]
 }
\ No newline at end of file
diff --git a/monitor/.env b/monitor/.env
index 0333301..3851fc6 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_small_test2.json
+TASK_CONFIG_PATH=../evaluation_examples/test_all_error.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_operator_timeoutcheck3/pyautogui/screenshot/computer-use-preview
+RESULTS_BASE_PATH=../results_all_error/pyautogui/screenshot/computer-use-preview
 MAX_STEPS=150
 FLASK_PORT=80
 FLASK_HOST=0.0.0.0
diff --git a/run_operator.sh b/run_operator.sh
index 72d187b..b6a3750 100644
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,7 +2,7 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
---result_dir ./results_operator_timeoutcheck3 \
---test_all_meta_path evaluation_examples/test_small_test2.json \
+--result_dir ./results_all_error \
+--test_all_meta_path evaluation_examples/test_all_error.json \
 --region us-east-1 \
 --max_steps 150

From a146c1e0b7789a2d8f03cb2d10eb23789a1908dc Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sat, 7 Jun 2025 05:21:04 +0000
Subject: [PATCH 5/7] edit prompt

---
 .../chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json  |  2 +-
 .../da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json         |  2 +-
 evaluation_examples/test_prompt_message.json          | 11 +++++++++++
 evaluation_examples/test_proxy.json                   | 11 +++++++++++
 mm_agents/openai_cua_agent.py                         |  9 ++++++---
 run_operator.sh                                       |  4 ++--
 6 files changed, 32 insertions(+), 7 deletions(-)
 create mode 100644 evaluation_examples/test_prompt_message.json
 create mode 100644 evaluation_examples/test_proxy.json

diff --git a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json
index a569c9d..254244b 100644
--- a/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json
+++ b/evaluation_examples/examples/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938.json
@@ -271,5 +271,5 @@
       }
     }
   },
-  "proxy": false
+  "proxy": true
 }
\ No newline at end of file
diff --git a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json
index d56f64f..7b511db 100644
--- a/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json
+++ b/evaluation_examples/examples/multi_apps/da52d699-e8d2-4dc5-9191-a2199e0b6a9b.json
@@ -70,5 +70,5 @@
       "dest": "book_list_result.docx"
     }
   },
-  "proxy": false
+  "proxy": true
 }
\ No newline at end of file
diff --git a/evaluation_examples/test_prompt_message.json b/evaluation_examples/test_prompt_message.json
new file mode 100644
index 0000000..5131b1d
--- /dev/null
+++ b/evaluation_examples/test_prompt_message.json
@@ -0,0 +1,11 @@
+{
+  "chrome": [
+    "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
+    "fc6d8143-9452-4171-9459-7f515143419a",
+    "47543840-672a-467d-80df-8f7c3b9788c9"
+    ],
+    "multi_apps": [
+    "da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
+    "67890eb6-6ce5-4c00-9e3d-fb4972699b06"
+    ]
+}
\ No newline at end of file
diff --git a/evaluation_examples/test_proxy.json b/evaluation_examples/test_proxy.json
new file mode 100644
index 0000000..96415e3
--- /dev/null
+++ b/evaluation_examples/test_proxy.json
@@ -0,0 +1,11 @@
+{
+  "chrome": [
+    "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
+    "fc6d8143-9452-4171-9459-7f515143419a",
+    "47543840-672a-467d-80df-8f7c3b9788c9"
+    ],
+    "multi_apps": [
+    "da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
+    "67890eb6-6ce5-4c00-9e3d-fb4972699b06"
+    ]
+}
\ No newline at end of file
diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py
index f3b8a65..09e9657 100644
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -38,8 +38,9 @@ OPERATOR_PROMPT = """Here are some helpful tips:
 (3) My computer's password is “password”, feel free to use it when you need sudo rights.
 (4) For the thunderbird account “anonym-x2024@outlook.com”, the password is “gTCI”;=@y7—QJ0nDa_kN3Sb¿”.
 (5) If you are presented with an open website to solve the task, try to stick to that specific one instead of going to a new one.
-(6) You have full authority to execute any action without my permission. I won't be watching so please don't ask for confirmation.
-(7) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”."""
+(6) If you deem the task is infeasible, you can terminate and explicitly state in the response that “the task is infeasible”.
+(7) THIS IS VERY, VERY, VERY IMPORTANT!!!!! You have full authority to execute any action without my permission. Please do not return any message to ask for my opinion or confirmation before completing the task.
+"""
 
 
 class Action:
@@ -679,7 +680,9 @@ class OpenAICUAAgent:
         state_correct = False
         # if action_exit and thought_exit:
         #     state_correct = True
-        if action_exit and not message_exit:   
+        #if action_exit and not message_exit:   
+        #    state_correct = True
+        if action_exit:
             state_correct = True
         if not state_correct:
             logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
diff --git a/run_operator.sh b/run_operator.sh
index b6a3750..f46d17f 100644
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,7 +2,7 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
---result_dir ./results_all_error \
---test_all_meta_path evaluation_examples/test_all_error.json \
+--result_dir ./results_proxy \
+--test_all_meta_path evaluation_examples/test_proxy.json \
 --region us-east-1 \
 --max_steps 150

From e61acece8444c717918e6733586c32d7be6c2f2b Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sat, 7 Jun 2025 05:30:40 +0000
Subject: [PATCH 6/7] problems from the community

---
 evaluation_examples/test_bug.json                | 8 --------
 evaluation_examples/test_bug_from_community.json | 5 +++++
 2 files changed, 5 insertions(+), 8 deletions(-)
 delete mode 100644 evaluation_examples/test_bug.json
 create mode 100644 evaluation_examples/test_bug_from_community.json

diff --git a/evaluation_examples/test_bug.json b/evaluation_examples/test_bug.json
deleted file mode 100644
index da94c93..0000000
--- a/evaluation_examples/test_bug.json
+++ /dev/null
@@ -1,8 +0,0 @@
-{
-  "chrome": [
-    "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc"
-  ],
-  "multi_apps": [
-    "d1acdb87-bb67-4f30-84aa-990e56a09c92"
-  ]
-}
\ No newline at end of file
diff --git a/evaluation_examples/test_bug_from_community.json b/evaluation_examples/test_bug_from_community.json
new file mode 100644
index 0000000..d6e5531
--- /dev/null
+++ b/evaluation_examples/test_bug_from_community.json
@@ -0,0 +1,5 @@
+{
+  "vlc": [
+    "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f"
+  ]
+}
\ No newline at end of file

From 4ade4114da95e7f4fffdee99b553695970696804 Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sat, 7 Jun 2025 06:50:15 +0000
Subject: [PATCH 7/7] run_operator.sh: retest test_small.json with 10 envs

---
 run_operator.sh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/run_operator.sh b/run_operator.sh
index f46d17f..24bc064 100644
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -2,7 +2,8 @@ python run_multienv_openaicua.py \
 --headless \
 --observation_type screenshot \
 --model computer-use-preview \
---result_dir ./results_proxy \
---test_all_meta_path evaluation_examples/test_proxy.json \
+--result_dir ./results_small_retest \
+--test_all_meta_path evaluation_examples/test_small.json \
 --region us-east-1 \
---max_steps 150
+--max_steps 150 \
+--num_envs 10