From d8872634eed183b8dbd713eca6ce9693f9329b7b Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Sun, 8 Jun 2025 03:59:31 +0000
Subject: [PATCH] Relax agent exit-state check, add --log_level, update run configs

---
 mm_agents/openai_cua_agent.py |  8 ++---
 monitor/.env                  |  4 +--
 run_multienv_openaicua.py     | 67 ++++++++++++++++++++---------------
 run_operator.sh               |  2 +-
 4 files changed, 45 insertions(+), 36 deletions(-)

diff --git a/mm_agents/openai_cua_agent.py b/mm_agents/openai_cua_agent.py
index ccd5f04..4e95cef 100644
--- a/mm_agents/openai_cua_agent.py
+++ b/mm_agents/openai_cua_agent.py
@@ -690,10 +690,10 @@ class OpenAICUAAgent:
         state_correct = False
         # if action_exit and thought_exit:
         #     state_correct = True
-        if action_exit and not message_exit:   
-           state_correct = True
-        # if action_exit:
-        #     state_correct = True
+        # if action_exit and not message_exit:   
+        #    state_correct = True
+        if action_exit:
+            state_correct = True
         if not state_correct:
             logger.warning("The state of the agent is not correct, action_exit: %s, thought_exit: %s, message_exit: %s", action_exit, thought_exit, message_exit)
             
diff --git a/monitor/.env b/monitor/.env
index 52ae50d..3984b1b 100644
--- a/monitor/.env
+++ b/monitor/.env
@@ -2,9 +2,9 @@
 # Do not write any secret keys or sensitive information here.
 
 # Monitor configuration
-TASK_CONFIG_PATH=../evaluation_examples/test_small.json
+TASK_CONFIG_PATH=../evaluation_examples/test_all.json
 EXAMPLES_BASE_PATH=../evaluation_examples/examples
-RESULTS_BASE_PATH=../results_small_endmethod_ifmessage
+RESULTS_BASE_PATH=../results_all_ifmessage_promptnochange
 ACTION_SPACE=pyautogui
 OBSERVATION_TYPE=screenshot
 MODEL_NAME=computer-use-preview
diff --git a/run_multienv_openaicua.py b/run_multienv_openaicua.py
index c7781d3..0b0d0ef 100644
--- a/run_multienv_openaicua.py
+++ b/run_multienv_openaicua.py
@@ -28,35 +28,6 @@ if os.path.exists(".env"):
     load_dotenv()
 
 #  Logger Configs {{{ #
-logger = logging.getLogger()
-logger.setLevel(logging.INFO)
-
-datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-
-file_handler = logging.FileHandler(
-    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
-)
-stdout_handler = logging.StreamHandler(sys.stdout)
-
-
-file_handler.setLevel(logging.INFO)
-stdout_handler.setLevel(logging.INFO)
-
-formatter = logging.Formatter(
-    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
-)
-file_handler.setFormatter(formatter)
-stdout_handler.setFormatter(formatter)
-
-stdout_handler.addFilter(logging.Filter("desktopenv"))
-
-logger.addHandler(file_handler)
-logger.addHandler(stdout_handler)
-#  }}} Logger Configs #
-
-logger = logging.getLogger("desktopenv.experiment")
-
-
 def config() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
         description="Run end-to-end evaluation on the benchmark"
@@ -103,6 +74,8 @@ def config() -> argparse.Namespace:
     # logging related
     parser.add_argument("--result_dir", type=str, default="./results")
     parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")  
+    parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 
+                       default='INFO', help="Set the logging level")
     # aws config
     parser.add_argument(
         "--region", type=str, default="us-east-1", help="AWS region for the VM"
@@ -110,6 +83,42 @@ def config() -> argparse.Namespace:
     args = parser.parse_args()
     return args
 
+# Parsed at import time so --log_level is available to the handler setup below.
+args = config()
+log_level = getattr(logging, args.log_level.upper())
+
+# The root logger stays at DEBUG and each handler filters by its own level;
+# with the root at log_level, debug-*.log would never receive DEBUG records.
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+
+# logging.FileHandler does not create missing parent directories.
+os.makedirs("logs", exist_ok=True)
+file_handler = logging.FileHandler(
+    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+stdout_handler = logging.StreamHandler(sys.stdout)
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(log_level)  # --log_level only controls console verbosity
+stdout_handler.addFilter(logging.Filter("desktopenv"))
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
+for _handler in (file_handler, debug_handler, stdout_handler):
+    _handler.setFormatter(formatter)
+    logger.addHandler(_handler)
+#  }}} Logger Configs #
+
+logger = logging.getLogger("desktopenv.experiment")
+
 
 def distribute_tasks(test_all_meta: dict, num_envs: int) -> List[Dict]:
     """Distribute tasks evenly across environments."""
diff --git a/run_operator.sh b/run_operator.sh
index d45986e..b054a78 100644
--- a/run_operator.sh
+++ b/run_operator.sh
@@ -6,4 +6,4 @@ python run_multienv_openaicua.py \
 --test_all_meta_path evaluation_examples/test_all.json \
 --region us-east-1 \
 --max_steps 150 \
---num_envs 10
+--num_envs 25