From 84f407afdd7abb3dc6f4a7d0e230fbca84609cba Mon Sep 17 00:00:00 2001
From: yuanmengqi <yuanmengqi@mail.ustc.edu.cn>
Date: Thu, 31 Jul 2025 05:47:58 +0000
Subject: [PATCH] feat: enhance run_coact.py with logging and configuration
 options

- Added logging configuration that writes runtime logs to both log files and the console, with an adjustable log level (--log_level).
- Introduced new command-line arguments for provider name, region, and client password to improve flexibility and security.
- Updated process_task function to accommodate new parameters, ensuring compatibility with existing logic.
- Modified prompt templates in coding_agent.py and cua_agent.py to use the client password placeholder for enhanced security.
---
 mm_agents/coact/coding_agent.py |  2 +-
 mm_agents/coact/cua_agent.py    |  7 +--
 run_coact.py                    | 88 ++++++++++++++++++++++++++-------
 3 files changed, 75 insertions(+), 22 deletions(-)

diff --git a/mm_agents/coact/coding_agent.py b/mm_agents/coact/coding_agent.py
index cfd8507..f180bf4 100644
--- a/mm_agents/coact/coding_agent.py
+++ b/mm_agents/coact/coding_agent.py
@@ -16,7 +16,7 @@ You can write code in ```bash...``` code blocks for bash scripts, and ```python.
 - When you write code, you must identify the language (whether it is python or bash) of the code.
 - Your linux username is "user".
 - Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
-- If you want to use sudo, follow the format: "echo password | sudo -S [YOUR COMMANDS]" (no quotes for the word "password").
+- If you want to use sudo, follow the format: "echo {CLIENT_PASSWORD} | sudo -S [YOUR COMMANDS]" (no quotes for the word "{CLIENT_PASSWORD}").
 - Ignore the error: "sudo: /etc/sudoers.d is world writable".
 - Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code.
 - When import a package, you need to check if the package is installed. If not, you need to install it yourself.
diff --git a/mm_agents/coact/cua_agent.py b/mm_agents/coact/cua_agent.py
index d90e9d4..c0c6522 100644
--- a/mm_agents/coact/cua_agent.py
+++ b/mm_agents/coact/cua_agent.py
@@ -18,8 +18,8 @@ PROMPT_TEMPLATE = """# Task
 {instruction}
 
 # Hints
-- Sudo password is "password".
-- If you meet "Authentication required" dialog, enter the "password" to continue.
+- Sudo password is "{CLIENT_PASSWORD}".
+- If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue.
 - Do not close the any application or window or tab that is already opened.
 - Do not close the window at the end of the task.
 - If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'.
@@ -154,6 +154,7 @@ def run_cua(
     screen_height: int = 1080,
     sleep_after_execution: float = 0.3,
     truncate_history_inputs: int = 100,
+    client_password: str = "",
 ) -> Tuple[str, float]:
     client = OpenAI()
 
@@ -166,7 +167,7 @@ def run_cua(
     history_inputs = [{
         "role": "user",
         "content": [
-            {"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction)},
+            {"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction, CLIENT_PASSWORD=client_password)},
             {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
         ],
     }]
diff --git a/run_coact.py b/run_coact.py
index 3a6f21d..79b35cd 100644
--- a/run_coact.py
+++ b/run_coact.py
@@ -1,6 +1,7 @@
 import argparse
 import base64
 import glob
+import datetime
 import shutil
 import traceback
 from typing import Dict, List
@@ -11,9 +12,7 @@ from mm_agents.coact.autogen import LLMConfig
 import logging
 from multiprocessing import Pool, cpu_count
 from functools import partial
-
-
-logger = logging.getLogger("desktopenv")
+import sys
 
 
 TASK_DESCRIPTION = """# Your role
@@ -50,10 +49,13 @@ def config() -> argparse.Namespace:
     )
 
     # environment config
-    parser.add_argument("--path_to_vm", type=str, default=os.environ["VMS_DIR"] + "/Ubuntu.qcow2")
+    parser.add_argument("--path_to_vm", type=str, default=None)
+    parser.add_argument("--provider_name", type=str, default="docker")
     parser.add_argument("--screen_width", type=int, default=1920)
     parser.add_argument("--screen_height", type=int, default=1080)
     parser.add_argument("--sleep_after_execution", type=float, default=0.5)
+    parser.add_argument("--region", type=str, default="us-east-1")
+    parser.add_argument("--client_password", type=str, default="")
 
     # agent config
     parser.add_argument("--oai_config_path", type=str, default="OAI_CONFIG_LIST")
@@ -77,24 +79,67 @@ def config() -> argparse.Namespace:
     # logging related
     parser.add_argument("--result_dir", type=str, default="./results")
     parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
-    
+    parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], 
+                       default='INFO', help="Set the logging level")
+
     args = parser.parse_args()
     return args
 
+args = config()
+# Root logger: all three handlers below are attached to it.
+logger = logging.getLogger()
+# NOTE(review): the root level gates every handler, so debug-*.log only receives DEBUG records when --log_level DEBUG is passed.
+log_level = getattr(logging, args.log_level.upper())
+logger.setLevel(log_level)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+os.makedirs("logs", exist_ok=True)  # FileHandler does not create missing parent directories
+file_handler = logging.FileHandler(
+    os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
+)
+stdout_handler = logging.StreamHandler(sys.stdout)
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(log_level)
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+# Console output is restricted to the "desktopenv" logger hierarchy; the log files receive everything.
+stdout_handler.addFilter(logging.Filter("desktopenv"))
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+
+# Module logger; the "desktopenv." prefix lets its records pass the console filter above.
+logger = logging.getLogger("desktopenv.experiment")
+
 
 def process_task(task_info, 
-                 path_to_vm,
-                 orchestrator_model="o3",
-                 coding_model='o4-mini',
-                 save_dir='results',
-                 orchestrator_max_steps=15,
-                 cua_max_steps=25,
-                 coding_max_steps=20,
-                 cut_off_steps=150,
-                 screen_width=1920,
-                 screen_height=1080,
-                 sleep_after_execution=0.5,
-                 config_path="OAI_CONFIG_LIST"):
+                provider_name,
+                path_to_vm,
+                orchestrator_model="o3",
+                coding_model='o4-mini',
+                save_dir='results',
+                orchestrator_max_steps=15,
+                cua_max_steps=25,
+                coding_max_steps=20,
+                cut_off_steps=150,
+                screen_width=1920,
+                screen_height=1080,
+                sleep_after_execution=0.5,
+                config_path="OAI_CONFIG_LIST",
+                region="us-east-1",
+                client_password="",
+                ):
     """Worker function to process a single task"""
     domain, ex_id, cfg = task_info
     
@@ -118,6 +163,7 @@ def process_task(task_info,
                     name="orchestrator_proxy",
                     is_termination_msg=lambda x: x.get("content", "") and ("terminate" in x.get("content", "")[0]["text"].lower() or "infeasible" in x.get("content", "")[0]["text"].lower()),
                     human_input_mode="NEVER",
+                    provider_name=provider_name,
                     path_to_vm=path_to_vm,
                     screen_width=screen_width,
                     screen_height=screen_height,
@@ -128,6 +174,8 @@ def process_task(task_info,
                     truncate_history_inputs=cua_max_steps + 1,
                     cua_max_steps=cua_max_steps,
                     coding_max_steps=coding_max_steps,
+                    region=region,
+                    client_password=client_password
                 )
 
             obs = orchestrator_proxy.reset(task_config=task_config)
@@ -237,6 +285,7 @@ if __name__ == "__main__":
 
         # Create a partial function with fixed config_path, model and debug
         process_func = partial(process_task, 
+                               provider_name=args.provider_name,
                                path_to_vm=args.path_to_vm,
                                save_dir=args.result_dir,
                                coding_model=args.coding_model,
@@ -248,7 +297,10 @@ if __name__ == "__main__":
                                cut_off_steps=args.cut_off_steps,
                                screen_width=args.screen_width,
                                screen_height=args.screen_height,
-                               sleep_after_execution=args.sleep_after_execution)
+                               sleep_after_execution=args.sleep_after_execution,
+                               region=args.region,
+                               client_password=args.client_password
+                               )
 
         # Process tasks in parallel
         with Pool(processes=num_workers) as pool: