From 84f407afdd7abb3dc6f4a7d0e230fbca84609cba Mon Sep 17 00:00:00 2001 From: yuanmengqi Date: Thu, 31 Jul 2025 05:47:58 +0000 Subject: [PATCH] feat: enhance run_coact.py with logging and configuration options - Added logging configuration to capture runtime logs in both file and console with adjustable log levels. - Introduced new command-line arguments for provider name, region, and client password to improve flexibility and security. - Updated process_task function to accommodate new parameters, ensuring compatibility with existing logic. - Modified prompt templates in coding_agent.py and cua_agent.py to use the client password placeholder for enhanced security. --- mm_agents/coact/coding_agent.py | 2 +- mm_agents/coact/cua_agent.py | 7 +-- run_coact.py | 88 ++++++++++++++++++++++++++------- 3 files changed, 75 insertions(+), 22 deletions(-) diff --git a/mm_agents/coact/coding_agent.py b/mm_agents/coact/coding_agent.py index cfd8507..f180bf4 100644 --- a/mm_agents/coact/coding_agent.py +++ b/mm_agents/coact/coding_agent.py @@ -16,7 +16,7 @@ You can write code in ```bash...``` code blocks for bash scripts, and ```python. - When you write code, you must identify the language (whether it is python or bash) of the code. - Your linux username is "user". - Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you. -- If you want to use sudo, follow the format: "echo password | sudo -S [YOUR COMMANDS]" (no quotes for the word "password"). +- If you want to use sudo, follow the format: "echo {CLIENT_PASSWORD} | sudo -S [YOUR COMMANDS]" (no quotes for the word "{CLIENT_PASSWORD}"). - Ignore the error: "sudo: /etc/sudoers.d is world writable". - Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code. - When import a package, you need to check if the package is installed. If not, you need to install it yourself. diff --git a/mm_agents/coact/cua_agent.py b/mm_agents/coact/cua_agent.py index d90e9d4..c0c6522 100644 --- a/mm_agents/coact/cua_agent.py +++ b/mm_agents/coact/cua_agent.py @@ -18,8 +18,8 @@ PROMPT_TEMPLATE = """# Task {instruction} # Hints -- Sudo password is "password". -- If you meet "Authentication required" dialog, enter the "password" to continue. +- Sudo password is "{CLIENT_PASSWORD}". +- If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue. - Do not close the any application or window or tab that is already opened. - Do not close the window at the end of the task. - If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'. @@ -154,6 +154,7 @@ def run_cua( screen_height: int = 1080, sleep_after_execution: float = 0.3, truncate_history_inputs: int = 100, + client_password: str = "", ) -> Tuple[str, float]: client = OpenAI() @@ -166,7 +167,7 @@ def run_cua( history_inputs = [{ "role": "user", "content": [ - {"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction)}, + {"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction, CLIENT_PASSWORD=client_password)}, {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"}, ], }] diff --git a/run_coact.py b/run_coact.py index 3a6f21d..79b35cd 100644 --- a/run_coact.py +++ b/run_coact.py @@ -1,6 +1,7 @@ import argparse import base64 import glob +import datetime import shutil import traceback from typing import Dict, List @@ -11,9 +12,7 @@ from mm_agents.coact.autogen import LLMConfig import logging from multiprocessing import Pool, cpu_count from functools import partial - - -logger = logging.getLogger("desktopenv") +import sys TASK_DESCRIPTION = """# Your role @@ -50,10 +49,13 @@ def config() -> argparse.Namespace: ) # environment config - parser.add_argument("--path_to_vm", type=str, default=os.environ["VMS_DIR"] + "/Ubuntu.qcow2") + parser.add_argument("--path_to_vm", type=str, default=None) + parser.add_argument("--provider_name", type=str, default="docker") parser.add_argument("--screen_width", type=int, default=1920) parser.add_argument("--screen_height", type=int, default=1080) parser.add_argument("--sleep_after_execution", type=float, default=0.5) + parser.add_argument("--region", type=str, default="us-east-1") + parser.add_argument("--client_password", type=str, default="") # agent config parser.add_argument("--oai_config_path", type=str, default="OAI_CONFIG_LIST") @@ -77,24 +79,67 @@ def config() -> argparse.Namespace: # logging related parser.add_argument("--result_dir", type=str, default="./results") parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel") - + parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], + default='INFO', help="Set the logging level") + args = parser.parse_args() return args +args = config() + +logger = logging.getLogger() + +log_level = getattr(logging, args.log_level.upper()) +logger.setLevel(log_level) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler( + os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8" +) +debug_handler = logging.FileHandler( + os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8" +) +stdout_handler = logging.StreamHandler(sys.stdout) + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(log_level) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s" +) +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.expeiment") + def process_task(task_info, - path_to_vm, - orchestrator_model="o3", - coding_model='o4-mini', - save_dir='results', - orchestrator_max_steps=15, - cua_max_steps=25, - coding_max_steps=20, - cut_off_steps=150, - screen_width=1920, - screen_height=1080, - sleep_after_execution=0.5, - config_path="OAI_CONFIG_LIST"): + provider_name, + path_to_vm, + orchestrator_model="o3", + coding_model='o4-mini', + save_dir='results', + orchestrator_max_steps=15, + cua_max_steps=25, + coding_max_steps=20, + cut_off_steps=150, + screen_width=1920, + screen_height=1080, + sleep_after_execution=0.5, + config_path="OAI_CONFIG_LIST", + region="us-east-1", + client_password="", + ): """Worker function to process a single task""" domain, ex_id, cfg = task_info @@ -118,6 +163,7 @@ def process_task(task_info, name="orchestrator_proxy", is_termination_msg=lambda x: x.get("content", "") and ("terminate" in x.get("content", "")[0]["text"].lower() or "infeasible" in x.get("content", "")[0]["text"].lower()), human_input_mode="NEVER", + provider_name=provider_name, path_to_vm=path_to_vm, screen_width=screen_width, screen_height=screen_height, @@ -128,6 +174,8 @@ def process_task(task_info, truncate_history_inputs=cua_max_steps + 1, cua_max_steps=cua_max_steps, coding_max_steps=coding_max_steps, + region=region, + client_password=client_password ) obs = orchestrator_proxy.reset(task_config=task_config) @@ -237,6 +285,7 @@ if __name__ == "__main__": # Create a partial function with fixed config_path, model and debug process_func = partial(process_task, + provider_name=args.provider_name, path_to_vm=args.path_to_vm, save_dir=args.result_dir, coding_model=args.coding_model, @@ -248,7 +297,10 @@ if __name__ == "__main__": cut_off_steps=args.cut_off_steps, screen_width=args.screen_width, screen_height=args.screen_height, - sleep_after_execution=args.sleep_after_execution) + sleep_after_execution=args.sleep_after_execution, + region=args.region, + client_password=args.client_password + ) # Process tasks in parallel with Pool(processes=num_workers) as pool: