feat: enhance run_coact.py with logging and configuration options

- Added logging configuration that writes runtime logs to both log files and the console, with an adjustable log level (`--log_level`).
- Introduced new command-line arguments for provider name, region, and client password to improve flexibility and security.
- Updated process_task function to accommodate new parameters, ensuring compatibility with existing logic.
- Modified the prompt templates in coding_agent.py and cua_agent.py to use a `{CLIENT_PASSWORD}` placeholder instead of a hard-coded password, for enhanced security.
This commit is contained in:
yuanmengqi
2025-07-31 05:47:58 +00:00
parent a5b51e8010
commit 84f407afdd
3 changed files with 75 additions and 22 deletions

View File

@@ -16,7 +16,7 @@ You can write code in ```bash...``` code blocks for bash scripts, and ```python.
- When you write code, you must identify the language (whether it is python or bash) of the code. - When you write code, you must identify the language (whether it is python or bash) of the code.
- Your linux username is "user". - Your linux username is "user".
- Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you. - Wrap all your code in ONE code block. DO NOT let user save the code as a file and execute it for you.
- If you want to use sudo, follow the format: "echo password | sudo -S [YOUR COMMANDS]" (no quotes for the word "password"). - If you want to use sudo, follow the format: "echo {CLIENT_PASSWORD} | sudo -S [YOUR COMMANDS]" (no quotes for the word "{CLIENT_PASSWORD}").
- Ignore the error: "sudo: /etc/sudoers.d is world writable". - Ignore the error: "sudo: /etc/sudoers.d is world writable".
- Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code. - Your python code will be sent line-by-line into a interactive python terminal. Do not include __main__ in your code.
- When import a package, you need to check if the package is installed. If not, you need to install it yourself. - When import a package, you need to check if the package is installed. If not, you need to install it yourself.

View File

@@ -18,8 +18,8 @@ PROMPT_TEMPLATE = """# Task
{instruction} {instruction}
# Hints # Hints
- Sudo password is "password". - Sudo password is "{CLIENT_PASSWORD}".
- If you meet "Authentication required" dialog, enter the "password" to continue. - If you meet "Authentication required" dialog, enter the "{CLIENT_PASSWORD}" to continue.
- Do not close the any application or window or tab that is already opened. - Do not close the any application or window or tab that is already opened.
- Do not close the window at the end of the task. - Do not close the window at the end of the task.
- If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'. - If you have completed the user task, reply with the information you want the user to know along with 'TERMINATE'.
@@ -154,6 +154,7 @@ def run_cua(
screen_height: int = 1080, screen_height: int = 1080,
sleep_after_execution: float = 0.3, sleep_after_execution: float = 0.3,
truncate_history_inputs: int = 100, truncate_history_inputs: int = 100,
client_password: str = "",
) -> Tuple[str, float]: ) -> Tuple[str, float]:
client = OpenAI() client = OpenAI()
@@ -166,7 +167,7 @@ def run_cua(
history_inputs = [{ history_inputs = [{
"role": "user", "role": "user",
"content": [ "content": [
{"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction)}, {"type": "input_text", "text": PROMPT_TEMPLATE.format(instruction=instruction, CLIENT_PASSWORD=client_password)},
{"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"}, {"type": "input_image", "image_url": f"data:image/png;base64,{screenshot_b64}"},
], ],
}] }]

View File

@@ -1,6 +1,7 @@
import argparse import argparse
import base64 import base64
import glob import glob
import datetime
import shutil import shutil
import traceback import traceback
from typing import Dict, List from typing import Dict, List
@@ -11,9 +12,7 @@ from mm_agents.coact.autogen import LLMConfig
import logging import logging
from multiprocessing import Pool, cpu_count from multiprocessing import Pool, cpu_count
from functools import partial from functools import partial
import sys
logger = logging.getLogger("desktopenv")
TASK_DESCRIPTION = """# Your role TASK_DESCRIPTION = """# Your role
@@ -50,10 +49,13 @@ def config() -> argparse.Namespace:
) )
# environment config # environment config
parser.add_argument("--path_to_vm", type=str, default=os.environ["VMS_DIR"] + "/Ubuntu.qcow2") parser.add_argument("--path_to_vm", type=str, default=None)
parser.add_argument("--provider_name", type=str, default="docker")
parser.add_argument("--screen_width", type=int, default=1920) parser.add_argument("--screen_width", type=int, default=1920)
parser.add_argument("--screen_height", type=int, default=1080) parser.add_argument("--screen_height", type=int, default=1080)
parser.add_argument("--sleep_after_execution", type=float, default=0.5) parser.add_argument("--sleep_after_execution", type=float, default=0.5)
parser.add_argument("--region", type=str, default="us-east-1")
parser.add_argument("--client_password", type=str, default="")
# agent config # agent config
parser.add_argument("--oai_config_path", type=str, default="OAI_CONFIG_LIST") parser.add_argument("--oai_config_path", type=str, default="OAI_CONFIG_LIST")
@@ -77,24 +79,67 @@ def config() -> argparse.Namespace:
# logging related # logging related
parser.add_argument("--result_dir", type=str, default="./results") parser.add_argument("--result_dir", type=str, default="./results")
parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel") parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
default='INFO', help="Set the logging level")
args = parser.parse_args() args = parser.parse_args()
return args return args
args = config()
logger = logging.getLogger()
log_level = getattr(logging, args.log_level.upper())
logger.setLevel(log_level)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(
os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
)
debug_handler = logging.FileHandler(
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
)
stdout_handler = logging.StreamHandler(sys.stdout)
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(log_level)
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
)
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
# }}} Logger Configs #
logger = logging.getLogger("desktopenv.expeiment")
def process_task(task_info, def process_task(task_info,
path_to_vm, provider_name,
orchestrator_model="o3", path_to_vm,
coding_model='o4-mini', orchestrator_model="o3",
save_dir='results', coding_model='o4-mini',
orchestrator_max_steps=15, save_dir='results',
cua_max_steps=25, orchestrator_max_steps=15,
coding_max_steps=20, cua_max_steps=25,
cut_off_steps=150, coding_max_steps=20,
screen_width=1920, cut_off_steps=150,
screen_height=1080, screen_width=1920,
sleep_after_execution=0.5, screen_height=1080,
config_path="OAI_CONFIG_LIST"): sleep_after_execution=0.5,
config_path="OAI_CONFIG_LIST",
region="us-east-1",
client_password="",
):
"""Worker function to process a single task""" """Worker function to process a single task"""
domain, ex_id, cfg = task_info domain, ex_id, cfg = task_info
@@ -118,6 +163,7 @@ def process_task(task_info,
name="orchestrator_proxy", name="orchestrator_proxy",
is_termination_msg=lambda x: x.get("content", "") and ("terminate" in x.get("content", "")[0]["text"].lower() or "infeasible" in x.get("content", "")[0]["text"].lower()), is_termination_msg=lambda x: x.get("content", "") and ("terminate" in x.get("content", "")[0]["text"].lower() or "infeasible" in x.get("content", "")[0]["text"].lower()),
human_input_mode="NEVER", human_input_mode="NEVER",
provider_name=provider_name,
path_to_vm=path_to_vm, path_to_vm=path_to_vm,
screen_width=screen_width, screen_width=screen_width,
screen_height=screen_height, screen_height=screen_height,
@@ -128,6 +174,8 @@ def process_task(task_info,
truncate_history_inputs=cua_max_steps + 1, truncate_history_inputs=cua_max_steps + 1,
cua_max_steps=cua_max_steps, cua_max_steps=cua_max_steps,
coding_max_steps=coding_max_steps, coding_max_steps=coding_max_steps,
region=region,
client_password=client_password
) )
obs = orchestrator_proxy.reset(task_config=task_config) obs = orchestrator_proxy.reset(task_config=task_config)
@@ -237,6 +285,7 @@ if __name__ == "__main__":
# Create a partial function with fixed config_path, model and debug # Create a partial function with fixed config_path, model and debug
process_func = partial(process_task, process_func = partial(process_task,
provider_name=args.provider_name,
path_to_vm=args.path_to_vm, path_to_vm=args.path_to_vm,
save_dir=args.result_dir, save_dir=args.result_dir,
coding_model=args.coding_model, coding_model=args.coding_model,
@@ -248,7 +297,10 @@ if __name__ == "__main__":
cut_off_steps=args.cut_off_steps, cut_off_steps=args.cut_off_steps,
screen_width=args.screen_width, screen_width=args.screen_width,
screen_height=args.screen_height, screen_height=args.screen_height,
sleep_after_execution=args.sleep_after_execution) sleep_after_execution=args.sleep_after_execution,
region=args.region,
client_password=args.client_password
)
# Process tasks in parallel # Process tasks in parallel
with Pool(processes=num_workers) as pool: with Pool(processes=num_workers) as pool: