Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run\_maestro.py** and **osworld\_run\_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
This commit is contained in:
Hiroid
2025-09-08 15:07:21 +08:00
committed by GitHub
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions

View File

@@ -0,0 +1,549 @@
import argparse
import json
import datetime
import io
import logging
import os
import platform
import sys
import time
from tqdm import tqdm
from pathlib import Path
from dotenv import load_dotenv
from gui_agents.maestro.controller.main_controller import MainController
# Import analyze_display functionality
from gui_agents.utils.analyze_display import analyze_display_json, format_output_line
from desktop_env.desktop_env import DesktopEnv
from gui_agents.utils.common_utils import ImageDataFilter, SafeLoggingFilter
env_path = Path(os.path.dirname(os.path.abspath(__file__))) / '.env'
if env_path.exists():
load_dotenv(dotenv_path=env_path)
else:
parent_env_path = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) / '.env'
if parent_env_path.exists():
load_dotenv(dotenv_path=parent_env_path)
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
vm_datetime_str: str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
log_dir = "runtime"
vm_log_dir = os.path.join(log_dir, f"vmrun_{vm_datetime_str}")
os.makedirs(vm_log_dir, exist_ok=True)
file_handler = logging.FileHandler(
os.path.join(vm_log_dir, "vmrun_normal.log"), encoding="utf-8"
)
debug_handler = logging.FileHandler(
os.path.join(vm_log_dir, "vmrun_debug.log"), encoding="utf-8"
)
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(
os.path.join(vm_log_dir, "vmrun_sdebug.log"), encoding="utf-8"
)
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
# Add SafeLoggingFilter to prevent format errors from third-party libraries (like OpenAI)
safe_filter = SafeLoggingFilter()
debug_handler.addFilter(safe_filter)
sdebug_handler.addFilter(safe_filter)
file_handler.addFilter(safe_filter)
stdout_handler.addFilter(safe_filter)
# Also apply SafeLoggingFilter to OpenAI library loggers
try:
import openai
openai_logger = logging.getLogger('openai')
openai_logger.addFilter(safe_filter)
httpx_logger = logging.getLogger('httpx')
httpx_logger.addFilter(safe_filter)
except ImportError:
pass
if os.getenv('KEEP_IMAGE_LOGS', 'false').lower() != 'true':
image_filter = ImageDataFilter()
debug_handler.addFilter(image_filter)
sdebug_handler.addFilter(image_filter)
logger.info("Image data filtering enabled - image data in debug logs will be filtered")
else:
logger.info("Image data filtering disabled - debug logs will contain complete image data")
logger.info("Safe logging filter enabled - prevents format errors from third-party libraries (OpenAI, HTTPX)")
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
)
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
logger = logging.getLogger("desktopenv.experiment")
def config() -> argparse.Namespace:
parser = argparse.ArgumentParser(
description="Run end-to-end evaluation on the benchmark"
)
current_platform = os.getenv("USE_PRECREATE_VM", "Windows")
if current_platform == "Ubuntu":
path_to_vm = os.path.join("vmware_vm_data", "Ubuntu0", "Ubuntu0.vmx")
test_config_base_dir = os.path.join("evaluation_examples", "examples")
test_all_meta_path = os.path.join("evaluation_examples", "test_tiny.json")
elif current_platform == "Windows":
path_to_vm = os.path.join("vmware_vm_data", "Windows0", "Windows0.vmx")
test_config_base_dir = os.path.join("evaluation_examples", "examples_windows")
test_all_meta_path = os.path.join("evaluation_examples", "test_tiny_windows.json")
else:
raise ValueError(f"USE_PRECREATE_VM={current_platform} is not supported. Please use Ubuntu or Windows.")
# platform config
parser.add_argument(
"--current_platform",
type=str,
choices=["Ubuntu", "Windows"],
default=current_platform,
help="Platform to run on (Ubuntu or Windows)"
)
# environment config
# vm_path will be set based on platform
parser.add_argument("--path_to_vm", type=str, default=path_to_vm)
parser.add_argument(
"--headless", action="store_true", help="Run in headless machine"
)
parser.add_argument(
"--action_space", type=str, default="pyautogui", help="Action type"
)
parser.add_argument(
"--observation_type",
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
default="screenshot",
help="Observation type",
)
parser.add_argument("--max_steps", type=int, default=50)
# agent config
parser.add_argument(
"--test_config_base_dir", type=str, default=test_config_base_dir
)
# example config
parser.add_argument("--domain", type=str, default="all")
parser.add_argument(
"--test_all_meta_path", type=str, default=test_all_meta_path
)
# logging related
parser.add_argument("--result_dir", type=str, default="./results")
args = parser.parse_args()
return args
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
scores = []
# log args
logger.info("Args: %s", args)
cfg_args = {
"path_to_vm": args.path_to_vm,
"headless": args.headless,
"action_space": args.action_space,
"observation_type": args.observation_type,
"max_steps": args.max_steps,
"result_dir": args.result_dir,
}
env = DesktopEnv(
provider_name="vmware",
path_to_vm=args.path_to_vm,
action_space=args.action_space,
headless=args.headless,
require_a11y_tree=False,
)
for domain in tqdm(test_all_meta, desc="Domain"):
domain_sanitized = str(domain).strip()
for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
example_id_sanitized = str(example_id).strip()
config_file = os.path.join(
args.test_config_base_dir,
domain_sanitized,
f"{example_id_sanitized}.json"
)
if not os.path.exists(config_file):
try:
candidate_dir = os.path.join(args.test_config_base_dir, domain_sanitized)
existing_files = []
if os.path.isdir(candidate_dir):
existing_files = os.listdir(candidate_dir)
logger.error(f"Config file not found: {config_file}")
logger.error(f"Existing files in {candidate_dir}: {existing_files}")
except Exception as e:
logger.error(f"Error while listing directory for debug: {e}")
raise FileNotFoundError(config_file)
with open(config_file, "r", encoding="utf-8") as f:
example = json.load(f)
logger.info(f"[Domain]: {domain_sanitized}")
logger.info(f"[Example ID]: {example_id_sanitized}")
user_query = example["instruction"]
logger.info(f"[User Query]: {user_query}")
# wandb each example config settings
cfg_args["user_query"] = user_query
cfg_args["start_time"] = datetime.datetime.now().strftime(
"%Y:%m:%d-%H:%M:%S"
)
# Create a separate timestamp folder for each example
example_datetime_str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
example_result_dir = os.path.join(
args.result_dir,
args.action_space,
args.observation_type,
domain,
example_id,
)
os.makedirs(example_result_dir, exist_ok=True)
# example start running
try:
run_single_example(
env,
example,
user_query,
args,
example_result_dir,
scores,
vm_log_dir, # Pass the timestamp directory to run_single_example
example_datetime_str
)
except Exception as e:
logger.error(f"Exception in {domain}/{example_id}: {e}")
env.controller.end_recording(
os.path.join(example_result_dir, "recording.mp4")
)
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(
json.dumps(
{"Error": f"Time limit exceeded in {domain}/{example_id}"}
)
)
f.write("\n")
env.close()
if scores:
logger.info(f"Average score: {sum(scores) / len(scores)}")
else:
logger.info("No scores recorded - no examples were completed")
def run_single_example(
env: DesktopEnv,
example,
user_query: str,
args,
example_result_dir,
scores,
vm_log_dir: str,
example_datetime_str: str
):
# Set up a separate logger for each example
example_timestamp_dir = os.path.join(vm_log_dir, example_datetime_str)
total_start_time = time.time()
cache_dir = os.path.join(example_timestamp_dir, "cache", "screens")
state_dir = os.path.join(example_timestamp_dir, "state")
os.makedirs(cache_dir, exist_ok=True)
os.makedirs(state_dir, exist_ok=True)
example_logger = setup_example_logger(example, example_timestamp_dir)
example_logger.info(f"Starting example {example.get('id', 'unknown')}")
example_logger.info(f"User Query: {user_query}")
env.reset(task_config=example)
controller = MainController(
platform=args.current_platform,
backend="pyautogui_vmware",
user_query=user_query,
max_steps=args.max_steps,
env=env,
log_dir=vm_log_dir,
datetime_str=example_datetime_str
)
env.controller.start_recording()
try:
# Set the user query in the controller
controller.execute_main_loop()
# Check task status after execution to determine if task was successful
task = controller.global_state.get_task()
if task and task.status == "fulfilled":
# Task completed successfully
logger.info("Task completed successfully")
env.step("DONE")
elif task and task.status == "rejected":
# Task was rejected/failed
logger.info("Task was rejected/failed")
env.step("FAIL")
else:
# Task status unknown or incomplete
logger.info("Task execution completed with unknown status")
env.step("DONE")
except Exception as e:
logger.error(f"Error during maestro execution: {e}")
raise
finally:
total_end_time = time.time()
total_duration = total_end_time - total_start_time
logger.info(f"Total execution time: {total_duration:.2f} seconds")
# Auto-analyze execution statistics after task completion
auto_analyze_execution(example_timestamp_dir)
result = env.evaluate()
logger.info("Result: %.2f", result)
example_logger.info("Result: %.2f", result)
example_logger.info(f"Example {example.get('id', 'unknown')} completed with result: {result}")
scores.append(result)
with open(
os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8"
) as f:
f.write(f"{result}\n")
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
def auto_analyze_execution(timestamp_dir: str):
"""
Automatically analyze execution statistics from display.json files after task completion
Args:
timestamp_dir: Directory containing the execution logs and display.json
"""
import time
try:
# Analyze the display.json file for this execution
display_json_path = os.path.join(timestamp_dir, "display.json")
# Wait for file to be fully written
max_wait_time = 10 # Maximum wait time in seconds
wait_interval = 0.5 # Check every 0.5 seconds
waited_time = 0
while waited_time < max_wait_time:
if os.path.exists(display_json_path):
# Check if file is still being written by monitoring its size
try:
size1 = os.path.getsize(display_json_path)
time.sleep(wait_interval)
size2 = os.path.getsize(display_json_path)
# If file size hasn't changed in the last 0.5 seconds, it's likely complete
if size1 == size2:
logger.info(f"Display.json file appears to be complete (size: {size1} bytes)")
break
else:
logger.info(f"Display.json file still being written (size changed from {size1} to {size2} bytes)")
waited_time += wait_interval
continue
except OSError:
# File might be temporarily inaccessible
time.sleep(wait_interval)
waited_time += wait_interval
continue
else:
logger.info(f"Waiting for display.json file to be created... ({waited_time:.1f}s)")
time.sleep(wait_interval)
waited_time += wait_interval
if os.path.exists(display_json_path):
logger.info(f"Auto-analyzing execution statistics from: {display_json_path}")
# Analyze the single display.json file
result = analyze_display_json(display_json_path)
if result:
# Format and log the statistics
output_line = format_output_line(result)
logger.info("=" * 80)
logger.info("EXECUTION STATISTICS:")
logger.info("Steps, Duration (seconds), (Input Tokens, Output Tokens, Total Tokens), Cost")
logger.info("=" * 80)
logger.info(output_line)
logger.info("=" * 80)
else:
logger.warning("No valid data found in display.json for analysis")
else:
logger.warning(f"Display.json file not found at: {display_json_path} after waiting {max_wait_time} seconds")
except Exception as e:
logger.error(f"Error during auto-analysis: {e}")
def setup_example_logger(example, example_timestamp_dir):
example_id = example.get('id', 'unknown')
example_logger = logging.getLogger(f"example.{example_id}.{datetime.datetime.now().strftime('%Y%m%d_%H%M%S')}")
example_logger.setLevel(logging.DEBUG)
example_logger.handlers.clear()
log_file = os.path.join(example_timestamp_dir, "example.log")
file_handler = logging.FileHandler(log_file, encoding="utf-8")
file_handler.setLevel(logging.DEBUG)
debug_log_file = os.path.join(example_timestamp_dir, "example_debug.log")
debug_handler = logging.FileHandler(debug_log_file, encoding="utf-8")
debug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
)
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
example_logger.addHandler(file_handler)
example_logger.addHandler(debug_handler)
return example_logger
def get_unfinished(
action_space, observation_type, result_dir, total_file_json
):
target_dir = os.path.join(result_dir, action_space, observation_type)
if not os.path.exists(target_dir):
return total_file_json
finished = {}
for domain in os.listdir(target_dir):
finished[domain] = []
domain_path = os.path.join(target_dir, domain)
if os.path.isdir(domain_path):
for example_id in os.listdir(domain_path):
if example_id == "onboard":
continue
example_path = os.path.join(domain_path, example_id)
if os.path.isdir(example_path):
if "result.txt" not in os.listdir(example_path):
# empty all files under example_id
for file in os.listdir(example_path):
os.remove(os.path.join(example_path, file))
else:
finished[domain].append(example_id)
if not finished:
return total_file_json
for domain, examples in finished.items():
if domain in total_file_json:
total_file_json[domain] = [
x for x in total_file_json[domain] if x not in examples
]
return total_file_json
def get_result(action_space, observation_type, result_dir, total_file_json):
target_dir = os.path.join(result_dir, action_space, observation_type)
if not os.path.exists(target_dir):
print("New experiment, no result yet.")
return None
all_result = []
for domain in os.listdir(target_dir):
domain_path = os.path.join(target_dir, domain)
if os.path.isdir(domain_path):
for example_id in os.listdir(domain_path):
example_path = os.path.join(domain_path, example_id)
if os.path.isdir(example_path):
if "result.txt" in os.listdir(example_path):
# empty all files under example_id
try:
all_result.append(
float(
open(
os.path.join(example_path, "result.txt"), "r"
).read()
)
)
except:
all_result.append(0.0)
if not all_result:
print("New experiment, no result yet.")
return None
else:
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
return all_result
if __name__ == "__main__":
"""
python gui_agents/osworld_run_maestro.py --max_steps 3
python gui_agents/osworld_run_maestro.py --test_all_meta_path evaluation_examples/test_tiny-answer_question.json
"""
os.environ["TOKENIZERS_PARALLELISM"] = "false"
args = config()
# Normalize to absolute paths to avoid relative path dependency on current working directory
try:
repo_root = Path(__file__).resolve().parents[1]
if not os.path.isabs(args.test_config_base_dir):
args.test_config_base_dir = str((repo_root / args.test_config_base_dir).resolve())
if not os.path.isabs(args.test_all_meta_path):
args.test_all_meta_path = str((repo_root / args.test_all_meta_path).resolve())
if not os.path.isabs(args.path_to_vm):
args.path_to_vm = str((repo_root / args.path_to_vm).resolve())
except Exception:
pass
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
test_all_meta = json.load(f)
if args.domain != "all":
test_all_meta = {args.domain: test_all_meta[args.domain]}
test_file_list = get_unfinished(
args.action_space,
args.observation_type,
args.result_dir,
test_all_meta,
)
left_info = ""
for domain in test_file_list:
left_info += f"{domain}: {len(test_file_list[domain])}\n"
logger.info(f"Left tasks:\n{left_info}")
get_result(
args.action_space,
args.observation_type,
args.result_dir,
test_all_meta,
)
test(args, test_file_list)