Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies. * Added **run\_maestro.py** and **osworld\_run\_maestro.py** to provide the main execution logic. * Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis. * Added a **tools module** containing utility functions and tool configurations to improve code reusability. * Updated the **README** and documentation with usage examples and module descriptions. These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience. Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
2025-09-08 15:07:21 +08:00
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions
--- a/mm_agents/maestro/cli_app_maestro.py
+++ b/mm_agents/maestro/cli_app_maestro.py
@@ -0,0 +1,346 @@
+import argparse
+from ast import arg
+import datetime
+import io
+import logging
+import os
+import platform
+import sys
+import time
+from pathlib import Path
+from dotenv import load_dotenv
+from PIL import Image
+from gui_agents.maestro.controller.main_controller import MainController
+# Import analyze_display functionality
+from gui_agents.utils.analyze_display import analyze_display_json, aggregate_results, format_output_line
+from gui_agents.utils.common_utils import show_task_completion_notification
+from desktop_env.desktop_env import DesktopEnv
+from gui_agents.utils.common_utils import ImageDataFilter, SafeLoggingFilter
+
+env_path = Path(os.path.dirname(os.path.abspath(__file__))) / '.env'
+if env_path.exists():
+    load_dotenv(dotenv_path=env_path)
+else:
+    parent_env_path = Path(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) / '.env'
+    if parent_env_path.exists():
+        load_dotenv(dotenv_path=parent_env_path)
+
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+
+log_dir = "runtime"
+os.makedirs(os.path.join(log_dir, datetime_str), exist_ok=True)
+
+file_handler = logging.FileHandler(
+    os.path.join(log_dir, datetime_str, "agents3.log"), encoding="utf-8"
+)
+debug_handler = logging.FileHandler(
+    os.path.join(log_dir, datetime_str, "agents3_debug.log"), encoding="utf-8"
+)
+stdout_handler = logging.StreamHandler(sys.stdout)
+
+# Add dedicated doubao API log handler
+doubao_handler = logging.FileHandler(
+    os.path.join(log_dir, datetime_str, "doubao_api.log"), encoding="utf-8"
+)
+
+# Create dedicated doubao API logger
+doubao_logger = logging.getLogger("doubao_api")
+doubao_logger.setLevel(logging.DEBUG)
+doubao_logger.addHandler(doubao_handler)
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(logging.INFO)
+doubao_handler.setLevel(logging.DEBUG)
+
+# Add SafeLoggingFilter to prevent format errors from third-party libraries (like OpenAI)
+safe_filter = SafeLoggingFilter()
+debug_handler.addFilter(safe_filter)
+
+# Also apply SafeLoggingFilter to OpenAI library loggers
+try:
+    import openai
+    openai_logger = logging.getLogger('openai')
+    openai_logger.addFilter(safe_filter)
+    httpx_logger = logging.getLogger('httpx')
+    httpx_logger.addFilter(safe_filter)
+    logger.info("SafeLoggingFilter applied to third-party libraries (OpenAI, HTTPX)")
+except ImportError:
+    logger.info("SafeLoggingFilter applied to main handlers only (OpenAI not available)")
+    pass
+
+if os.getenv('KEEP_IMAGE_LOGS', 'false').lower() != 'true':
+    image_filter = ImageDataFilter()
+    debug_handler.addFilter(image_filter)
+    logger.info("Image data filtering enabled - image data in debug logs will be filtered")
+else:
+    logger.info("Image data filtering disabled - debug logs will contain complete image data")
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
+)
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+doubao_handler.setFormatter(formatter)
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+
+
+def auto_analyze_execution(timestamp_dir: str):
+    """
+    Automatically analyze execution statistics from display.json files after task completion
+    
+    Args:
+        timestamp_dir: Directory containing the execution logs and display.json
+    """
+    import time
+    
+    try:
+        # Analyze the display.json file for this execution
+        display_json_path = os.path.join(timestamp_dir, "display.json")
+        
+        # Wait for file to be fully written
+        max_wait_time = 10  # Maximum wait time in seconds
+        wait_interval = 0.5  # Check every 0.5 seconds
+        waited_time = 0
+        
+        while waited_time < max_wait_time:
+            if os.path.exists(display_json_path):
+                # Check if file is still being written by monitoring its size
+                try:
+                    size1 = os.path.getsize(display_json_path)
+                    time.sleep(wait_interval)
+                    size2 = os.path.getsize(display_json_path)
+                    
+                    # If file size hasn't changed in the last 0.5 seconds, it's likely complete
+                    if size1 == size2:
+                        logger.info(f"Display.json file appears to be complete (size: {size1} bytes)")
+                        break
+                    else:
+                        logger.info(f"Display.json file still being written (size changed from {size1} to {size2} bytes)")
+                        waited_time += wait_interval
+                        continue
+                except OSError:
+                    # File might be temporarily inaccessible
+                    time.sleep(wait_interval)
+                    waited_time += wait_interval
+                    continue
+            else:
+                logger.info(f"Waiting for display.json file to be created... ({waited_time:.1f}s)")
+                time.sleep(wait_interval)
+                waited_time += wait_interval
+        
+        if os.path.exists(display_json_path):
+            logger.info(f"Auto-analyzing execution statistics from: {display_json_path}")
+            
+            # Analyze the single display.json file
+            result = analyze_display_json(display_json_path)
+            
+            if result:
+                # Format and log the statistics
+                output_line = format_output_line(result)
+                logger.info("=" * 80)
+                logger.info("EXECUTION STATISTICS:")
+                logger.info("Steps, Duration (seconds), (Input Tokens, Output Tokens, Total Tokens), Cost")
+                logger.info("=" * 80)
+                logger.info(output_line)
+                logger.info("=" * 80)
+                
+                # Also print to console for immediate visibility
+                print("\n" + "=" * 80)
+                print("EXECUTION STATISTICS:")
+                print("Steps, Duration (seconds), (Input Tokens, Output Tokens, Total Tokens), Cost")
+                print("=" * 80)
+                print(output_line)
+                print("=" * 80)
+            else:
+                logger.warning("No valid data found in display.json for analysis")
+        else:
+            logger.warning(f"Display.json file not found at: {display_json_path} after waiting {max_wait_time} seconds")
+            
+    except Exception as e:
+        logger.error(f"Error during auto-analysis: {e}")
+
+
+def run_agent_maestro(params: dict):
+    """
+    Run the maestro controller with the given instruction
+    
+    Args:
+        controller: The NewController instance to run
+        instruction: The instruction/task to execute
+        max_steps: Maximum number of steps to execute
+    """
+
+    backend = params["backend"]
+    user_query = params["query"]
+    max_steps = params["max_steps"]
+    current_platform = params["current_platform"]
+    env = params["env"]
+    env_password = params["env_password"]
+
+    import time
+    
+    logger.info(f"Starting maestro execution with instruction: {user_query}")
+    
+    total_start_time = time.time()
+    # Ensure necessary directory structure exists
+    timestamp_dir = os.path.join(log_dir, datetime_str)
+    cache_dir = os.path.join(timestamp_dir, "cache", "screens")
+    state_dir = os.path.join(timestamp_dir, "state")
+
+    os.makedirs(cache_dir, exist_ok=True)
+    os.makedirs(state_dir, exist_ok=True)
+
+    # registry = Registry(global_state)
+
+    # Initialize NewController (which includes all other components)
+    controller = MainController(
+        platform=current_platform,
+        backend=backend,
+        user_query=user_query,
+        max_steps=max_steps,
+        env=env,
+        env_password=env_password,
+        log_dir=log_dir,
+        datetime_str=datetime_str
+    )
+    
+    try:
+        # Set the user query in the controller
+        controller.execute_main_loop()
+        
+        # Check task status after execution to determine if task was successful
+        task = controller.global_state.get_task()
+        if task and task.status == "fulfilled":
+            # Task completed successfully
+            logger.info("Task completed successfully")
+            show_task_completion_notification("success")
+        elif task and task.status == "rejected":
+            # Task was rejected/failed
+            logger.info("Task was rejected/failed")
+            show_task_completion_notification("failed")
+        else:
+            # Task status unknown or incomplete
+            logger.info("Task execution completed with unknown status")
+            show_task_completion_notification("completed")
+        
+    except Exception as e:
+        logger.error(f"Error during maestro execution: {e}")
+        # Show error notification
+        show_task_completion_notification("error", str(e))
+        raise
+    
+    finally:
+        total_end_time = time.time()
+        total_duration = total_end_time - total_start_time
+        logger.info(f"Total execution time: {total_duration:.2f} seconds")
+        
+        # Auto-analyze execution statistics after task completion
+        auto_analyze_execution(timestamp_dir)
+
+
+def main():
+    parser = argparse.ArgumentParser(description='Maestro CLI Application')
+    parser.add_argument(
+        '--backend',
+        type=str,
+        default='lybic',
+        help='Backend to use (e.g., lybic, pyautogui, pyautogui_vmware)')
+    parser.add_argument('--query',
+                        type=str,
+                        default='',
+                        help='Initial query to execute')
+    parser.add_argument('--max-steps',
+                        type=int,
+                        default=50,
+                        help='Maximum number of steps to execute (default: 50)')
+    parser.add_argument(
+        '--lybic-sid',
+        type=str,
+        default=None,
+        help='Lybic precreated sandbox ID (if not provided, will use LYBIC_PRECREATE_SID environment variable)')
+    args = parser.parse_args()
+
+    env = None
+    env_password = ""
+
+    # Set platform to Windows if backend is lybic
+    if args.backend == 'lybic':
+        current_platform = 'Windows'
+        # Initialize hardware interface
+        backend_kwargs = {"platform": current_platform}
+        if args.lybic_sid is not None:
+            backend_kwargs["precreate_sid"] = args.lybic_sid
+            logger.info(f"Using Lybic SID from command line: {args.lybic_sid}")
+        else:
+            logger.info("Using Lybic SID from environment variable LYBIC_PRECREATE_SID")
+
+    elif args.backend == 'pyautogui_vmware':
+        env_password = "password"
+        current_platform = os.getenv("USE_PRECREATE_VM", "Windows")
+        if current_platform == "Ubuntu":
+            path_to_vm = os.path.join("vmware_vm_data", "Ubuntu0", "Ubuntu0.vmx")
+        elif current_platform == "Windows":
+            path_to_vm = os.path.join("vmware_vm_data", "Windows0", "Windows0.vmx")
+        else:
+            raise ValueError(f"USE_PRECREATE_VM={current_platform} is not supported. Please use Ubuntu or Windows.")
+
+        env = DesktopEnv(
+            path_to_vm=path_to_vm,
+            provider_name="vmware", 
+            os_type=current_platform, 
+            action_space="pyautogui",
+            require_a11y_tree=False
+        )
+        env.reset()
+
+    else:
+        current_platform = platform.system()
+
+    logger.info(f"Running maestro on platform: {current_platform}")
+    logger.info(f"Using backend: {args.backend}")
+
+    logger.info("Maestro components initialized successfully")            
+
+    params = {
+        "backend": args.backend,
+        "query": '',
+        "max_steps": args.max_steps,
+        "current_platform": current_platform,
+        "env": env,
+        "env_password": env_password
+    }
+    # if query is provided, run the agent on the query
+    if args.query:
+        logger.info(f"Executing query: {args.query}")
+        params["query"] = args.query
+        run_agent_maestro(params)
+
+    else:
+        while True:
+            query = input("Query: ")
+            params["query"] = query
+            # Run the agent on your own device
+            run_agent_maestro(params)
+
+            response = input("Would you like to provide another query? (y/n): ")
+            if response.lower() != "y":
+                break
+
+
+if __name__ == "__main__":
+    """
+    python gui_agents/cli_app_maestro.py --backend lybic
+    python gui_agents/cli_app_maestro.py --backend pyautogui --max-steps 1
+    python gui_agents/cli_app_maestro.py --backend pyautogui_vmware --max-steps 1
+    python gui_agents/cli_app_maestro.py --backend lybic --max-steps 15
+    python gui_agents/cli_app_maestro.py --backend lybic --lybic-sid SBX-01K1X6ZKAERXAN73KTJ1XXJXAF
+    """
+    main()