Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
Hiroid
2025-09-08 15:07:21 +08:00
committed by GitHub
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions

@@ -0,0 +1,121 @@
# Maestro Utilities
This directory contains various utility functions for the Maestro project to improve code reusability and maintainability.
## File Structure
```
gui_agents/utils/
├── README.md # This document
├── file_utils.py # File operation utilities
├── id_utils.py # ID generation utilities
└── common_utils.py # Other common utilities
```
## file_utils.py - File Operation Utilities
### File Locking Mechanism
```python
from gui_agents.utils.file_utils import locked
# Cross-platform file lock, supports Windows and Unix systems
with locked(file_path, "w") as f:
f.write("content")
```
### Safe JSON Operations
```python
from gui_agents.utils.file_utils import safe_write_json, safe_read_json
# Safely write JSON file (atomic operation)
safe_write_json(file_path, data)
# Safely read JSON file
data = safe_read_json(file_path, default={})
```
### Safe Text Operations
```python
from gui_agents.utils.file_utils import safe_write_text, safe_read_text
# Safely write text file (UTF-8 encoding)
safe_write_text(file_path, content)
# Safely read text file (automatic encoding detection)
content = safe_read_text(file_path)
```
### File Management Tools
```python
from gui_agents.utils.file_utils import ensure_directory, backup_file
# Ensure directory exists
ensure_directory(path)
# Create file backup
backup_path = backup_file(file_path, ".backup")
```
## id_utils.py - ID Generation Utilities
### UUID Generation
```python
from gui_agents.utils.id_utils import generate_uuid, generate_short_id
# Generate complete UUID
uuid_str = generate_uuid() # "550e8400-e29b-41d4-a716-446655440000"
# Generate short ID
short_id = generate_short_id("task", 8) # "task550e8400"
```
### Timestamp ID
```python
from gui_agents.utils.id_utils import generate_timestamp_id
# Timestamp-based ID
ts_id = generate_timestamp_id("event") # "event1755576661494"
```
### Hash ID
```python
from gui_agents.utils.id_utils import generate_hash_id
# Content hash-based ID
hash_id = generate_hash_id("some content", "hash", 8) # "hasha1b2c3d4"
```
### Composite ID
```python
from gui_agents.utils.id_utils import generate_composite_id
# Composite ID (prefix + timestamp + UUID)
composite_id = generate_composite_id("task", True, True, "_") # "task_1755576661494_550e8400"
```
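### Sequential and Validation Helpers
`id_utils.py` also provides a process-local sequential counter and an ID validator. A small sketch (outputs illustrative):
```python
from gui_agents.utils.id_utils import generate_sequential_id, validate_id_format
# Sequential ID (module-level counter, not thread-safe)
req_id = generate_sequential_id("req")  # "req1", then "req2", ...
# Validate prefix and length constraints
validate_id_format(req_id, expected_prefix="req", min_length=4)  # True
```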
## Usage in NewGlobalState
The new `NewGlobalState` class has been refactored to use these utility functions:
```python
from gui_agents.utils.file_utils import safe_write_json, safe_read_json
from gui_agents.utils.id_utils import generate_uuid
class NewGlobalState:
def __init__(self, ...):
self.task_id = task_id or f"task-{generate_uuid()[:8]}"
def set_task(self, task_data):
safe_write_json(self.task_path, task_data)
def get_task(self):
return safe_read_json(self.task_path, {})
```

@@ -0,0 +1,339 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Display.json analyzer - Extract and analyze execution statistics from display.json files
"""
import json
import os
import glob
import re
from typing import Dict, List, Tuple
def extract_cost_value(cost_str: str) -> tuple:
    """
    Extract numeric value and currency symbol from cost string (e.g., "0.000343¥" -> (0.000343, "¥"))
    Args:
        cost_str: Cost string with currency symbol
    Returns:
        Tuple of (float value, currency symbol)
    """
    # Extract numeric value and currency symbol
    match = re.search(r'([\d.]+)([¥$€£]*)', cost_str)
    if match:
        value = float(match.group(1))
        currency = match.group(2) if match.group(2) else "¥"  # Default to ¥ if no symbol found
        return value, currency
    return 0.0, "¥"
def convert_currency_to_yuan(value: float, currency: str) -> float:
"""
Convert different currencies to yuan (¥) for consistent cost calculation
Args:
value: Cost value
currency: Currency symbol
Returns:
Value converted to yuan
"""
# Simple conversion rates (you might want to use real-time rates in production)
conversion_rates = {
"": 1.0,
"¥": 1.0,
"$": 7.2, # USD to CNY (approximate)
"": 7.8, # EUR to CNY (approximate)
"£": 9.1, # GBP to CNY (approximate)
}
rate = conversion_rates.get(currency, 1.0)
return value * rate
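# Example (illustrative): extract_cost_value("0.0021$") returns (0.0021, "$"),
# and convert_currency_to_yuan(0.0021, "$") returns 0.01512 at the approximate 7.2 rate.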
def analyze_display_json(file_path: str) -> Dict:
"""
Analyze a single display.json file and extract statistics
Args:
file_path: Path to the display.json file
Returns:
Dictionary containing analysis results
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
except Exception as e:
print(f"Error reading {file_path}: {e}")
return {}
# Initialize counters
action_count = 0
total_duration = 0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_cost = 0.0
currency_symbol = "" # Default currency symbol
# Check if this is agents3 format (has controller.main_loop_completed)
is_agents3 = False
if 'operations' in data and 'controller' in data['operations']:
for operation in data['operations']['controller']:
if operation.get('operation') == 'main_loop_completed':
is_agents3 = True
# Extract agents3 statistics
action_count = operation.get('step_count', 0)
total_duration = int(operation.get('duration', 0))
break
if is_agents3:
        # Agents3 mode: step_count and duration were already extracted above.
        # Extract tokens and cost from all operations
if 'operations' in data:
for module_name, module_operations in data['operations'].items():
if isinstance(module_operations, list):
for operation in module_operations:
# Extract tokens if available
tokens = operation.get('tokens', [0, 0, 0])
if isinstance(tokens, list) and len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost if available
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(
cost_value, currency)
total_cost += cost_in_yuan
                        # Always use ¥ for consistency
                        currency_symbol = "¥"
# Check if this is a fast mode or normal mode display.json
elif 'operations' in data and 'agent' in data['operations']:
# Fast mode analysis - similar to original logic
if 'operations' in data and 'agent' in data['operations']:
            ops_list = list(data['operations']['agent'])
            ops_list.extend(data['operations'].get('grounding', []))
for operation in ops_list:
if operation.get('operation') == 'fast_planning_execution':
action_count += 1
# Extract tokens
tokens = operation.get('tokens', [0, 0, 0])
if len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(cost_value, currency)
total_cost += cost_in_yuan
currency_symbol = "" # Always use ¥ for consistency
# Extract total execution time for fast mode
if 'operations' in data and 'other' in data['operations']:
for operation in data['operations']['other']:
if operation.get('operation') == 'total_execution_time_fast':
total_duration = int(operation.get('duration', 0))
break
else:
# Normal mode analysis - analyze specific operations
if 'operations' in data:
# Define the operations to count for tokens and cost
token_cost_operations = {
'formulate_query', 'retrieve_narrative_experience', 'retrieve_knowledge',
'knowledge_fusion', 'subtask_planner', 'generated_dag', 'reflection',
'episode_summarization', 'narrative_summarization', 'Worker.retrieve_episodic_experience',
'action_plan', 'grounding_model_response'
}
# Count hardware operations as steps
if 'hardware' in data['operations']:
action_count = len(data['operations']['hardware'])
# Extract tokens and cost from specific operations across all modules
for module_name, module_operations in data['operations'].items():
if isinstance(module_operations, list):
for operation in module_operations:
operation_type = operation.get('operation', '')
# Only count tokens and cost for specified operations
if operation_type in token_cost_operations:
# Extract tokens if available
tokens = operation.get('tokens', [0, 0, 0])
if isinstance(tokens, list) and len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost if available
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(cost_value, currency)
total_cost += cost_in_yuan
                            # Always use ¥ for consistency
                            currency_symbol = "¥"
# Extract total execution time for normal mode
if 'other' in data['operations']:
for operation in data['operations']['other']:
if operation.get('operation') == 'total_execution_time':
total_duration = int(operation.get('duration', 0))
break
return {
'action_count': action_count,
'total_duration': total_duration,
'total_input_tokens': total_input_tokens,
'total_output_tokens': total_output_tokens,
'total_tokens': total_tokens,
'total_cost': total_cost,
'currency_symbol': currency_symbol
}
def analyze_folder(folder_path: str) -> List[Dict]:
"""
Analyze all display.json files in a folder
Args:
folder_path: Path to the folder containing display.json files
Returns:
List of analysis results for each file
"""
results = []
# Find all display.json files recursively
pattern = os.path.join(folder_path, "**", "display.json")
display_files = glob.glob(pattern, recursive=True)
if not display_files:
print(f"No display.json files found in {folder_path}")
return results
print(f"Found {len(display_files)} display.json files")
for file_path in display_files:
print(f"Analyzing: {file_path}")
result = analyze_display_json(file_path)
if result:
result['file_path'] = file_path
results.append(result)
return results
def aggregate_results(results: List[Dict]) -> Dict:
"""
Aggregate results from multiple files
Args:
results: List of analysis results
Returns:
Aggregated statistics
"""
if not results:
return {}
total_fast_actions = sum(r['action_count'] for r in results)
total_duration = max(r['total_duration'] for r in results) if results else 0
total_input_tokens = sum(r['total_input_tokens'] for r in results)
total_output_tokens = sum(r['total_output_tokens'] for r in results)
total_tokens = sum(r['total_tokens'] for r in results)
total_cost = sum(r['total_cost'] for r in results)
# Use the currency symbol from the first result, or default to ¥
    currency_symbol = results[0].get('currency_symbol', '¥') if results else '¥'
return {
'total_fast_actions': total_fast_actions,
'total_duration': total_duration,
'total_input_tokens': total_input_tokens,
'total_output_tokens': total_output_tokens,
'total_tokens': total_tokens,
'total_cost': total_cost,
'currency_symbol': currency_symbol
}
def format_output_line(stats: Dict) -> str:
"""
Format statistics into a single output line
Args:
stats: Statistics dictionary
Returns:
Formatted output line
"""
if not stats:
return "No data available"
# Format: steps, duration (seconds), tokens, cost
    # Aggregated stats use 'total_fast_actions'; per-file stats use 'action_count'
    steps = stats.get('total_fast_actions', stats.get('action_count', 0))
    duration = stats.get('total_duration', 0)
    tokens = (stats.get('total_input_tokens', 0), stats.get('total_output_tokens', 0), stats.get('total_tokens', 0))
    cost = stats.get('total_cost', 0.0)
    return f"{steps}, {duration}, {tokens}, {cost:.4f}{stats.get('currency_symbol', '¥')}"
def main():
"""
Main function to analyze display.json files
"""
import sys
if len(sys.argv) < 2:
print("Usage: python analyze_display.py <folder_path>")
print("Example: python analyze_display.py lybicguiagents/runtime")
return
folder_path = sys.argv[1]
if not os.path.exists(folder_path):
print(f"Folder not found: {folder_path}")
return
# Analyze all display.json files in the folder
results = analyze_folder(folder_path)
if not results:
print("No valid display.json files found")
return
# Aggregate results
aggregated_stats = aggregate_results(results)
# Print the required single line output
print("\nStatistics:")
print("-" * 80)
print("Steps, Duration (seconds), (Input Tokens, Output Tokens, Total Tokens), Cost")
print("-" * 80)
output_line = format_output_line(aggregated_stats)
print(output_line)
print("-" * 80)
if __name__ == "__main__":
main()
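
The script can also be driven programmatically; a minimal sketch, assuming this file is importable as `analyze_display` (module name hypothetical):

```python
from analyze_display import analyze_folder, aggregate_results, format_output_line

# Analyze every display.json under a runtime directory (path illustrative)
results = analyze_folder("runtime")
if results:
    print(format_output_line(aggregate_results(results)))
```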

@@ -0,0 +1,577 @@
import json
import re
import time
import tiktoken
import numpy as np
import os
import platform
import io
from PIL import Image
import logging
from typing import Tuple, List, Union, Dict, Optional
from pydantic import BaseModel, ValidationError
import pickle
class Node(BaseModel):
name: str
info: str
# New fields for failed task analysis
assignee_role: Optional[str] = None
error_type: Optional[str] = None # Error type: UI_ERROR, EXECUTION_ERROR, PLANNING_ERROR, etc.
error_message: Optional[str] = None # Specific error message
failure_count: Optional[int] = 0 # Failure count
last_failure_time: Optional[str] = None # Last failure time
suggested_action: Optional[str] = None # Suggested repair action
class Dag(BaseModel):
nodes: List[Node]
edges: List[List[Node]]
class SafeLoggingFilter(logging.Filter):
"""
Safe logging filter that prevents logging format errors
Handles cases where log message format strings don't match arguments
"""
def filter(self, record):
"""
Filter log records to prevent format errors
"""
try:
# Try to format the message to catch format errors early
if hasattr(record, 'msg') and hasattr(record, 'args') and record.args:
try:
# Test if the message can be formatted with the provided args
if isinstance(record.msg, str) and '%s' in record.msg:
# Count %s placeholders in the message
placeholder_count = record.msg.count('%s')
args_count = len(record.args)
if placeholder_count != args_count:
# Mismatch detected, create safe message
record.msg = f"[Format mismatch prevented] Msg: {record.msg[:100]}{'...' if len(str(record.msg)) > 100 else ''}, Args count: {args_count}"
record.args = ()
return True
# Test if the message can be formatted with the provided args
_ = record.msg % record.args
except (TypeError, ValueError) as e:
# If formatting fails, create a safe message
record.msg = f"[Logging format error prevented] Original message: {str(record.msg)[:100]}{'...' if len(str(record.msg)) > 100 else ''}, Args: {record.args}"
record.args = ()
return True
except Exception as e:
# If anything goes wrong, allow the record through but with a safe message
record.msg = f"[Logging filter error: {e}] Original message could not be processed safely"
record.args = ()
return True
class ImageDataFilter(logging.Filter):
"""
Custom log filter for filtering log records containing image binary data
Specifically designed to filter image data in multimodal model API calls
"""
# Image data characteristic identifiers
IMAGE_INDICATORS = [
'data:image', # data URL format
'iVBORw0KGgo', # PNG base64 beginning
'/9j/', # JPEG base64 beginning
'R0lGOD', # GIF base64 beginning
'UklGR', # WEBP base64 beginning
'Qk0', # BMP base64 beginning
]
# Binary file headers
BINARY_HEADERS = [
b'\xff\xd8\xff', # JPEG file header
b'\x89PNG\r\n\x1a\n', # PNG file header
b'GIF87a', # GIF87a file header
b'GIF89a', # GIF89a file header
b'RIFF', # WEBP/WAV file header
b'BM', # BMP file header
]
def filter(self, record):
"""
Filter image data from log records
"""
try:
# Process log message
if hasattr(record, 'msg') and record.msg:
record.msg = self._filter_message(record.msg)
# Process log arguments
if hasattr(record, 'args') and record.args:
record.args = self._filter_args(record.args)
except Exception as e:
# If filtering process fails, log error but don't block log output
record.msg = f"[Log filter error: {e}] Original message may contain image data"
record.args = ()
return True
def _filter_message(self, msg):
"""
Filter image data from messages
"""
msg_str = str(msg)
# If message is very long, it may contain image data
if len(msg_str) > 5000: # Lower threshold to 5KB
# Check if contains image data characteristics
if self._contains_image_data(msg_str):
return f"[LLM Call Log] Contains image data (size: {len(msg_str)} characters) - filtered"
# Check if contains binary data characteristics
if self._contains_binary_data(msg_str):
return f"[LLM Call Log] Contains binary data (size: {len(msg_str)} characters) - filtered"
return msg
def _filter_args(self, args):
"""
Filter image data from arguments
"""
filtered_args = []
for arg in args:
if isinstance(arg, (bytes, bytearray)):
# Process binary data
if len(arg) > 1000: # Binary data larger than 1KB
if self._is_image_binary(arg):
filtered_args.append(f"[Image binary data filtered, size: {len(arg)} bytes]")
else:
filtered_args.append(f"[Binary data filtered, size: {len(arg)} bytes]")
else:
filtered_args.append(arg)
elif isinstance(arg, str):
# Process string data
if len(arg) > 5000: # Strings larger than 5KB
if self._contains_image_data(arg):
filtered_args.append(f"[Image string data filtered, size: {len(arg)} characters]")
else:
filtered_args.append(arg)
else:
filtered_args.append(arg)
else:
# Keep other data types directly
filtered_args.append(arg)
return tuple(filtered_args)
def _contains_image_data(self, text):
"""
Check if text contains image data
"""
text_lower = text.lower()
return any(indicator in text_lower for indicator in self.IMAGE_INDICATORS)
def _contains_binary_data(self, text):
"""
Check if text contains large amounts of binary data
"""
# Check if contains large amounts of non-ASCII characters (possibly base64-encoded binary data)
non_ascii_count = sum(1 for char in text if ord(char) > 127)
non_ascii_ratio = non_ascii_count / len(text) if len(text) > 0 else 0
# If non-ASCII character ratio exceeds 10%, it might be binary data
return non_ascii_ratio > 0.1
def _is_image_binary(self, data):
"""
Check if binary data is an image
"""
if len(data) < 10:
return False
# Check file headers
for header in self.BINARY_HEADERS:
if data.startswith(header):
return True
return False
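# Typical wiring (an assumption; not shown elsewhere in this commit): attach both
# filters to the agent logger so malformed format strings and base64 screenshots
# never reach the log files:
#     logger = logging.getLogger("desktopenv.agent")
#     logger.addFilter(SafeLoggingFilter())
#     logger.addFilter(ImageDataFilter())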
NUM_IMAGE_TOKEN = 1105  # Approximate token count for a 1920x1080 screenshot with OpenAI vision models
def calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]:
num_input_images = 0
output_message = messages[-1]
input_message = messages[:-1]
input_string = """"""
for message in input_message:
input_string += message["content"][0]["text"] + "\n"
if len(message["content"]) > 1:
num_input_images += 1
input_text_tokens = get_input_token_length(input_string)
input_image_tokens = num_image_token * num_input_images
output_tokens = get_input_token_length(output_message["content"][0]["text"])
return (input_text_tokens + input_image_tokens), output_tokens
def parse_dag(text):
"""
Try extracting JSON from <json>…</json> tags first;
if not found, try ```json … ``` Markdown fences.
If both fail, try to parse the entire text as JSON.
"""
logger = logging.getLogger("desktopenv.agent")
def _extract(pattern):
m = re.search(pattern, text, re.DOTALL)
return m.group(1).strip() if m else None
# 1) look for <json>…</json>
json_str = _extract(r"<json>(.*?)</json>")
# 2) fallback to ```json … ```
if json_str is None:
json_str = _extract(r"```json\s*(.*?)\s*```")
if json_str is None:
# 3) try other possible code block formats
json_str = _extract(r"```\s*(.*?)\s*```")
# 4) if still not found, try to parse the entire text
if json_str is None:
logger.warning("JSON markers not found, attempting to parse entire text")
json_str = text.strip()
# Log the extracted JSON string
logger.debug(f"Extracted JSON string: {json_str[:100]}...")
try:
# Try to parse as JSON directly
payload = json.loads(json_str)
except json.JSONDecodeError as e:
logger.error(f"JSON parsing error: {e}")
# Try to fix common JSON format issues
try:
# Replace single quotes with double quotes
fixed_json = json_str.replace("'", "\"")
payload = json.loads(fixed_json)
logger.info("Successfully fixed JSON by replacing single quotes with double quotes")
except json.JSONDecodeError:
# Try to find and extract possible JSON objects
try:
# Look for content between { and }
match = re.search(r"\{(.*)\}", json_str, re.DOTALL)
if match:
fixed_json = "{" + match.group(1) + "}"
payload = json.loads(fixed_json)
logger.info("Successfully fixed JSON by extracting JSON object")
else:
logger.error("Unable to fix JSON format")
return None
except Exception:
logger.error("All JSON fixing attempts failed")
return None
# Check if payload contains dag key
if "dag" not in payload:
logger.warning("'dag' key not found in JSON, attempting to use entire JSON object")
# If no dag key, try to use the entire payload
try:
# Check if payload directly conforms to Dag structure
if "nodes" in payload and "edges" in payload:
return Dag(**payload)
else:
# Iterate through top-level keys to find possible dag structure
for key, value in payload.items():
if isinstance(value, dict) and "nodes" in value and "edges" in value:
logger.info(f"Found DAG structure in key '{key}'")
return Dag(**value)
logger.error("Could not find valid DAG structure in JSON")
return None
except ValidationError as e:
logger.error(f"Data structure validation error: {e}")
return None
# Normal case, use value of dag key
try:
return Dag(**payload["dag"])
except ValidationError as e:
logger.error(f"DAG data structure validation error: {e}")
return None
except Exception as e:
logger.error(f"Unknown error parsing DAG: {e}")
return None
def parse_single_code_from_string(input_string):
    input_string = input_string.strip()
    if input_string in ["WAIT", "DONE", "FAIL"]:
        return input_string
pattern = r"```(?:\w+\s+)?(.*?)```"
matches = re.findall(pattern, input_string, re.DOTALL)
codes = []
for match in matches:
match = match.strip()
commands = ["WAIT", "DONE", "FAIL"]
if match in commands:
codes.append(match.strip())
elif match.split("\n")[-1] in commands:
if len(match.split("\n")) > 1:
codes.append("\n".join(match.split("\n")[:-1]))
codes.append(match.split("\n")[-1])
else:
codes.append(match)
if len(codes) > 0:
return codes[0]
# The pattern matches function calls with balanced parentheses and quotes
code_match = re.search(r"(\w+\.\w+\((?:[^()]*|\([^()]*\))*\))", input_string)
if code_match:
return code_match.group(1)
lines = [line.strip() for line in input_string.splitlines() if line.strip()]
if lines:
return lines[0]
return "fail"
def get_input_token_length(input_string):
enc = tiktoken.encoding_for_model("gpt-4")
tokens = enc.encode(input_string)
return len(tokens)
def parse_screenshot_analysis(action_plan: str) -> str:
"""Parse the Screenshot Analysis section from the LLM response.
Args:
action_plan: The raw LLM response text
Returns:
The screenshot analysis text, or empty string if not found
"""
try:
# Look for Screenshot Analysis section
if "(Screenshot Analysis)" in action_plan:
# Find the start of Screenshot Analysis section
start_idx = action_plan.find("(Screenshot Analysis)")
# Find the next section marker
next_sections = ["(Next Action)", "(Grounded Action)", "(Previous action verification)"]
end_idx = len(action_plan)
for section in next_sections:
section_idx = action_plan.find(section, start_idx + 1)
if section_idx != -1 and section_idx < end_idx:
end_idx = section_idx
# Extract the content between markers
analysis_start = start_idx + len("(Screenshot Analysis)")
analysis_text = action_plan[analysis_start:end_idx].strip()
return analysis_text
return ""
except Exception as e:
return ""
def parse_technician_screenshot_analysis(command_plan: str) -> str:
"""Parse the Screenshot Analysis section from the technician LLM response.
Args:
command_plan: The raw LLM response text
Returns:
The screenshot analysis text, or empty string if not found
"""
try:
# Look for Screenshot Analysis section
if "(Screenshot Analysis)" in command_plan:
# Find the start of Screenshot Analysis section
start_idx = command_plan.find("(Screenshot Analysis)")
# Find the next section marker
next_sections = ["(Next Action)"]
end_idx = len(command_plan)
for section in next_sections:
section_idx = command_plan.find(section, start_idx + 1)
if section_idx != -1 and section_idx < end_idx:
end_idx = section_idx
# Extract the content between markers
analysis_start = start_idx + len("(Screenshot Analysis)")
analysis_text = command_plan[analysis_start:end_idx].strip()
return analysis_text
return ""
except Exception as e:
return ""
def sanitize_code(code):
# This pattern captures the outermost double-quoted text
if "\n" in code:
pattern = r'(".*?")'
# Find all matches in the text
matches = re.findall(pattern, code, flags=re.DOTALL)
if matches:
# Replace the first occurrence only
first_match = matches[0]
code = code.replace(first_match, f'"""{first_match[1:-1]}"""', 1)
return code
def extract_first_agent_function(code_string):
# Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses
pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)'
# Find all matches in the string
matches = re.findall(pattern, code_string)
# Return the first match if found, otherwise return None
return matches[0] if matches else None
def load_knowledge_base(kb_path: str) -> Dict:
try:
with open(kb_path, "r") as f:
return json.load(f)
except Exception as e:
print(f"Error loading knowledge base: {e}")
return {}
def clean_empty_embeddings(embeddings: Dict) -> Dict:
to_delete = []
for k, v in embeddings.items():
arr = np.array(v)
if arr.size == 0 or arr.shape == () or (
isinstance(v, list) and v and isinstance(v[0], str) and v[0].startswith('Error:')
) or (isinstance(v, str) and v.startswith('Error:')):
to_delete.append(k)
for k in to_delete:
del embeddings[k]
return embeddings
def load_embeddings(embeddings_path: str) -> Dict:
try:
with open(embeddings_path, "rb") as f:
embeddings = pickle.load(f)
embeddings = clean_empty_embeddings(embeddings)
return embeddings
except Exception as e:
# print(f"Error loading embeddings: {e}")
print(f"Empty embeddings file: {embeddings_path}")
return {}
def save_embeddings(embeddings_path: str, embeddings: Dict):
try:
import os
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
with open(embeddings_path, "wb") as f:
pickle.dump(embeddings, f)
except Exception as e:
print(f"Error saving embeddings: {e}")
def agent_log_to_string(agent_log: List[Dict]) -> str:
"""
Converts a list of agent log entries into a single string for LLM consumption.
Args:
agent_log: A list of dictionaries, where each dictionary is an agent log entry.
Returns:
A formatted string representing the agent log.
"""
if not agent_log:
return "No agent log entries yet."
log_strings = ["[AGENT LOG]"]
for entry in agent_log:
entry_id = entry.get("id", "N/A")
entry_type = entry.get("type", "N/A").capitalize()
content = entry.get("content", "")
log_strings.append(f"[Entry {entry_id} - {entry_type}] {content}")
return "\n".join(log_strings)
def show_task_completion_notification(task_status: str, error_message: str = ""):
"""
Show a popup notification for task completion status.
Args:
task_status: Task status, supports 'success', 'failed', 'completed', 'error'
error_message: Error message (used only when status is 'error')
"""
try:
current_platform = platform.system()
if task_status == "success":
title = "Maestro"
message = "Task Completed Successfully"
dialog_type = "info"
elif task_status == "failed":
title = "Maestro"
message = "Task Failed/Rejected"
dialog_type = "error"
elif task_status == "completed":
title = "Maestro"
message = "Task Execution Completed"
dialog_type = "info"
elif task_status == "error":
title = "Maestro Error"
message = f"Task Execution Error: {error_message[:100] if error_message else 'Unknown error'}"
dialog_type = "error"
else:
title = "Maestro"
message = "Task Execution Completed"
dialog_type = "info"
if current_platform == "Darwin":
# macOS
os.system(
f'osascript -e \'display dialog "{message}" with title "{title}" buttons "OK" default button "OK"\''
)
elif current_platform == "Linux":
# Linux
if dialog_type == "error":
os.system(
f'zenity --error --title="{title}" --text="{message}" --width=300 --height=150'
)
else:
os.system(
f'zenity --info --title="{title}" --text="{message}" --width=200 --height=100'
)
elif current_platform == "Windows":
# Windows
os.system(
f'msg %username% "{message}"'
)
else:
print(f"\n[{title}] {message}")
    except Exception as e:
        # Avoid referencing `message` here: it may be unbound if the error
        # occurred before it was assigned
        print(f"\n[Agents3] Failed to show notification: {e}")
        print(f"[Agents3] Task status: {task_status}")
def screenshot_bytes_to_pil_image(screenshot_bytes: bytes) -> Optional[Image.Image]:
"""
Convert the bytes data of obs["screenshot"] to a PIL Image object, preserving the original size
Args:
screenshot_bytes: The bytes data of the screenshot
Returns:
        PIL Image object; raises RuntimeError if conversion fails
"""
try:
# Create PIL Image object directly from bytes
image = Image.open(io.BytesIO(screenshot_bytes))
return image
except Exception as e:
raise RuntimeError(f"Failed to convert screenshot bytes to PIL Image: {e}")
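
For reference, a minimal sketch of `parse_dag` from this module on a well-formed response (the payload is illustrative):

```python
text = '<json>{"dag": {"nodes": [{"name": "open_app", "info": "Open the target app"}], "edges": []}}</json>'
dag = parse_dag(text)
print(dag.nodes[0].name)  # "open_app"
```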

@@ -0,0 +1,281 @@
#!/usr/bin/env python
"""
Display Viewer - Used to display operation records in display.json file in chronological order
Usage:
python -m lybicguiagents.gui_agents.utils.display_viewer --file /path/to/display.json [--output text|json] [--filter module1,module2]
"""
import os
import sys
import json
import argparse
import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
def load_display_json(file_path: str) -> Dict:
"""
Load display.json file
Args:
file_path: Path to display.json file
Returns:
Parsed JSON data
"""
try:
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except UnicodeDecodeError:
print(
f"Warning: Failed to decode '{file_path}' with utf-8, retrying with GB2312..."
)
with open(file_path, 'r', encoding='gb2312') as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: File '{file_path}' does not exist")
sys.exit(1)
except json.JSONDecodeError:
print(f"Error: File '{file_path}' is not a valid JSON format")
sys.exit(1)
except Exception as e:
print(f"Error: An error occurred while reading file '{file_path}': {e}")
sys.exit(1)
def flatten_operations(data: Dict) -> List[Dict]:
"""
Flatten all module operation records into a time-sorted list
Args:
data: display.json data
Returns:
List of operation records sorted by time
"""
all_operations = []
if "operations" not in data:
return all_operations
for module, operations in data["operations"].items():
for op in operations:
# Add module information
op["module"] = module
all_operations.append(op)
# Sort by timestamp
all_operations.sort(key=lambda x: x.get("timestamp", 0))
return all_operations
def format_timestamp(timestamp: float) -> str:
"""
Format timestamp into readable datetime
Args:
timestamp: UNIX timestamp
Returns:
Formatted datetime string
"""
dt = datetime.datetime.fromtimestamp(timestamp)
return dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
def format_duration(duration: float) -> str:
"""
Format duration
Args:
duration: Duration (seconds)
Returns:
Formatted duration string
"""
if duration < 0.001:
return f"{duration * 1000000:.2f}μs"
elif duration < 1:
return f"{duration * 1000:.2f}ms"
else:
return f"{duration:.2f}s"
def format_tokens(tokens: List[int]) -> str:
"""
Format tokens information
Args:
tokens: [input tokens, output tokens, total tokens]
Returns:
Formatted tokens string
"""
if not tokens or len(tokens) < 3:
return "N/A"
return f"in:{tokens[0]} out:{tokens[1]} total:{tokens[2]}"
def truncate_text(text: str, max_length: int = 100) -> str:
"""
Truncate text, add ellipsis when exceeding maximum length
Args:
text: Original text
max_length: Maximum length
Returns:
Truncated text
"""
if not text:
return ""
if isinstance(text, (dict, list)):
text = str(text)
if len(text) <= max_length:
return text
return text[:max_length - 3] + "..."
def find_latest_display_json() -> Optional[str]:
"""
Find the latest display.json file
Returns:
Path to the latest display.json file, or None if not found
"""
# Look for the runtime folder in the current directory
runtime_dir = Path("runtime")
if not runtime_dir.exists() or not runtime_dir.is_dir():
# Try looking in the parent directory
parent_runtime = Path("..") / "runtime"
if parent_runtime.exists() and parent_runtime.is_dir():
runtime_dir = parent_runtime
else:
return None
# Find all timestamp folders
timestamp_dirs = [d for d in runtime_dir.iterdir() if d.is_dir()]
if not timestamp_dirs:
return None
# Sort by folder name (timestamp) and take the latest
latest_dir = sorted(timestamp_dirs)[-1]
display_file = latest_dir / "display.json"
if display_file.exists():
return str(display_file)
return None
def main():
    parser = argparse.ArgumentParser(
        description="Display operation records in display.json file in chronological order")
parser.add_argument("--file", help="Path to display.json file")
parser.add_argument("--dir", help="Path to directory containing display.json files (recursive)")
parser.add_argument("--output",
choices=["text", "json"],
default="text",
help="Output format (default: text)")
parser.add_argument(
"--filter",
help="Modules to filter, separated by commas (e.g., manager,worker)")
args = parser.parse_args()
if args.file and args.dir:
print("Error: --file and --dir cannot be used together")
sys.exit(1)
def process_one_file(file_path: str):
# Load data
data = load_display_json(file_path)
# Flatten and sort operations
operations = flatten_operations(data)
# Handle module filtering
filter_modules = None
if args.filter:
filter_modules = [module.strip() for module in args.filter.split(",")]
# Generate output content
output_content = ""
if args.output == "json":
# Filter operations if modules are specified
if filter_modules:
filtered_ops = [op for op in operations if op["module"] in filter_modules]
else:
filtered_ops = operations
output_content = json.dumps(filtered_ops, indent=2, ensure_ascii=False)
else:
# Generate text format output
output_lines = []
            # Filter first so the displayed index has no gaps
            display_ops = [op for op in operations
                           if not filter_modules or op["module"] in filter_modules]
            for i, op in enumerate(display_ops):
module = op["module"]
operation = op.get("operation", "unknown")
timestamp = format_timestamp(op.get("timestamp", 0))
# Output basic information
output_lines.append(f"{i+1:3d} | {timestamp} | {module:10} | {operation}")
# Output detailed information
if "duration" in op:
output_lines.append(f" └─ Duration: {format_duration(op['duration'])}")
if "tokens" in op:
output_lines.append(f" └─ Tokens: {format_tokens(op['tokens'])}")
if "cost" in op:
output_lines.append(f" └─ Cost: {op['cost']}")
if "content" in op:
content = op["content"]
output_lines.append(f" └─ Content: {content}")
if "status" in op:
output_lines.append(f" └─ Status: {op['status']}")
output_lines.append("")
output_content = "\n".join(output_lines)
# Write output to file
input_path = Path(file_path)
output_filename = f"display_viewer_output_{args.output}.txt"
output_path = input_path.parent / output_filename
try:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(output_content)
print(f"Output written to: {output_path}")
except Exception as e:
print(f"Error writing output file: {e}")
sys.exit(1)
if args.dir:
for root, dirs, files in os.walk(args.dir):
for file in files:
if file == "display.json":
file_path = os.path.join(root, file)
print(f"Processing: {file_path}")
process_one_file(file_path)
return
file_path = args.file
if not file_path:
file_path = find_latest_display_json()
if not file_path:
print(
"Error: Cannot find display.json file, please specify file path using --file parameter"
)
sys.exit(1)
print(f"Using the latest display.json file: {file_path}")
process_one_file(file_path)
if __name__ == "__main__":
"""
python display_viewer.py --file
python display_viewer.py --dir
"""
main()
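
The loader and flattener can also be used programmatically; a minimal sketch, assuming this file is importable as `display_viewer` (module name hypothetical):

```python
from display_viewer import find_latest_display_json, load_display_json, flatten_operations

path = find_latest_display_json()  # newest runtime/<timestamp>/display.json, if any
if path:
    ops = flatten_operations(load_display_json(path))
    print(f"{len(ops)} operations across {len({op['module'] for op in ops})} modules")
```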

@@ -0,0 +1,53 @@
import numpy as np
from ..utils.common_utils import (
load_embeddings,
save_embeddings,
)
import os
# List all embeddings' keys and their shapes
def list_embeddings(embeddings_path: str):
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return {}
embeddings = load_embeddings(embeddings_path)
info = {}
for k, v in embeddings.items():
arr = np.array(v)
info[k] = {'shape': arr.shape, 'preview': arr.flatten()[:5].tolist()}
return info
# Delete a specific embedding by key
def delete_embedding(embeddings_path: str, key: str) -> bool:
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return False
embeddings = load_embeddings(embeddings_path)
if key not in embeddings:
print(f"[EmbeddingManager] Key not found: {key}")
return False
del embeddings[key]
save_embeddings(embeddings_path, embeddings)
print(f"[EmbeddingManager] Deleted embedding for key: {key}")
return True
def delete_empty_shape_embeddings(embeddings_path: str) -> int:
"""Delete all embeddings whose value is empty (shape==0), shape==(), or content is error string, and return the number deleted."""
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return 0
embeddings = load_embeddings(embeddings_path)
to_delete = []
for k, v in embeddings.items():
arr = np.array(v)
# Delete shape==0 or shape==() or content is string/error information
if arr.size == 0 or arr.shape == () or (
isinstance(v, list) and v and isinstance(v[0], str) and v[0].startswith('Error:')
) or (isinstance(v, str) and v.startswith('Error:')):
to_delete.append(k)
for k in to_delete:
del embeddings[k]
print(f"[EmbeddingManager] Deleted empty or error embedding for key: {k}")
if to_delete:
save_embeddings(embeddings_path, embeddings)
return len(to_delete)
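
A hedged maintenance sketch using these helpers (the pickle path is illustrative):

```python
path = "kb/embeddings.pkl"  # illustrative path to an embeddings pickle
for key, meta in list_embeddings(path).items():
    print(key, meta["shape"])
removed = delete_empty_shape_embeddings(path)
print(f"Removed {removed} empty or error entries")
```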

@@ -0,0 +1,170 @@
# file_utils.py
import json
import os
import logging
from pathlib import Path
from contextlib import contextmanager
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ========= File Lock Tools =========
@contextmanager
def locked(path: Path, mode: str):
"""File lock context manager for cross-platform compatibility"""
if os.name == "nt":
# Windows implementation
import msvcrt
import time as _t
# Always use UTF-8 encoding for text files on Windows
if 'b' in mode:
f = open(path, mode)
else:
f = open(path, mode, encoding="utf-8")
try:
while True:
try:
msvcrt.locking(f.fileno(), msvcrt.LK_NBLCK, 1)
break
except OSError:
_t.sleep(0.01)
yield f
finally:
f.seek(0)
msvcrt.locking(f.fileno(), msvcrt.LK_UNLCK, 1)
f.close()
else:
# Unix-like systems implementation
import fcntl
# Always use UTF-8 encoding for text files on Unix-like systems
if 'b' in mode:
f = open(path, mode)
else:
f = open(path, mode, encoding="utf-8")
try:
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
yield f
finally:
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
f.close()
# ========= Safe JSON Operations =========
def safe_json_dump(data: Any, file_handle, **kwargs) -> None:
"""Safely dump JSON data with proper encoding handling"""
kwargs.setdefault('ensure_ascii', False)
kwargs.setdefault('indent', 2)
try:
json.dump(data, file_handle, **kwargs)
except UnicodeEncodeError as e:
logger.warning(f"UnicodeEncodeError during JSON dump: {e}. Falling back to ASCII mode.")
kwargs['ensure_ascii'] = True
json.dump(data, file_handle, **kwargs)
def safe_json_load(file_handle) -> Any:
"""Safely load JSON data with proper encoding handling"""
try:
return json.load(file_handle)
except UnicodeDecodeError as e:
logger.warning(f"UnicodeDecodeError during JSON load: {e}. Attempting recovery.")
file_handle.seek(0)
content = file_handle.read()
# Try common encodings
for encoding in ['utf-8-sig', 'latin1', 'cp1252']:
try:
if isinstance(content, bytes):
decoded_content = content.decode(encoding)
else:
decoded_content = content
return json.loads(decoded_content)
except (UnicodeDecodeError, json.JSONDecodeError):
continue
logger.error("Failed to decode JSON with all attempted encodings. Returning empty data.")
return {}
def safe_write_json(path: Path, data: Any) -> None:
"""Safely write JSON data to file with atomic operation"""
tmp = path.with_suffix(".tmp")
try:
with locked(tmp, "w") as f:
safe_json_dump(data, f)
f.flush()
os.fsync(f.fileno())
tmp.replace(path)
except Exception as e:
logger.error(f"Failed to write JSON to {path}: {e}")
if tmp.exists():
try:
tmp.unlink()
except Exception:
pass
raise
def safe_read_json(path: Path, default: Any = None) -> Any:
"""Safely read JSON data from file"""
try:
with locked(path, "r") as f:
return safe_json_load(f)
except Exception as e:
logger.warning(f"Failed to read JSON from {path}: {e}")
return default if default is not None else []
# ========= Safe Text File Operations =========
def safe_write_text(path: Path, content: str) -> None:
"""Safely write text to file with UTF-8 encoding"""
try:
path.write_text(content, encoding='utf-8')
except UnicodeEncodeError as e:
logger.warning(f"UnicodeEncodeError writing to {path}: {e}. Using error handling.")
path.write_text(content, encoding='utf-8', errors='replace')
def safe_read_text(path: Path) -> str:
"""Safely read text from file with proper encoding handling"""
try:
return path.read_text(encoding='utf-8')
except UnicodeDecodeError as e:
logger.warning(f"UnicodeDecodeError reading {path}: {e}. Trying alternative encodings.")
for encoding in ['utf-8-sig', 'latin1', 'cp1252', 'gbk']:
try:
return path.read_text(encoding=encoding)
except UnicodeDecodeError:
continue
logger.error(f"Failed to decode {path} with all encodings. Using error replacement.")
return path.read_text(encoding='utf-8', errors='replace')
# ========= File Management Utilities =========
def ensure_directory(path: Path) -> None:
"""Ensure directory exists, create if necessary"""
path.mkdir(parents=True, exist_ok=True)
def safe_file_operation(operation_name: str, file_path: Path, operation_func, *args, **kwargs):
"""Generic safe file operation wrapper with error handling"""
try:
return operation_func(*args, **kwargs)
except FileNotFoundError:
logger.error(f"{operation_name}: File not found: {file_path}")
raise
except PermissionError:
logger.error(f"{operation_name}: Permission denied: {file_path}")
raise
except Exception as e:
logger.error(f"{operation_name}: Unexpected error with {file_path}: {e}")
raise
def backup_file(file_path: Path, backup_suffix: str = ".backup") -> Path:
"""Create a backup of a file"""
backup_path = file_path.with_suffix(file_path.suffix + backup_suffix)
try:
if file_path.exists():
import shutil
shutil.copy2(file_path, backup_path)
logger.info(f"Backup created: {backup_path}")
return backup_path
except Exception as e:
logger.error(f"Failed to create backup of {file_path}: {e}")
raise
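
`safe_file_operation` is a thin wrapper that logs and re-raises; a usage sketch (paths illustrative, import path per the README):

```python
from pathlib import Path
from gui_agents.utils.file_utils import safe_file_operation, safe_read_json

cfg = Path("config.json")  # illustrative path
data = safe_file_operation("read config", cfg, safe_read_json, cfg, default={})
```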

@@ -0,0 +1,69 @@
# id_utils.py
import uuid
import time
import hashlib
from typing import Optional
# Module-level counter for sequential IDs (initialized lazily on first call)
_sequential_counter: Optional[int] = None
def generate_uuid() -> str:
"""Generate a random UUID string"""
return str(uuid.uuid4())
def generate_short_id(prefix: str = "", length: int = 8) -> str:
"""Generate a short random ID with optional prefix"""
# Generate UUID and take first N characters
short_uuid = str(uuid.uuid4()).replace("-", "")[:length]
return f"{prefix}{short_uuid}" if prefix else short_uuid
def generate_timestamp_id(prefix: str = "") -> str:
"""Generate ID based on current timestamp"""
timestamp = int(time.time() * 1000) # milliseconds
return f"{prefix}{timestamp}" if prefix else str(timestamp)
def generate_hash_id(content: str, prefix: str = "", length: int = 8) -> str:
"""Generate ID based on content hash"""
hash_obj = hashlib.md5(content.encode('utf-8'))
hash_hex = hash_obj.hexdigest()[:length]
return f"{prefix}{hash_hex}" if prefix else hash_hex
def generate_sequential_id(prefix: str = "", start: int = 1) -> str:
    """Generate a sequential ID (not thread-safe, use with caution).
    The counter is initialized from `start` on the first call and then
    increments monotonically; `start` is ignored on later calls."""
    global _sequential_counter
    if _sequential_counter is None:
        _sequential_counter = start
    current_id = _sequential_counter
    _sequential_counter += 1
    return f"{prefix}{current_id}" if prefix else str(current_id)
def generate_composite_id(prefix: str = "", include_timestamp: bool = True,
include_uuid: bool = True, separator: str = "_") -> str:
"""Generate composite ID with multiple components"""
parts = []
if prefix:
parts.append(prefix)
if include_timestamp:
parts.append(str(int(time.time() * 1000)))
if include_uuid:
parts.append(str(uuid.uuid4())[:8])
return separator.join(parts)
def validate_id_format(id_string: str, expected_prefix: Optional[str] = None,
min_length: int = 1, max_length: int = 100) -> bool:
"""Validate ID format and constraints"""
if not id_string or not isinstance(id_string, str):
return False
if len(id_string) < min_length or len(id_string) > max_length:
return False
if expected_prefix and not id_string.startswith(expected_prefix):
return False
return True
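
Hash IDs are deterministic for identical content, which makes them useful for deduplication; a quick illustration (import path per the README):

```python
from gui_agents.utils.id_utils import generate_hash_id

a = generate_hash_id("some content", "hash", 8)
b = generate_hash_id("some content", "hash", 8)
assert a == b  # same content always yields the same ID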

@@ -0,0 +1,27 @@
from PIL import Image
def pad_to_square(image: Image.Image,
fill_color=(0, 0, 0),
padding: int = 0) -> Image.Image:
"""
First make it a square, then expand the padding pixels around it.
"""
width, height = image.size
if width == height:
square_img = image.copy()
else:
new_size = max(width, height)
square_img = Image.new(image.mode, (new_size, new_size), fill_color)
left = (new_size - width) // 2
top = (new_size - height) // 2
square_img.paste(image, (left, top))
if padding > 0:
final_size = square_img.size[0] + 2 * padding
padded_img = Image.new(square_img.mode, (final_size, final_size),
fill_color)
padded_img.paste(square_img, (padding, padding))
return padded_img
else:
return square_img
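
A quick sketch of the padding math, assuming `pad_to_square` is imported from this module (dimensions illustrative):

```python
from PIL import Image

img = Image.new("RGB", (1920, 1080), (255, 255, 255))
square = pad_to_square(img, fill_color=(0, 0, 0), padding=8)
print(square.size)  # (1936, 1936): max(1920, 1080) + 2 * 8
```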