Add multiple new modules and tools to enhance the functionality and extensibility of the Maestro project (#333)

* Added a **pyproject.toml** file to define project metadata and dependencies.
* Added **run_maestro.py** and **osworld_run_maestro.py** to provide the main execution logic.
* Introduced multiple new modules, including **Evaluator**, **Controller**, **Manager**, and **Sub-Worker**, supporting task planning, state management, and data analysis.
* Added a **tools module** containing utility functions and tool configurations to improve code reusability.
* Updated the **README** and documentation with usage examples and module descriptions.

These changes lay the foundation for expanding the Maestro project’s functionality and improving the user experience.

Co-authored-by: Hiroid <guoliangxuan@deepmatrix.com>
Hiroid
2025-09-08 15:07:21 +08:00
committed by GitHub
parent 029885e78c
commit 3a4b67304f
96 changed files with 31982 additions and 2 deletions

@@ -0,0 +1,121 @@
# Maestro Utilities
This directory contains various utility functions for the Maestro project to improve code reusability and maintainability.
## File Structure
```
gui_agents/utils/
├── README.md # This document
├── file_utils.py # File operation utilities
├── id_utils.py # ID generation utilities
└── common_utils.py # Other common utilities
```
## file_utils.py - File Operation Utilities
### File Locking Mechanism
```python
from gui_agents.utils.file_utils import locked
# Cross-platform file lock, supports Windows and Unix systems
with locked(file_path, "w") as f:
f.write("content")
```
### Safe JSON Operations
```python
from gui_agents.utils.file_utils import safe_write_json, safe_read_json
# Safely write JSON file (atomic operation)
safe_write_json(file_path, data)
# Safely read JSON file
data = safe_read_json(file_path, default={})
```
### Safe Text Operations
```python
from gui_agents.utils.file_utils import safe_write_text, safe_read_text
# Safely write text file (UTF-8 encoding)
safe_write_text(file_path, content)
# Safely read text file (automatic encoding detection)
content = safe_read_text(file_path)
```
### File Management Tools
```python
from gui_agents.utils.file_utils import ensure_directory, backup_file
# Ensure directory exists
ensure_directory(path)
# Create file backup
backup_path = backup_file(file_path, ".backup")
```
## id_utils.py - ID Generation Utilities
### UUID Generation
```python
from gui_agents.utils.id_utils import generate_uuid, generate_short_id
# Generate complete UUID
uuid_str = generate_uuid() # "550e8400-e29b-41d4-a716-446655440000"
# Generate short ID
short_id = generate_short_id("task", 8) # "task550e8400"
```
### Timestamp ID
```python
from gui_agents.utils.id_utils import generate_timestamp_id
# Timestamp-based ID
ts_id = generate_timestamp_id("event") # "event1755576661494"
```
### Hash ID
```python
from gui_agents.utils.id_utils import generate_hash_id
# Content hash-based ID
hash_id = generate_hash_id("some content", "hash", 8) # "hasha1b2c3d4"
```
### Composite ID
```python
from gui_agents.utils.id_utils import generate_composite_id
# Composite ID (prefix + timestamp + UUID)
composite_id = generate_composite_id("task", True, True, "_") # "task_1755576661494_550e8400"
```
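### Sequential and Validation Helpers
`id_utils.py` also provides a process-local sequential counter and an ID validator. A small sketch (outputs illustrative):
```python
from gui_agents.utils.id_utils import generate_sequential_id, validate_id_format
# Sequential ID (module-level counter, not thread-safe)
req_id = generate_sequential_id("req")  # "req1", then "req2", ...
# Validate prefix and length constraints
validate_id_format(req_id, expected_prefix="req", min_length=4)  # True
```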
## Usage in NewGlobalState
The new `NewGlobalState` class has been refactored to use these utility functions:
```python
from gui_agents.utils.file_utils import safe_write_json, safe_read_json
from gui_agents.utils.id_utils import generate_uuid
class NewGlobalState:
def __init__(self, ...):
self.task_id = task_id or f"task-{generate_uuid()[:8]}"
def set_task(self, task_data):
safe_write_json(self.task_path, task_data)
def get_task(self):
return safe_read_json(self.task_path, {})
```

@@ -0,0 +1,339 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Display.json analyzer - Extract and analyze execution statistics from display.json files
"""
import json
import os
import glob
import re
from typing import Dict, List, Tuple
def extract_cost_value(cost_str: str) -> tuple:
    """
    Extract numeric value and currency symbol from cost string (e.g., "0.000343¥" -> (0.000343, "¥"))
    Args:
        cost_str: Cost string with currency symbol
    Returns:
        Tuple of (float value, currency symbol)
    """
    # Extract numeric value and currency symbol
    match = re.search(r'([\d.]+)([¥$€£]*)', cost_str)
    if match:
        value = float(match.group(1))
        currency = match.group(2) if match.group(2) else "¥"  # Default to ¥ if no symbol found
        return value, currency
    return 0.0, "¥"
def convert_currency_to_yuan(value: float, currency: str) -> float:
"""
Convert different currencies to yuan (¥) for consistent cost calculation
Args:
value: Cost value
currency: Currency symbol
Returns:
Value converted to yuan
"""
# Simple conversion rates (you might want to use real-time rates in production)
conversion_rates = {
"": 1.0,
"¥": 1.0,
"$": 7.2, # USD to CNY (approximate)
"": 7.8, # EUR to CNY (approximate)
"£": 9.1, # GBP to CNY (approximate)
}
rate = conversion_rates.get(currency, 1.0)
return value * rate
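# Example (illustrative): extract_cost_value("0.0021$") returns (0.0021, "$"),
# and convert_currency_to_yuan(0.0021, "$") returns 0.01512 at the approximate 7.2 rate.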
def analyze_display_json(file_path: str) -> Dict:
"""
Analyze a single display.json file and extract statistics
Args:
file_path: Path to the display.json file
Returns:
Dictionary containing analysis results
"""
try:
with open(file_path, 'r', encoding='utf-8') as f:
data = json.load(f)
except Exception as e:
print(f"Error reading {file_path}: {e}")
return {}
# Initialize counters
action_count = 0
total_duration = 0
total_input_tokens = 0
total_output_tokens = 0
total_tokens = 0
total_cost = 0.0
currency_symbol = "" # Default currency symbol
# Check if this is agents3 format (has controller.main_loop_completed)
is_agents3 = False
if 'operations' in data and 'controller' in data['operations']:
for operation in data['operations']['controller']:
if operation.get('operation') == 'main_loop_completed':
is_agents3 = True
# Extract agents3 statistics
action_count = operation.get('step_count', 0)
total_duration = int(operation.get('duration', 0))
break
if is_agents3:
        # Agents3 mode: step_count and duration were already extracted above.
        # Extract tokens and cost from all operations
if 'operations' in data:
for module_name, module_operations in data['operations'].items():
if isinstance(module_operations, list):
for operation in module_operations:
# Extract tokens if available
tokens = operation.get('tokens', [0, 0, 0])
if isinstance(tokens, list) and len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost if available
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(
cost_value, currency)
total_cost += cost_in_yuan
                        # Always use ¥ for consistency
                        currency_symbol = "¥"
# Check if this is a fast mode or normal mode display.json
elif 'operations' in data and 'agent' in data['operations']:
# Fast mode analysis - similar to original logic
if 'operations' in data and 'agent' in data['operations']:
            ops_list = list(data['operations']['agent'])
            ops_list.extend(data['operations'].get('grounding', []))
for operation in ops_list:
if operation.get('operation') == 'fast_planning_execution':
action_count += 1
# Extract tokens
tokens = operation.get('tokens', [0, 0, 0])
if len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(cost_value, currency)
total_cost += cost_in_yuan
currency_symbol = "" # Always use ¥ for consistency
# Extract total execution time for fast mode
if 'operations' in data and 'other' in data['operations']:
for operation in data['operations']['other']:
if operation.get('operation') == 'total_execution_time_fast':
total_duration = int(operation.get('duration', 0))
break
else:
# Normal mode analysis - analyze specific operations
if 'operations' in data:
# Define the operations to count for tokens and cost
token_cost_operations = {
'formulate_query', 'retrieve_narrative_experience', 'retrieve_knowledge',
'knowledge_fusion', 'subtask_planner', 'generated_dag', 'reflection',
'episode_summarization', 'narrative_summarization', 'Worker.retrieve_episodic_experience',
'action_plan', 'grounding_model_response'
}
# Count hardware operations as steps
if 'hardware' in data['operations']:
action_count = len(data['operations']['hardware'])
# Extract tokens and cost from specific operations across all modules
for module_name, module_operations in data['operations'].items():
if isinstance(module_operations, list):
for operation in module_operations:
operation_type = operation.get('operation', '')
# Only count tokens and cost for specified operations
if operation_type in token_cost_operations:
# Extract tokens if available
tokens = operation.get('tokens', [0, 0, 0])
if isinstance(tokens, list) and len(tokens) >= 3:
total_input_tokens += tokens[0]
total_output_tokens += tokens[1]
total_tokens += tokens[2]
# Extract cost if available
cost_str = operation.get('cost', '0¥')
cost_value, currency = extract_cost_value(cost_str)
# Convert to yuan for consistent calculation
cost_in_yuan = convert_currency_to_yuan(cost_value, currency)
total_cost += cost_in_yuan
                            # Always use ¥ for consistency
                            currency_symbol = "¥"
# Extract total execution time for normal mode
if 'other' in data['operations']:
for operation in data['operations']['other']:
if operation.get('operation') == 'total_execution_time':
total_duration = int(operation.get('duration', 0))
break
return {
'action_count': action_count,
'total_duration': total_duration,
'total_input_tokens': total_input_tokens,
'total_output_tokens': total_output_tokens,
'total_tokens': total_tokens,
'total_cost': total_cost,
'currency_symbol': currency_symbol
}
def analyze_folder(folder_path: str) -> List[Dict]:
"""
Analyze all display.json files in a folder
Args:
folder_path: Path to the folder containing display.json files
Returns:
List of analysis results for each file
"""
results = []
# Find all display.json files recursively
pattern = os.path.join(folder_path, "**", "display.json")
display_files = glob.glob(pattern, recursive=True)
if not display_files:
print(f"No display.json files found in {folder_path}")
return results
print(f"Found {len(display_files)} display.json files")
for file_path in display_files:
print(f"Analyzing: {file_path}")
result = analyze_display_json(file_path)
if result:
result['file_path'] = file_path
results.append(result)
return results
def aggregate_results(results: List[Dict]) -> Dict:
"""
Aggregate results from multiple files
Args:
results: List of analysis results
Returns:
Aggregated statistics
"""
if not results:
return {}
total_fast_actions = sum(r['action_count'] for r in results)
total_duration = max(r['total_duration'] for r in results) if results else 0
total_input_tokens = sum(r['total_input_tokens'] for r in results)
total_output_tokens = sum(r['total_output_tokens'] for r in results)
total_tokens = sum(r['total_tokens'] for r in results)
total_cost = sum(r['total_cost'] for r in results)
# Use the currency symbol from the first result, or default to ¥
    currency_symbol = results[0].get('currency_symbol', '¥') if results else '¥'
return {
'total_fast_actions': total_fast_actions,
'total_duration': total_duration,
'total_input_tokens': total_input_tokens,
'total_output_tokens': total_output_tokens,
'total_tokens': total_tokens,
'total_cost': total_cost,
'currency_symbol': currency_symbol
}
def format_output_line(stats: Dict) -> str:
"""
Format statistics into a single output line
Args:
stats: Statistics dictionary
Returns:
Formatted output line
"""
if not stats:
return "No data available"
# Format: steps, duration (seconds), tokens, cost
    # Aggregated stats use 'total_fast_actions'; per-file stats use 'action_count'
    steps = stats.get('total_fast_actions', stats.get('action_count', 0))
    duration = stats.get('total_duration', 0)
    tokens = (stats.get('total_input_tokens', 0), stats.get('total_output_tokens', 0), stats.get('total_tokens', 0))
    cost = stats.get('total_cost', 0.0)
    return f"{steps}, {duration}, {tokens}, {cost:.4f}{stats.get('currency_symbol', '¥')}"
def main():
"""
Main function to analyze display.json files
"""
import sys
if len(sys.argv) < 2:
print("Usage: python analyze_display.py <folder_path>")
print("Example: python analyze_display.py lybicguiagents/runtime")
return
folder_path = sys.argv[1]
if not os.path.exists(folder_path):
print(f"Folder not found: {folder_path}")
return
# Analyze all display.json files in the folder
results = analyze_folder(folder_path)
if not results:
print("No valid display.json files found")
return
# Aggregate results
aggregated_stats = aggregate_results(results)
# Print the required single line output
print("\nStatistics:")
print("-" * 80)
print("Steps, Duration (seconds), (Input Tokens, Output Tokens, Total Tokens), Cost")
print("-" * 80)
output_line = format_output_line(aggregated_stats)
print(output_line)
print("-" * 80)
if __name__ == "__main__":
main()
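
The script can also be driven programmatically; a minimal sketch, assuming this file is importable as `analyze_display` (module name hypothetical):

```python
from analyze_display import analyze_folder, aggregate_results, format_output_line

# Analyze every display.json under a runtime directory (path illustrative)
results = analyze_folder("runtime")
if results:
    print(format_output_line(aggregate_results(results)))
```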

@@ -0,0 +1,577 @@
import json
import re
import time
import tiktoken
import numpy as np
import os
import platform
import io
from PIL import Image
import logging
from typing import Tuple, List, Union, Dict, Optional
from pydantic import BaseModel, ValidationError
import pickle
class Node(BaseModel):
name: str
info: str
# New fields for failed task analysis
assignee_role: Optional[str] = None
error_type: Optional[str] = None # Error type: UI_ERROR, EXECUTION_ERROR, PLANNING_ERROR, etc.
error_message: Optional[str] = None # Specific error message
failure_count: Optional[int] = 0 # Failure count
last_failure_time: Optional[str] = None # Last failure time
suggested_action: Optional[str] = None # Suggested repair action
class Dag(BaseModel):
nodes: List[Node]
edges: List[List[Node]]
class SafeLoggingFilter(logging.Filter):
"""
Safe logging filter that prevents logging format errors
Handles cases where log message format strings don't match arguments
"""
def filter(self, record):
"""
Filter log records to prevent format errors
"""
try:
# Try to format the message to catch format errors early
if hasattr(record, 'msg') and hasattr(record, 'args') and record.args:
try:
# Test if the message can be formatted with the provided args
if isinstance(record.msg, str) and '%s' in record.msg:
# Count %s placeholders in the message
placeholder_count = record.msg.count('%s')
args_count = len(record.args)
if placeholder_count != args_count:
# Mismatch detected, create safe message
record.msg = f"[Format mismatch prevented] Msg: {record.msg[:100]}{'...' if len(str(record.msg)) > 100 else ''}, Args count: {args_count}"
record.args = ()
return True
# Test if the message can be formatted with the provided args
_ = record.msg % record.args
except (TypeError, ValueError) as e:
# If formatting fails, create a safe message
record.msg = f"[Logging format error prevented] Original message: {str(record.msg)[:100]}{'...' if len(str(record.msg)) > 100 else ''}, Args: {record.args}"
record.args = ()
return True
except Exception as e:
# If anything goes wrong, allow the record through but with a safe message
record.msg = f"[Logging filter error: {e}] Original message could not be processed safely"
record.args = ()
return True
class ImageDataFilter(logging.Filter):
"""
Custom log filter for filtering log records containing image binary data
Specifically designed to filter image data in multimodal model API calls
"""
# Image data characteristic identifiers
IMAGE_INDICATORS = [
'data:image', # data URL format
'iVBORw0KGgo', # PNG base64 beginning
'/9j/', # JPEG base64 beginning
'R0lGOD', # GIF base64 beginning
'UklGR', # WEBP base64 beginning
'Qk0', # BMP base64 beginning
]
# Binary file headers
BINARY_HEADERS = [
b'\xff\xd8\xff', # JPEG file header
b'\x89PNG\r\n\x1a\n', # PNG file header
b'GIF87a', # GIF87a file header
b'GIF89a', # GIF89a file header
b'RIFF', # WEBP/WAV file header
b'BM', # BMP file header
]
def filter(self, record):
"""
Filter image data from log records
"""
try:
# Process log message
if hasattr(record, 'msg') and record.msg:
record.msg = self._filter_message(record.msg)
# Process log arguments
if hasattr(record, 'args') and record.args:
record.args = self._filter_args(record.args)
except Exception as e:
# If filtering process fails, log error but don't block log output
record.msg = f"[Log filter error: {e}] Original message may contain image data"
record.args = ()
return True
def _filter_message(self, msg):
"""
Filter image data from messages
"""
msg_str = str(msg)
# If message is very long, it may contain image data
if len(msg_str) > 5000: # Lower threshold to 5KB
# Check if contains image data characteristics
if self._contains_image_data(msg_str):
return f"[LLM Call Log] Contains image data (size: {len(msg_str)} characters) - filtered"
# Check if contains binary data characteristics
if self._contains_binary_data(msg_str):
return f"[LLM Call Log] Contains binary data (size: {len(msg_str)} characters) - filtered"
return msg
def _filter_args(self, args):
"""
Filter image data from arguments
"""
filtered_args = []
for arg in args:
if isinstance(arg, (bytes, bytearray)):
# Process binary data
if len(arg) > 1000: # Binary data larger than 1KB
if self._is_image_binary(arg):
filtered_args.append(f"[Image binary data filtered, size: {len(arg)} bytes]")
else:
filtered_args.append(f"[Binary data filtered, size: {len(arg)} bytes]")
else:
filtered_args.append(arg)
elif isinstance(arg, str):
# Process string data
if len(arg) > 5000: # Strings larger than 5KB
if self._contains_image_data(arg):
filtered_args.append(f"[Image string data filtered, size: {len(arg)} characters]")
else:
filtered_args.append(arg)
else:
filtered_args.append(arg)
else:
# Keep other data types directly
filtered_args.append(arg)
return tuple(filtered_args)
def _contains_image_data(self, text):
"""
Check if text contains image data
"""
text_lower = text.lower()
return any(indicator in text_lower for indicator in self.IMAGE_INDICATORS)
def _contains_binary_data(self, text):
"""
Check if text contains large amounts of binary data
"""
# Check if contains large amounts of non-ASCII characters (possibly base64-encoded binary data)
non_ascii_count = sum(1 for char in text if ord(char) > 127)
non_ascii_ratio = non_ascii_count / len(text) if len(text) > 0 else 0
# If non-ASCII character ratio exceeds 10%, it might be binary data
return non_ascii_ratio > 0.1
def _is_image_binary(self, data):
"""
Check if binary data is an image
"""
if len(data) < 10:
return False
# Check file headers
for header in self.BINARY_HEADERS:
if data.startswith(header):
return True
return False
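# Typical wiring (an assumption; not shown elsewhere in this commit): attach both
# filters to the agent logger so malformed format strings and base64 screenshots
# never reach the log files:
#     logger = logging.getLogger("desktopenv.agent")
#     logger.addFilter(SafeLoggingFilter())
#     logger.addFilter(ImageDataFilter())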
NUM_IMAGE_TOKEN = 1105  # Approximate token count for a 1920x1080 screenshot with OpenAI vision models
def calculate_tokens(messages, num_image_token=NUM_IMAGE_TOKEN) -> Tuple[int, int]:
num_input_images = 0
output_message = messages[-1]
input_message = messages[:-1]
input_string = """"""
for message in input_message:
input_string += message["content"][0]["text"] + "\n"
if len(message["content"]) > 1:
num_input_images += 1
input_text_tokens = get_input_token_length(input_string)
input_image_tokens = num_image_token * num_input_images
output_tokens = get_input_token_length(output_message["content"][0]["text"])
return (input_text_tokens + input_image_tokens), output_tokens
def parse_dag(text):
"""
Try extracting JSON from <json>…</json> tags first;
if not found, try ```json … ``` Markdown fences.
If both fail, try to parse the entire text as JSON.
"""
logger = logging.getLogger("desktopenv.agent")
def _extract(pattern):
m = re.search(pattern, text, re.DOTALL)
return m.group(1).strip() if m else None
# 1) look for <json>…</json>
json_str = _extract(r"<json>(.*?)</json>")
# 2) fallback to ```json … ```
if json_str is None:
json_str = _extract(r"```json\s*(.*?)\s*```")
if json_str is None:
# 3) try other possible code block formats
json_str = _extract(r"```\s*(.*?)\s*```")
# 4) if still not found, try to parse the entire text
if json_str is None:
logger.warning("JSON markers not found, attempting to parse entire text")
json_str = text.strip()
# Log the extracted JSON string
logger.debug(f"Extracted JSON string: {json_str[:100]}...")
try:
# Try to parse as JSON directly
payload = json.loads(json_str)
except json.JSONDecodeError as e:
logger.error(f"JSON parsing error: {e}")
# Try to fix common JSON format issues
try:
# Replace single quotes with double quotes
fixed_json = json_str.replace("'", "\"")
payload = json.loads(fixed_json)
logger.info("Successfully fixed JSON by replacing single quotes with double quotes")
except json.JSONDecodeError:
# Try to find and extract possible JSON objects
try:
# Look for content between { and }
match = re.search(r"\{(.*)\}", json_str, re.DOTALL)
if match:
fixed_json = "{" + match.group(1) + "}"
payload = json.loads(fixed_json)
logger.info("Successfully fixed JSON by extracting JSON object")
else:
logger.error("Unable to fix JSON format")
return None
except Exception:
logger.error("All JSON fixing attempts failed")
return None
# Check if payload contains dag key
if "dag" not in payload:
logger.warning("'dag' key not found in JSON, attempting to use entire JSON object")
# If no dag key, try to use the entire payload
try:
# Check if payload directly conforms to Dag structure
if "nodes" in payload and "edges" in payload:
return Dag(**payload)
else:
# Iterate through top-level keys to find possible dag structure
for key, value in payload.items():
if isinstance(value, dict) and "nodes" in value and "edges" in value:
logger.info(f"Found DAG structure in key '{key}'")
return Dag(**value)
logger.error("Could not find valid DAG structure in JSON")
return None
except ValidationError as e:
logger.error(f"Data structure validation error: {e}")
return None
# Normal case, use value of dag key
try:
return Dag(**payload["dag"])
except ValidationError as e:
logger.error(f"DAG data structure validation error: {e}")
return None
except Exception as e:
logger.error(f"Unknown error parsing DAG: {e}")
return None
def parse_single_code_from_string(input_string):
    input_string = input_string.strip()
    if input_string in ["WAIT", "DONE", "FAIL"]:
        return input_string
pattern = r"```(?:\w+\s+)?(.*?)```"
matches = re.findall(pattern, input_string, re.DOTALL)
codes = []
for match in matches:
match = match.strip()
commands = ["WAIT", "DONE", "FAIL"]
if match in commands:
codes.append(match.strip())
elif match.split("\n")[-1] in commands:
if len(match.split("\n")) > 1:
codes.append("\n".join(match.split("\n")[:-1]))
codes.append(match.split("\n")[-1])
else:
codes.append(match)
if len(codes) > 0:
return codes[0]
# The pattern matches function calls with balanced parentheses and quotes
code_match = re.search(r"(\w+\.\w+\((?:[^()]*|\([^()]*\))*\))", input_string)
if code_match:
return code_match.group(1)
lines = [line.strip() for line in input_string.splitlines() if line.strip()]
if lines:
return lines[0]
return "fail"
def get_input_token_length(input_string):
enc = tiktoken.encoding_for_model("gpt-4")
tokens = enc.encode(input_string)
return len(tokens)
def parse_screenshot_analysis(action_plan: str) -> str:
"""Parse the Screenshot Analysis section from the LLM response.
Args:
action_plan: The raw LLM response text
Returns:
The screenshot analysis text, or empty string if not found
"""
try:
# Look for Screenshot Analysis section
if "(Screenshot Analysis)" in action_plan:
# Find the start of Screenshot Analysis section
start_idx = action_plan.find("(Screenshot Analysis)")
# Find the next section marker
next_sections = ["(Next Action)", "(Grounded Action)", "(Previous action verification)"]
end_idx = len(action_plan)
for section in next_sections:
section_idx = action_plan.find(section, start_idx + 1)
if section_idx != -1 and section_idx < end_idx:
end_idx = section_idx
# Extract the content between markers
analysis_start = start_idx + len("(Screenshot Analysis)")
analysis_text = action_plan[analysis_start:end_idx].strip()
return analysis_text
return ""
except Exception as e:
return ""
def parse_technician_screenshot_analysis(command_plan: str) -> str:
"""Parse the Screenshot Analysis section from the technician LLM response.
Args:
command_plan: The raw LLM response text
Returns:
The screenshot analysis text, or empty string if not found
"""
try:
# Look for Screenshot Analysis section
if "(Screenshot Analysis)" in command_plan:
# Find the start of Screenshot Analysis section
start_idx = command_plan.find("(Screenshot Analysis)")
# Find the next section marker
next_sections = ["(Next Action)"]
end_idx = len(command_plan)
for section in next_sections:
section_idx = command_plan.find(section, start_idx + 1)
if section_idx != -1 and section_idx < end_idx:
end_idx = section_idx
# Extract the content between markers
analysis_start = start_idx + len("(Screenshot Analysis)")
analysis_text = command_plan[analysis_start:end_idx].strip()
return analysis_text
return ""
except Exception as e:
return ""
def sanitize_code(code):
# This pattern captures the outermost double-quoted text
if "\n" in code:
pattern = r'(".*?")'
# Find all matches in the text
matches = re.findall(pattern, code, flags=re.DOTALL)
if matches:
# Replace the first occurrence only
first_match = matches[0]
code = code.replace(first_match, f'"""{first_match[1:-1]}"""', 1)
return code
def extract_first_agent_function(code_string):
# Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses
pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)'
# Find all matches in the string
matches = re.findall(pattern, code_string)
# Return the first match if found, otherwise return None
return matches[0] if matches else None
def load_knowledge_base(kb_path: str) -> Dict:
try:
with open(kb_path, "r") as f:
return json.load(f)
except Exception as e:
print(f"Error loading knowledge base: {e}")
return {}
def clean_empty_embeddings(embeddings: Dict) -> Dict:
to_delete = []
for k, v in embeddings.items():
arr = np.array(v)
if arr.size == 0 or arr.shape == () or (
isinstance(v, list) and v and isinstance(v[0], str) and v[0].startswith('Error:')
) or (isinstance(v, str) and v.startswith('Error:')):
to_delete.append(k)
for k in to_delete:
del embeddings[k]
return embeddings
def load_embeddings(embeddings_path: str) -> Dict:
try:
with open(embeddings_path, "rb") as f:
embeddings = pickle.load(f)
embeddings = clean_empty_embeddings(embeddings)
return embeddings
except Exception as e:
# print(f"Error loading embeddings: {e}")
print(f"Empty embeddings file: {embeddings_path}")
return {}
def save_embeddings(embeddings_path: str, embeddings: Dict):
try:
import os
os.makedirs(os.path.dirname(embeddings_path), exist_ok=True)
with open(embeddings_path, "wb") as f:
pickle.dump(embeddings, f)
except Exception as e:
print(f"Error saving embeddings: {e}")
def agent_log_to_string(agent_log: List[Dict]) -> str:
"""
Converts a list of agent log entries into a single string for LLM consumption.
Args:
agent_log: A list of dictionaries, where each dictionary is an agent log entry.
Returns:
A formatted string representing the agent log.
"""
if not agent_log:
return "No agent log entries yet."
log_strings = ["[AGENT LOG]"]
for entry in agent_log:
entry_id = entry.get("id", "N/A")
entry_type = entry.get("type", "N/A").capitalize()
content = entry.get("content", "")
log_strings.append(f"[Entry {entry_id} - {entry_type}] {content}")
return "\n".join(log_strings)
def show_task_completion_notification(task_status: str, error_message: str = ""):
"""
Show a popup notification for task completion status.
Args:
task_status: Task status, supports 'success', 'failed', 'completed', 'error'
error_message: Error message (used only when status is 'error')
"""
try:
current_platform = platform.system()
if task_status == "success":
title = "Maestro"
message = "Task Completed Successfully"
dialog_type = "info"
elif task_status == "failed":
title = "Maestro"
message = "Task Failed/Rejected"
dialog_type = "error"
elif task_status == "completed":
title = "Maestro"
message = "Task Execution Completed"
dialog_type = "info"
elif task_status == "error":
title = "Maestro Error"
message = f"Task Execution Error: {error_message[:100] if error_message else 'Unknown error'}"
dialog_type = "error"
else:
title = "Maestro"
message = "Task Execution Completed"
dialog_type = "info"
if current_platform == "Darwin":
# macOS
os.system(
f'osascript -e \'display dialog "{message}" with title "{title}" buttons "OK" default button "OK"\''
)
elif current_platform == "Linux":
# Linux
if dialog_type == "error":
os.system(
f'zenity --error --title="{title}" --text="{message}" --width=300 --height=150'
)
else:
os.system(
f'zenity --info --title="{title}" --text="{message}" --width=200 --height=100'
)
elif current_platform == "Windows":
# Windows
os.system(
f'msg %username% "{message}"'
)
else:
print(f"\n[{title}] {message}")
    except Exception as e:
        # Avoid referencing `message` here: it may be unbound if the error
        # occurred before it was assigned
        print(f"\n[Agents3] Failed to show notification: {e}")
        print(f"[Agents3] Task status: {task_status}")
def screenshot_bytes_to_pil_image(screenshot_bytes: bytes) -> Optional[Image.Image]:
"""
Convert the bytes data of obs["screenshot"] to a PIL Image object, preserving the original size
Args:
screenshot_bytes: The bytes data of the screenshot
Returns:
        PIL Image object; raises RuntimeError if conversion fails
"""
try:
# Create PIL Image object directly from bytes
image = Image.open(io.BytesIO(screenshot_bytes))
return image
except Exception as e:
raise RuntimeError(f"Failed to convert screenshot bytes to PIL Image: {e}")
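
For reference, a minimal sketch of `parse_dag` from this module on a well-formed response (the payload is illustrative):

```python
text = '<json>{"dag": {"nodes": [{"name": "open_app", "info": "Open the target app"}], "edges": []}}</json>'
dag = parse_dag(text)
print(dag.nodes[0].name)  # "open_app"
```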

@@ -0,0 +1,281 @@
#!/usr/bin/env python
"""
Display Viewer - Used to display operation records in display.json file in chronological order
Usage:
python -m lybicguiagents.gui_agents.utils.display_viewer --file /path/to/display.json [--output text|json] [--filter module1,module2]
"""
import os
import sys
import json
import argparse
import datetime
from pathlib import Path
from typing import Dict, List, Any, Optional, Tuple
def load_display_json(file_path: str) -> Dict:
"""
Load display.json file
Args:
file_path: Path to display.json file
Returns:
Parsed JSON data
"""
try:
try:
with open(file_path, 'r', encoding='utf-8') as f:
return json.load(f)
except UnicodeDecodeError:
print(
f"Warning: Failed to decode '{file_path}' with utf-8, retrying with GB2312..."
)
with open(file_path, 'r', encoding='gb2312') as f:
return json.load(f)
except FileNotFoundError:
print(f"Error: File '{file_path}' does not exist")
sys.exit(1)
except json.JSONDecodeError:
print(f"Error: File '{file_path}' is not a valid JSON format")
sys.exit(1)
except Exception as e:
print(f"Error: An error occurred while reading file '{file_path}': {e}")
sys.exit(1)
def flatten_operations(data: Dict) -> List[Dict]:
"""
Flatten all module operation records into a time-sorted list
Args:
data: display.json data
Returns:
List of operation records sorted by time
"""
all_operations = []
if "operations" not in data:
return all_operations
for module, operations in data["operations"].items():
for op in operations:
# Add module information
op["module"] = module
all_operations.append(op)
# Sort by timestamp
all_operations.sort(key=lambda x: x.get("timestamp", 0))
return all_operations
def format_timestamp(timestamp: float) -> str:
"""
Format timestamp into readable datetime
Args:
timestamp: UNIX timestamp
Returns:
Formatted datetime string
"""
dt = datetime.datetime.fromtimestamp(timestamp)
return dt.strftime("%Y-%m-%d %H:%M:%S.%f")[:-3]
def format_duration(duration: float) -> str:
"""
Format duration
Args:
duration: Duration (seconds)
Returns:
Formatted duration string
"""
if duration < 0.001:
return f"{duration * 1000000:.2f}μs"
elif duration < 1:
return f"{duration * 1000:.2f}ms"
else:
return f"{duration:.2f}s"
def format_tokens(tokens: List[int]) -> str:
"""
Format tokens information
Args:
tokens: [input tokens, output tokens, total tokens]
Returns:
Formatted tokens string
"""
if not tokens or len(tokens) < 3:
return "N/A"
return f"in:{tokens[0]} out:{tokens[1]} total:{tokens[2]}"
def truncate_text(text: str, max_length: int = 100) -> str:
"""
Truncate text, add ellipsis when exceeding maximum length
Args:
text: Original text
max_length: Maximum length
Returns:
Truncated text
"""
if not text:
return ""
if isinstance(text, (dict, list)):
text = str(text)
if len(text) <= max_length:
return text
return text[:max_length - 3] + "..."
def find_latest_display_json() -> Optional[str]:
"""
Find the latest display.json file
Returns:
Path to the latest display.json file, or None if not found
"""
# Look for the runtime folder in the current directory
runtime_dir = Path("runtime")
if not runtime_dir.exists() or not runtime_dir.is_dir():
# Try looking in the parent directory
parent_runtime = Path("..") / "runtime"
if parent_runtime.exists() and parent_runtime.is_dir():
runtime_dir = parent_runtime
else:
return None
# Find all timestamp folders
timestamp_dirs = [d for d in runtime_dir.iterdir() if d.is_dir()]
if not timestamp_dirs:
return None
# Sort by folder name (timestamp) and take the latest
latest_dir = sorted(timestamp_dirs)[-1]
display_file = latest_dir / "display.json"
if display_file.exists():
return str(display_file)
return None
def main():
    parser = argparse.ArgumentParser(
        description="Display operation records in display.json file in chronological order")
parser.add_argument("--file", help="Path to display.json file")
parser.add_argument("--dir", help="Path to directory containing display.json files (recursive)")
parser.add_argument("--output",
choices=["text", "json"],
default="text",
help="Output format (default: text)")
parser.add_argument(
"--filter",
help="Modules to filter, separated by commas (e.g., manager,worker)")
args = parser.parse_args()
if args.file and args.dir:
print("Error: --file and --dir cannot be used together")
sys.exit(1)
def process_one_file(file_path: str):
# Load data
data = load_display_json(file_path)
# Flatten and sort operations
operations = flatten_operations(data)
# Handle module filtering
filter_modules = None
if args.filter:
filter_modules = [module.strip() for module in args.filter.split(",")]
# Generate output content
output_content = ""
if args.output == "json":
# Filter operations if modules are specified
if filter_modules:
filtered_ops = [op for op in operations if op["module"] in filter_modules]
else:
filtered_ops = operations
output_content = json.dumps(filtered_ops, indent=2, ensure_ascii=False)
else:
# Generate text format output
output_lines = []
            # Filter first so the displayed index has no gaps
            display_ops = [op for op in operations
                           if not filter_modules or op["module"] in filter_modules]
            for i, op in enumerate(display_ops):
module = op["module"]
operation = op.get("operation", "unknown")
timestamp = format_timestamp(op.get("timestamp", 0))
# Output basic information
output_lines.append(f"{i+1:3d} | {timestamp} | {module:10} | {operation}")
# Output detailed information
if "duration" in op:
output_lines.append(f" └─ Duration: {format_duration(op['duration'])}")
if "tokens" in op:
output_lines.append(f" └─ Tokens: {format_tokens(op['tokens'])}")
if "cost" in op:
output_lines.append(f" └─ Cost: {op['cost']}")
if "content" in op:
content = op["content"]
output_lines.append(f" └─ Content: {content}")
if "status" in op:
output_lines.append(f" └─ Status: {op['status']}")
output_lines.append("")
output_content = "\n".join(output_lines)
# Write output to file
input_path = Path(file_path)
output_filename = f"display_viewer_output_{args.output}.txt"
output_path = input_path.parent / output_filename
try:
with open(output_path, 'w', encoding='utf-8') as f:
f.write(output_content)
print(f"Output written to: {output_path}")
except Exception as e:
print(f"Error writing output file: {e}")
sys.exit(1)
if args.dir:
for root, dirs, files in os.walk(args.dir):
for file in files:
if file == "display.json":
file_path = os.path.join(root, file)
print(f"Processing: {file_path}")
process_one_file(file_path)
return
file_path = args.file
if not file_path:
file_path = find_latest_display_json()
if not file_path:
print(
"Error: Cannot find display.json file, please specify file path using --file parameter"
)
sys.exit(1)
print(f"Using the latest display.json file: {file_path}")
process_one_file(file_path)
if __name__ == "__main__":
"""
python display_viewer.py --file
python display_viewer.py --dir
"""
main()
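
The loader and flattener can also be used programmatically; a minimal sketch, assuming this file is importable as `display_viewer` (module name hypothetical):

```python
from display_viewer import find_latest_display_json, load_display_json, flatten_operations

path = find_latest_display_json()  # newest runtime/<timestamp>/display.json, if any
if path:
    ops = flatten_operations(load_display_json(path))
    print(f"{len(ops)} operations across {len({op['module'] for op in ops})} modules")
```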

@@ -0,0 +1,53 @@
import numpy as np
from ..utils.common_utils import (
load_embeddings,
save_embeddings,
)
import os
# List all embeddings' keys and their shapes
def list_embeddings(embeddings_path: str):
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return {}
embeddings = load_embeddings(embeddings_path)
info = {}
for k, v in embeddings.items():
arr = np.array(v)
info[k] = {'shape': arr.shape, 'preview': arr.flatten()[:5].tolist()}
return info
# Delete a specific embedding by key
def delete_embedding(embeddings_path: str, key: str) -> bool:
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return False
embeddings = load_embeddings(embeddings_path)
if key not in embeddings:
print(f"[EmbeddingManager] Key not found: {key}")
return False
del embeddings[key]
save_embeddings(embeddings_path, embeddings)
print(f"[EmbeddingManager] Deleted embedding for key: {key}")
return True
def delete_empty_shape_embeddings(embeddings_path: str) -> int:
"""Delete all embeddings whose value is empty (shape==0), shape==(), or content is error string, and return the number deleted."""
if not os.path.exists(embeddings_path):
print(f"[EmbeddingManager] File not found: {embeddings_path}")
return 0
embeddings = load_embeddings(embeddings_path)
to_delete = []
for k, v in embeddings.items():
arr = np.array(v)
# Delete shape==0 or shape==() or content is string/error information
if arr.size == 0 or arr.shape == () or (
isinstance(v, list) and v and isinstance(v[0], str) and v[0].startswith('Error:')
) or (isinstance(v, str) and v.startswith('Error:')):
to_delete.append(k)
for k in to_delete:
del embeddings[k]
print(f"[EmbeddingManager] Deleted empty or error embedding for key: {k}")
if to_delete:
save_embeddings(embeddings_path, embeddings)
return len(to_delete)
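
A hedged maintenance sketch using these helpers (the pickle path is illustrative):

```python
path = "kb/embeddings.pkl"  # illustrative path to an embeddings pickle
for key, meta in list_embeddings(path).items():
    print(key, meta["shape"])
removed = delete_empty_shape_embeddings(path)
print(f"Removed {removed} empty or error entries")
```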

@@ -0,0 +1,170 @@
# file_utils.py
import json
import os
import logging
from pathlib import Path
from contextlib import contextmanager
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# ========= File Lock Tools =========
@contextmanager
def locked(path: Path, mode: str):
"""File lock context manager for cross-platform compatibility"""
if os.name == "nt":
# Windows implementation
import msvcrt
import time as _t
# Always use UTF-8 encoding for text files on Windows
if 'b' in mode:
f = open(path, mode)
else:
f = open(path, mode, encoding="utf-8")
try:
while True:
try:
msvcrt.locking(f.fileno(), msvcrt.LK_NBLCK, 1)
break
except OSError:
_t.sleep(0.01)
yield f
finally:
f.seek(0)
msvcrt.locking(f.fileno(), msvcrt.LK_UNLCK, 1)
f.close()
else:
# Unix-like systems implementation
import fcntl
# Always use UTF-8 encoding for text files on Unix-like systems
if 'b' in mode:
f = open(path, mode)
else:
f = open(path, mode, encoding="utf-8")
try:
fcntl.flock(f.fileno(), fcntl.LOCK_EX)
yield f
finally:
fcntl.flock(f.fileno(), fcntl.LOCK_UN)
f.close()
# ========= Safe JSON Operations =========
def safe_json_dump(data: Any, file_handle, **kwargs) -> None:
"""Safely dump JSON data with proper encoding handling"""
kwargs.setdefault('ensure_ascii', False)
kwargs.setdefault('indent', 2)
try:
json.dump(data, file_handle, **kwargs)
except UnicodeEncodeError as e:
logger.warning(f"UnicodeEncodeError during JSON dump: {e}. Falling back to ASCII mode.")
kwargs['ensure_ascii'] = True
json.dump(data, file_handle, **kwargs)
def safe_json_load(file_handle) -> Any:
"""Safely load JSON data with proper encoding handling"""
try:
return json.load(file_handle)
except UnicodeDecodeError as e:
logger.warning(f"UnicodeDecodeError during JSON load: {e}. Attempting recovery.")
file_handle.seek(0)
content = file_handle.read()
# Try common encodings
for encoding in ['utf-8-sig', 'latin1', 'cp1252']:
try:
if isinstance(content, bytes):
decoded_content = content.decode(encoding)
else:
decoded_content = content
return json.loads(decoded_content)
except (UnicodeDecodeError, json.JSONDecodeError):
continue
logger.error("Failed to decode JSON with all attempted encodings. Returning empty data.")
return {}
def safe_write_json(path: Path, data: Any) -> None:
"""Safely write JSON data to file with atomic operation"""
tmp = path.with_suffix(".tmp")
try:
with locked(tmp, "w") as f:
safe_json_dump(data, f)
f.flush()
os.fsync(f.fileno())
tmp.replace(path)
except Exception as e:
logger.error(f"Failed to write JSON to {path}: {e}")
if tmp.exists():
try:
tmp.unlink()
except Exception:
pass
raise
def safe_read_json(path: Path, default: Any = None) -> Any:
"""Safely read JSON data from file"""
try:
with locked(path, "r") as f:
return safe_json_load(f)
except Exception as e:
logger.warning(f"Failed to read JSON from {path}: {e}")
return default if default is not None else []
# ========= Safe Text File Operations =========
def safe_write_text(path: Path, content: str) -> None:
"""Safely write text to file with UTF-8 encoding"""
try:
path.write_text(content, encoding='utf-8')
except UnicodeEncodeError as e:
logger.warning(f"UnicodeEncodeError writing to {path}: {e}. Using error handling.")
path.write_text(content, encoding='utf-8', errors='replace')
def safe_read_text(path: Path) -> str:
"""Safely read text from file with proper encoding handling"""
try:
return path.read_text(encoding='utf-8')
except UnicodeDecodeError as e:
logger.warning(f"UnicodeDecodeError reading {path}: {e}. Trying alternative encodings.")
for encoding in ['utf-8-sig', 'latin1', 'cp1252', 'gbk']:
try:
return path.read_text(encoding=encoding)
except UnicodeDecodeError:
continue
logger.error(f"Failed to decode {path} with all encodings. Using error replacement.")
return path.read_text(encoding='utf-8', errors='replace')
# ========= File Management Utilities =========
def ensure_directory(path: Path) -> None:
"""Ensure directory exists, create if necessary"""
path.mkdir(parents=True, exist_ok=True)
def safe_file_operation(operation_name: str, file_path: Path, operation_func, *args, **kwargs):
"""Generic safe file operation wrapper with error handling"""
try:
return operation_func(*args, **kwargs)
except FileNotFoundError:
logger.error(f"{operation_name}: File not found: {file_path}")
raise
except PermissionError:
logger.error(f"{operation_name}: Permission denied: {file_path}")
raise
except Exception as e:
logger.error(f"{operation_name}: Unexpected error with {file_path}: {e}")
raise
def backup_file(file_path: Path, backup_suffix: str = ".backup") -> Path:
"""Create a backup of a file"""
backup_path = file_path.with_suffix(file_path.suffix + backup_suffix)
try:
if file_path.exists():
import shutil
shutil.copy2(file_path, backup_path)
logger.info(f"Backup created: {backup_path}")
return backup_path
except Exception as e:
logger.error(f"Failed to create backup of {file_path}: {e}")
raise
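
`safe_file_operation` is a thin wrapper that logs and re-raises; a usage sketch (paths illustrative, import path per the README):

```python
from pathlib import Path
from gui_agents.utils.file_utils import safe_file_operation, safe_read_json

cfg = Path("config.json")  # illustrative path
data = safe_file_operation("read config", cfg, safe_read_json, cfg, default={})
```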

@@ -0,0 +1,69 @@
# id_utils.py
import uuid
import time
import hashlib
from typing import Optional
# Module-level counter for sequential IDs (initialized lazily on first call)
_sequential_counter: Optional[int] = None
def generate_uuid() -> str:
"""Generate a random UUID string"""
return str(uuid.uuid4())
def generate_short_id(prefix: str = "", length: int = 8) -> str:
"""Generate a short random ID with optional prefix"""
# Generate UUID and take first N characters
short_uuid = str(uuid.uuid4()).replace("-", "")[:length]
return f"{prefix}{short_uuid}" if prefix else short_uuid
def generate_timestamp_id(prefix: str = "") -> str:
"""Generate ID based on current timestamp"""
timestamp = int(time.time() * 1000) # milliseconds
return f"{prefix}{timestamp}" if prefix else str(timestamp)
def generate_hash_id(content: str, prefix: str = "", length: int = 8) -> str:
"""Generate ID based on content hash"""
hash_obj = hashlib.md5(content.encode('utf-8'))
hash_hex = hash_obj.hexdigest()[:length]
return f"{prefix}{hash_hex}" if prefix else hash_hex
def generate_sequential_id(prefix: str = "", start: int = 1) -> str:
    """Generate a sequential ID (not thread-safe, use with caution).
    The counter is initialized from `start` on the first call and then
    increments monotonically; `start` is ignored on later calls."""
    global _sequential_counter
    if _sequential_counter is None:
        _sequential_counter = start
    current_id = _sequential_counter
    _sequential_counter += 1
    return f"{prefix}{current_id}" if prefix else str(current_id)
def generate_composite_id(prefix: str = "", include_timestamp: bool = True,
include_uuid: bool = True, separator: str = "_") -> str:
"""Generate composite ID with multiple components"""
parts = []
if prefix:
parts.append(prefix)
if include_timestamp:
parts.append(str(int(time.time() * 1000)))
if include_uuid:
parts.append(str(uuid.uuid4())[:8])
return separator.join(parts)
def validate_id_format(id_string: str, expected_prefix: Optional[str] = None,
min_length: int = 1, max_length: int = 100) -> bool:
"""Validate ID format and constraints"""
if not id_string or not isinstance(id_string, str):
return False
if len(id_string) < min_length or len(id_string) > max_length:
return False
if expected_prefix and not id_string.startswith(expected_prefix):
return False
return True
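
Hash IDs are deterministic for identical content, which makes them useful for deduplication; a quick illustration (import path per the README):

```python
from gui_agents.utils.id_utils import generate_hash_id

a = generate_hash_id("some content", "hash", 8)
b = generate_hash_id("some content", "hash", 8)
assert a == b  # same content always yields the same ID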

@@ -0,0 +1,27 @@
from PIL import Image
def pad_to_square(image: Image.Image,
fill_color=(0, 0, 0),
padding: int = 0) -> Image.Image:
"""
First make it a square, then expand the padding pixels around it.
"""
width, height = image.size
if width == height:
square_img = image.copy()
else:
new_size = max(width, height)
square_img = Image.new(image.mode, (new_size, new_size), fill_color)
left = (new_size - width) // 2
top = (new_size - height) // 2
square_img.paste(image, (left, top))
if padding > 0:
final_size = square_img.size[0] + 2 * padding
padded_img = Image.new(square_img.mode, (final_size, final_size),
fill_color)
padded_img.paste(square_img, (padding, padding))
return padded_img
else:
return square_img
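
A quick sketch of the padding math, assuming `pad_to_square` is imported from this module (dimensions illustrative):

```python
from PIL import Image

img = Image.new("RGB", (1920, 1080), (255, 255, 255))
square = pad_to_square(img, fill_color=(0, 0, 0), padding=8)
print(square.size)  # (1936, 1936): max(1920, 1080) + 2 * 8
```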