add_os_symphony (#399)
This commit is contained in:
0
mm_agents/os_symphony/agents/__init__.py
Executable file
0
mm_agents/os_symphony/agents/__init__.py
Executable file
350
mm_agents/os_symphony/agents/coder_agent.py
Executable file
350
mm_agents/os_symphony/agents/coder_agent.py
Executable file
@@ -0,0 +1,350 @@
|
||||
import logging
|
||||
from typing import Dict, List, Tuple, Optional
|
||||
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, parse_code_from_string
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
|
||||
logger = logging.getLogger("desktopenv.coder_agent")
|
||||
|
||||
|
||||
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Pull a fenced code block out of *action*.

    Returns a ``(code_type, code)`` pair. ``code_type`` is ``"python"`` or
    ``"bash"`` when the fence is labelled, ``None`` for an unlabelled fence
    or when no fence exists. ``code`` is the fenced content with surrounding
    whitespace stripped, or ``None`` when *action* contains no fence at all.
    """
    code_type: Optional[str] = None
    code: Optional[str] = None

    # Labelled fences take priority over a bare ``` fence.
    if "```python" in action:
        code_type = "python"
        code = action.split("```python")[1].split("```")[0].strip()
    elif "```bash" in action:
        code_type = "bash"
        code = action.split("```bash")[1].split("```")[0].strip()
    elif "```" in action:
        # Unlabelled fence: keep the content but leave the type unknown.
        code = action.split("```")[1].split("```")[0].strip()

    logger.debug(
        f"Extracted code block: type={code_type}, length={len(code) if code else 0}"
    )
    return code_type, code
|
||||
|
||||
|
||||
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Run *code* through *env_controller* and return the raw result dict.

    Dispatches on *code_type*: ``"bash"`` runs via ``run_bash_script`` (30 s
    timeout), ``"python"`` via ``run_python_script``. Any other type, or an
    exception raised by the controller, produces an error-status dict.
    """
    # Log the full code being executed (untruncated)
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")

    try:
        if code_type == "bash":
            return env_controller.run_bash_script(code, timeout=30)
        if code_type == "python":
            return env_controller.run_python_script(code)
        return {"status": "error", "error": f"Unknown code type: {code_type}"}
    except Exception as e:
        # Surface controller failures as a normal error result so the
        # calling loop keeps running.
        logger.error(f"Error executing {code_type} code: {e}")
        return {"status": "error", "error": str(e)}
|
||||
|
||||
|
||||
def format_result(result: Dict, step_count: int) -> str:
    """Render an execution-result dict as a human-readable context string.

    *step_count* is the zero-based loop counter; the rendered text is
    numbered from one. An empty/None *result* yields an error message.
    """
    if not result:
        logger.warning(f"Step {step_count + 1}: No result returned from execution")
        return f"""
Step {step_count + 1} Error:
Error: No result returned from execution
"""

    status = result.get("status", "unknown")
    return_code = result.get("returncode", result.get("return_code", -1))

    # Bash responses carry "returncode" with stdout/stderr merged into
    # "output"; python responses keep stdout in "output" and stderr in
    # "error".  Both shapes read the same two keys, so one lookup suffices.
    output = result.get("output", "")
    error = result.get("error", "")

    logger.debug(f"Step {step_count + 1}: Status={status}, Return Code={return_code}")

    # Assemble the sections, omitting empty output/error blocks.
    sections = [
        f"Step {step_count + 1} Result:\n",
        f"Status: {status}\n",
        f"Return Code: {return_code}\n",
    ]
    if output:
        sections.append(f"Output:\n{output}\n")
    if error:
        sections.append(f"Error:\n{error}\n")
    return "".join(sections)
|
||||
|
||||
|
||||
class CoderAgent:
    """A dedicated agent for executing code with a budget of steps.

    Wraps an ``LMMAgent`` in a generate→execute→observe loop: the LLM
    proposes a fenced code block, the block is run through the environment
    controller, and the (formatted) result is fed back as the next user
    message, until the agent emits DONE/FAIL/INFEASIBLE or the step budget
    runs out.
    """

    def __init__(self, engine_params: Dict, client_password: str, platform: str = "linux"):
        """Initialize the CodeAgent.

        Args:
            engine_params: LLM engine configuration; also carries the
                optional ``budget`` (default 20) and ``temperature``
                (default 0.1) knobs for this agent.
            client_password: forwarded into the coder system prompt
                (presumably for commands that need privileges — TODO confirm).
            platform: target OS name baked into the system prompt.

        Raises:
            ValueError: if ``engine_params`` is None or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")

        self.engine_params = engine_params
        # Maximum number of generate/execute iterations per task.
        self.budget = engine_params.get("budget", 20)
        self.temperature = engine_params.get("temperature", 0.1)
        self.agent = None
        self.platform = platform
        self.client_password = client_password

        logger.info(f"CodeAgent initialized with budget={self.budget} and platform={self.platform}")
        self.reset()

    def reset(self):
        """Reset the code agent state: rebuild the LMMAgent with a fresh
        coder system prompt (message history is discarded)."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.construct_coder_procedural_memory(platform=self.platform, client_password=self.client_password)
        )

    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Args:
            task_instruction: natural-language task to solve with code.
            screenshot: initial screen capture passed to the LLM as context.
            env_controller: object exposing ``run_bash_script`` /
                ``run_python_script`` used to run generated code.

        Returns:
            Dict with ``task_instruction``, ``completion_reason`` (DONE /
            FAIL / INFEASIBLE / BUDGET_EXHAUSTED_AFTER_N_STEPS), ``summary``,
            ``execution_history``, ``execution_result_history``,
            ``steps_executed`` and ``budget``.

        Raises:
            ValueError: if ``env_controller`` is None.
            RuntimeError: if the LLM returns an empty response mid-loop.
        """
        if env_controller is None:
            raise ValueError("env_controller is required for code execution")

        print(f"\n🚀 STARTING CODE EXECUTION")
        print("=" * 60)
        print(f"Task: {task_instruction}")
        print(f"Budget: {self.budget} steps")
        print("=" * 60)

        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")

        # Start each task from a clean message history.
        self.reset()

        # Add initial task instruction and screenshot context as user message
        context = (
            f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        )
        self.agent.add_message(context, image_content=screenshot, role="user")

        step_count = 0
        execution_history = []
        execution_result_history = []
        while step_count < self.budget:
            logger.info(f"Step {step_count + 1}/{self.budget}")

            # Get assistant response (thoughts and code)
            response = call_llm_safe(self.agent, temperature=self.temperature)

            # Print to terminal for immediate visibility
            # print(f"\n🤖 CODING AGENT RESPONSE - Step {step_count + 1}/{self.budget}")
            # print("=" * 60)
            # print(response)
            # print("=" * 60)

            # Log the latest message from the coding agent (untruncated)
            logger.info(
                f"CODING_AGENT_LATEST_MESSAGE - Step {step_count + 1}:\n{response}"
            )

            # Check if response is None or empty
            if not response or response.strip() == "":
                error_msg = f"Step {step_count + 1}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)

            # Parse the response to extract action
            action = parse_code_from_string(response)
            thoughts = response

            execution_history.append(
                {"step": step_count + 1, "action": action, "thoughts": thoughts}
            )

            # Check for completion signals (terminal keywords emitted by the
            # LLM instead of a code block).
            action_upper = action.upper().strip()
            if action_upper == "DONE":
                print(f"\n✅ TASK COMPLETED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task completion")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task completed successfully")
                completion_reason = "DONE"
                break
            elif action_upper == "FAIL":
                print(f"\n❌ TASK FAILED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task failure")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task failed by agent request")
                completion_reason = "FAIL"
                break
            elif action_upper == 'INFEASIBLE':
                print(f"\n❌ TASK INFEASIBLE - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task infeasible")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task infeasible by agent request")
                completion_reason = "INFEASIBLE"
                break

            # Extract and execute code. Only the text after the last
            # "(Answer)" marker is scanned for a fence.
            code_type, code = extract_code_block(response.split("(Answer)")[-1])

            if code:
                result = execute_code(code_type, code, env_controller)
                execution_result_history.append(
                    {"step": step_count + 1, "result": result}
                )
                # Prepare formatted output and error for logging
                output = result.get("output", "")
                error = result.get("error", "")
                message = result.get("message", "")
                status = result.get("status", "")

                # Print execution result to terminal for immediate visibility
                print(f"\n⚡ CODE EXECUTION RESULT - Step {step_count + 1}")
                print("-" * 50)
                print(f"Status: {status}")
                if output:
                    print(f"Output:\n{output}")
                if error:
                    print(f"Error:\n{error}")
                if message and not output and not error:
                    print(f"Message:\n{message}")
                print("-" * 50)

                log_lines = [
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:",
                    f"Status: {status}" if status else None,
                ]

                if output:
                    log_lines.append(
                        "Output:\n" + ("-" * 40) + f"\n{output}\n" + ("-" * 40)
                    )
                if error:
                    log_lines.append(
                        "Error:\n" + ("!" * 40) + f"\n{error}\n" + ("!" * 40)
                    )
                if message and not output and not error:
                    log_lines.append(
                        "Message:\n" + ("-" * 40) + f"\n{message}\n" + ("-" * 40)
                    )

                # Remove None entries and join
                formatted_log = "\n".join([line for line in log_lines if line])
                logger.info(formatted_log)
            else:
                print(f"\n⚠️ NO CODE BLOCK FOUND - Step {step_count + 1}")
                print("-" * 50)
                print("Action did not contain executable code")
                print("-" * 50)

                logger.warning(f"Step {step_count + 1}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
                logger.info(
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\n"
                    f"Status: skipped\n"
                    f"Message:\n{'-' * 40}\n{result['message']}\n{'-' * 40}"
                )
            # Add assistant's thoughts and code to message history
            self.agent.add_message(response, role="assistant")

            # Process result and add formatted environment results as user message
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")

            step_count += 1

        # Handle budget exhaustion
        # NOTE(review): the locals() check is a sentinel for "no break
        # happened above" — fragile but functional; completion_reason is
        # only bound inside the break branches.
        if "completion_reason" not in locals():
            print(f"\n⏰ BUDGET EXHAUSTED - {step_count} steps completed")
            print("=" * 60)
            print(f"Maximum budget of {self.budget} steps reached")
            print("=" * 60)
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"

        # Generate final summary
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)

        result = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "execution_result_history": execution_result_history,
            "steps_executed": step_count,
            "budget": self.budget
        }

        logger.info(f"Code execution completed: steps={step_count}")
        return result

    def _generate_summary(
        self, execution_history: List[Dict], task_instruction: str
    ) -> str:
        """Generate an LLM-written summary of the code execution session.

        Builds a textual transcript of every step (thoughts + code) and asks
        a fresh summary agent to describe it. Falls back to an error string
        when the LLM returns nothing or raises.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."

        logger.info(f"Generated summary for {len(execution_history)} steps")

        # Build detailed execution context for summary agent
        execution_context = f"Task: {task_instruction}\n\nExecution Steps:\n"

        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")

            execution_context += f"\nStep {step_num}:\n"
            if thoughts:
                execution_context += f"Thoughts: {thoughts}\n"
            execution_context += f"Code: {action}\n"

        # Create summary prompt with same context as coding agent
        summary_prompt = f"""
{execution_context}

Please provide a concise summary of the code execution session. Focus on:

1. The code logic implemented at each step
2. The outputs and results produced by each code execution
3. The progression of the solution approach

Do not make judgments about success or failure. Simply describe what was attempted and what resulted.

Keep the summary under 150 words and use clear, factual language.
"""

        # Generate summary using LLM with dedicated summary system prompt
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=self.temperature)

            if not summary or summary.strip() == "":
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")

        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")

        return summary
|
||||
109
mm_agents/os_symphony/agents/grounder_agent.py
Executable file
109
mm_agents/os_symphony/agents/grounder_agent.py
Executable file
@@ -0,0 +1,109 @@
|
||||
import re
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import pytesseract
|
||||
from PIL import Image
|
||||
import io
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, smart_resize
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
import logging
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
class GrounderAgent:
    """
    Class designed for interacting with GUI, serving for Grounding Agent and VLMSearcher.

    Sends a referring expression plus a screenshot to a grounding model and
    parses an (x, y) screen coordinate out of the model's reply.
    """
    def __init__(self, engine_params: Dict, screen_width: int, screen_height: int):
        """
        Args:
            engine_params: grounding-model engine config; must contain
                "model", "grounding_width", "grounding_height" and
                "grounding_smart_resize".
            screen_width: actual screen width (coordinates are mapped to this).
            screen_height: actual screen height.
        """
        self.engine_params_for_grounder = engine_params  # grounder_params
        system_prompt, self.user_message = PROCEDURAL_MEMORY.construct_grounder_procedural_memory(model_name=engine_params["model"])
        self.grounding_model = LMMAgent(engine_params, system_prompt=system_prompt)
        # Width and height for Grounding Agent!
        self.width = engine_params['grounding_width']
        self.height = engine_params['grounding_height']
        print(f"[Grounder]: initialized width is {self.width}, height is {self.height}")
        # Width and height for actual screen!
        self.screen_width = screen_width
        self.screen_height = screen_height

    # Given the state and worker's referring expression, use the grounding model to generate (x,y)
    def generate_coords(self, ref_expr: str, obs: Dict, detail=False, expansion_pixels=400, **kwargs) -> List:
        """Ground *ref_expr* on the screenshot in *obs* and return coordinates.

        Returns ``[x, y]`` in screen space, or — when ``detail`` is True —
        ``[screenshot, offset_x, offset_y]``.

        NOTE(review): global_offset_x/y are never updated in this method, so
        the offsets are always 0 and the ``detail`` branch returns the
        untouched screenshot — looks like a remnant of a cropping/zooming
        flow (cf. unused ``expansion_pixels``); confirm before relying on it.
        """
        cur_screenshot = obs["screenshot"]

        # store global offset
        global_offset_x = 0
        global_offset_y = 0

        # final coordinates for output
        final_global_x = 0
        final_global_y = 0

        cur_width, cur_height = self.screen_width, self.screen_height

        print(f"[Grounder] start to ground!")
        # Fresh conversation for every grounding call.
        self.grounding_model.reset()

        # Configure the context
        prompt = self.user_message.replace("REF_EXPR", ref_expr)

        # consistent with the system prompt presented in the paper of GTA-1
        if 'gta' in self.engine_params_for_grounder['model']:
            self.grounding_model.add_system_prompt("You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.")

        self.grounding_model.add_message(
            text_content=prompt, image_content=cur_screenshot, put_text_last=True, role="user"
        )

        # Generate and parse coordinates
        response = call_llm_safe(self.grounding_model, temperature=0.05, **kwargs)
        print(f"[Grounder] prompt: {prompt}\nmodel: {self.engine_params_for_grounder['model']}, \nresponse: {response}")


        # 1. highest priority: (x1="...", y1="...", x="...", y="...")
        numericals = re.findall(r'(?:x1|y1|x|y)=["\']?(\d+)["\']?', response)
        # 2. second highest priority: just like <points>653 42</points> or [653, 42]
        if len(numericals) < 2:
            # Strip axis labels (x1, y2, ...) so their digits don't pollute
            # the fallback number scan.
            clean_response = re.sub(r'[xXyY]\d', '', response)
            numericals = re.findall(r'\d+', clean_response)
        assert len(numericals) >= 2

        print(f"[Grounder] the parsed coordinates: {numericals}")

        # Map model-space coordinates into actual-screen space.
        local_x, local_y = self._resize_coordinates([int(numericals[0]), int(numericals[1])], width=cur_width, height=cur_height)

        # current global coordinates = local coordinates + global offset
        final_global_x = local_x + global_offset_x
        final_global_y = local_y + global_offset_y

        if detail:
            return [cur_screenshot, global_offset_x, global_offset_y]
        else:
            return [final_global_x, final_global_y]

    def dynamic_set_width_height(self, width: int, height: int):
        # Override the grounding-model resolution at runtime.
        self.width = width
        self.height = height

    # Resize from grounding model dim into OSWorld dim (1920 * 1080)
    def _resize_coordinates(self, coordinates: List[int], width: int, height: int) -> List[int]:
        """
        width, height: for current observation
        grounding_width, grounding_height: width and height for Grounding model (1000x1000 or 1280x800)

        When "grounding_smart_resize" is set, the model is assumed to see a
        smart-resized image, so coordinates are rescaled from that size
        instead of the fixed grounding resolution.
        """
        grounding_width = self.engine_params_for_grounder["grounding_width"]
        grounding_height = self.engine_params_for_grounder["grounding_height"]
        grounding_smart_resize = self.engine_params_for_grounder["grounding_smart_resize"]


        if not grounding_smart_resize:
            return [
                round(coordinates[0] * width / grounding_width),
                round(coordinates[1] * height / grounding_height),
            ]
        else:
            # NOTE: smart_resize takes (height, width) and returns
            # (height, width) — argument order matters here.
            smart_height, smart_width = smart_resize(height, width)
            return [
                round(coordinates[0] * width / smart_width),
                round(coordinates[1] * height / smart_height)
            ]
|
||||
428
mm_agents/os_symphony/agents/memoryer_agent.py
Executable file
428
mm_agents/os_symphony/agents/memoryer_agent.py
Executable file
@@ -0,0 +1,428 @@
|
||||
from ast import parse
|
||||
import logging
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
from mm_agents.os_symphony.utils.common_utils import (
|
||||
call_llm_formatted,
|
||||
enhance_observation,
|
||||
parse_code_from_string
|
||||
)
|
||||
from functools import partial
|
||||
from mm_agents.os_symphony.utils.formatters import JSON_ANSWER_FORMATTER
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
import imagehash
|
||||
import io
|
||||
import os
|
||||
from PIL import Image
|
||||
import numpy as np
|
||||
from skimage.metrics import structural_similarity as ssim
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class StepBehavior:
    """
    Narrative Step Behavior.

    Description of each step: the generative (main) agent's output, the
    screenshot (kept when the step is a milestone), and a textual summary of
    how the agent thought and acted, and how the state changed. Also caches
    a perceptual hash and SSIM scores against earlier steps so loop
    detection does not recompute them.
    """
    def __init__(self, is_milestone: bool, gen_output: str, summary: str, obs: Dict, action_dict: Dict):
        # Whether this step's screenshot should be retained as a milestone.
        self.is_milestone = is_milestone
        # Raw output of the main agent for this step.
        self.gen_output = gen_output
        # Observation dict; "screenshot" is expected to hold PNG/JPEG bytes
        # (it is fed to PIL via io.BytesIO below).
        self.obs = obs
        # Textual summary of the step.
        self.summary = summary
        # Parsed action extracted from the generator output.
        self.action_dict = action_dict
        # Cached values for optimizing the time complexity of loop detection
        # --- 1. pHash ---
        self.phash = None
        # --- 2. SSIM ---
        self.ssim_list = []

    def _update_phash_ssim(self, history: List):
        """Compute this step's pHash and its SSIM against every prior step.

        Args:
            history: list of earlier StepBehavior objects to compare with.
        """
        # Calculate the ssim_list of current obs
        # Update pHash
        cur_img = Image.open(io.BytesIO(self.obs["screenshot"]))
        cur_img_gray = cur_img.convert('L')
        cur_img_np = np.array(cur_img_gray)
        self.phash = imagehash.phash(cur_img)
        # Update ssim_list
        for hs in history:
            compare_img = Image.open(io.BytesIO(hs.obs["screenshot"]))
            compare_img_gray = compare_img.convert('L')
            compare_img_np = np.array(compare_img_gray)
            # NOTE(review): data_range mixes the two images' extrema
            # (cur.max() - compare.min()); skimage expects the value range of
            # the data itself (e.g. 255 for uint8 grayscale) — confirm this
            # is intended and not a bug.
            self.ssim_list.append(ssim(cur_img_np, compare_img_np, data_range=cur_img_np.max() - compare_img_np.min()))
|
||||
|
||||
class ReflectionMemoryAgent:
|
||||
"""
|
||||
Reflection Memory Agent (RMA).
|
||||
Responsible for maintaining long-term memory, extracting narratives from trajectories,
|
||||
providing reflections to the Main Agent, and validating task completion status.
|
||||
"""
|
||||
    def __init__(self, engine_params: Dict):
        """
        Initialize the RMA.

        Args:
            engine_params: LLM engine configuration, e.g.::

                {
                    "engine_type": args.provider,
                    "model": args.model,
                    "base_url": args.model_url,
                    "api_key": args.model_api_key,
                    "temperature": getattr(args, "model_temperature", None),
                }

            Also read from it: ``max_images`` (default 8) — max screenshots
            kept in the reflection context — and ``memoryer_level``
            (default 3) — 0 disables reflection entirely.
        """

        self.engine_params = engine_params

        # Cap on how many screenshots are attached during reflection.
        self.max_images = engine_params.get('max_images', 8)

        self.memoryer_level = engine_params.get('memoryer_level', 3)

        self.reset()

        logger.info(f"ReflectionMemoryAgent initialized with:\n {self.engine_params}")
|
||||
|
||||
|
||||
    def reset(self):
        """Reset the RMA state: clear instruction, trajectory, knowledge base
        and image bookkeeping, and rebuild both sub-agents."""
        logger.debug("Resetting RMA state")

        self.instruction = None

        # Per-step StepBehavior records for the current task.
        self.trajectory: List[StepBehavior] = []

        # Accumulated textual knowledge injected into reflection prompts.
        self.knowledge_base: List[str] = []

        # Index of the last trajectory step handled by the Code Agent
        # (-1 means none yet).
        self.last_code_step_idx = -1

        '''
        Control the count of images, we only use the maximum number of max_img_len images.
        The update logic: the 0-th screenshot is always retained. If the total number of screenshots is less than max_img_len, all are kept; otherwise, starting from index 1, milestone screenshots are managed via FIFO.
        '''
        self.active_img_idx = []

        self.reflection_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT,
        )
        self.behavior_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT
        )
|
||||
|
||||
    def add_instruction(self, instruction):
        """
        [Interface] Main -> RMA
        Main agent sets the task instruction on the RMA; it is later embedded
        into the reflection system prompt.
        """
        self.instruction = instruction
|
||||
|
||||
    def _update_trajectory(self, step_behavior):
        """Append a StepBehavior and maintain the active-screenshot index list.

        Policy (see comment block in ``reset``): keep index 0 forever; while
        under ``max_images`` keep every step's screenshot; once at the cap,
        only milestone screenshots are admitted, evicting FIFO from index 1.
        """
        self.trajectory.append(step_behavior)
        if len(self.active_img_idx) >= self.max_images:
            if step_behavior.is_milestone:
                self.active_img_idx.append(len(self.trajectory) - 1)  # over max_img_len, only milestone image
                del self.active_img_idx[1]  # FIFO starts from index 1
        else:
            self.active_img_idx.append(len(self.trajectory) - 1)  # less than max_img_len, feed all images

        assert len(self.active_img_idx) <= self.max_images, "[RMA] Errors in updating StepBehavior!!"
|
||||
|
||||
def _summarize_step_behavior(
|
||||
self,
|
||||
generator_output: str,
|
||||
cur_obs: Dict,
|
||||
enhanced_obs: bytes | None,
|
||||
is_milestone: bool,
|
||||
mode: str = "gui",
|
||||
code_exec_summary: str = "",
|
||||
action_dict: Dict = {}
|
||||
) -> Tuple[StepBehavior, bool]:
|
||||
"""
|
||||
[Interface] Main -> RMA
|
||||
The Main Agent (MA) calls this method to "feed" the information of the just-completed step to the RMA.
|
||||
RMA will internally process and store this step.
|
||||
"""
|
||||
|
||||
if mode == "search":
|
||||
is_success = "successful"
|
||||
# summary is fixed
|
||||
step_behavior = StepBehavior(
|
||||
False,
|
||||
generator_output,
|
||||
"Search Agent was called last step, and a tutorial has been generated.",
|
||||
cur_obs,
|
||||
action_dict
|
||||
)
|
||||
elif mode == "code":
|
||||
self.last_code_step_idx = len(self.trajectory)
|
||||
|
||||
is_success = "successful"
|
||||
# the summary returned by the code agent
|
||||
step_behavior = StepBehavior(
|
||||
False,
|
||||
generator_output,
|
||||
f"Code Agent was called last step, and the summary of its trajectory is: \n---\n{code_exec_summary}\n---",
|
||||
cur_obs,
|
||||
action_dict
|
||||
)
|
||||
else: # common gui operation, use LLM to summarize
|
||||
prev_obs = self.trajectory[-1].obs
|
||||
|
||||
text_content = f"""Computer Use Agent's Output: \n{generator_output}"""
|
||||
|
||||
|
||||
self.behavior_agent.reset() # don't need history messages
|
||||
|
||||
updated_sys_prompt = (
|
||||
self.behavior_agent.system_prompt + "\n" + text_content
|
||||
)
|
||||
self.behavior_agent.add_system_prompt(updated_sys_prompt)
|
||||
|
||||
self.behavior_agent.add_message(
|
||||
text_content="This is the observation before executing action (attached below).",
|
||||
image_content=prev_obs['screenshot'],
|
||||
role="user",
|
||||
put_text_last=False
|
||||
)
|
||||
self.behavior_agent.add_message(
|
||||
text_content="This is the zoom-in view, which may help you to identify the operational region (attached below).",
|
||||
image_content=enhanced_obs,
|
||||
role="user",
|
||||
put_text_last=False
|
||||
)
|
||||
self.behavior_agent.add_message(
|
||||
text_content="This is the observation after executing action (attached below).",
|
||||
image_content=cur_obs['screenshot'],
|
||||
role="user",
|
||||
put_text_last=False
|
||||
)
|
||||
|
||||
required_fields = ["summary", "evaluation"]
|
||||
format_checkers = [
|
||||
partial(JSON_ANSWER_FORMATTER, required_fields)
|
||||
]
|
||||
|
||||
full_response = call_llm_formatted(
|
||||
self.behavior_agent,
|
||||
format_checkers,
|
||||
temperature=self.engine_params.get("temperture", 0.1),
|
||||
)
|
||||
|
||||
response = parse_code_from_string(full_response)
|
||||
|
||||
try:
|
||||
data = json.loads(response)
|
||||
behavior_summary = data['summary']
|
||||
is_success = data["evaluation"]
|
||||
except Exception as e:
|
||||
print("[RMA] Errors in generating step summary: ", e)
|
||||
logger.info("Response is not a JSON object or miss required keys!")
|
||||
behavior_summary = response
|
||||
is_success = "successful"
|
||||
|
||||
|
||||
step_behavior = StepBehavior(is_milestone, generator_output, behavior_summary, cur_obs, action_dict)
|
||||
|
||||
return step_behavior, is_success == "successful"
|
||||
|
||||
def get_reflection(
|
||||
self,
|
||||
cur_obs: Dict,
|
||||
generator_output: str,
|
||||
coordinates: List,
|
||||
mode: str="gui",
|
||||
code_exec_summary: str = "",
|
||||
action_dict: Dict = {}
|
||||
) -> Dict:
|
||||
"""
|
||||
[Interface] RMA -> Main
|
||||
The Main Agent (MA) calls this method to get RMA's reflection before deciding the next action.
|
||||
|
||||
Args:
|
||||
- cur_obs (Dict): The Main Agent's current observation (o_k).
|
||||
- generator_output (str): The thoughts, screen analysis and action of Main Agent.
|
||||
- coordinates (List): coordinates in the last operation step of Main Agent.
|
||||
- mode(str): [gui, code, search]. Indicate which agent that main agent called last step.
|
||||
- code_exec_summary: execution summary for code agent.
|
||||
- action_dict: extracted action from generator output.
|
||||
|
||||
Returns:
|
||||
- reflection_info(Dict): all the info related to reflection
|
||||
"""
|
||||
if self.memoryer_level == 0:
|
||||
return {
|
||||
"reflection": None,
|
||||
"reflection_thoughts": None,
|
||||
"existing_knowledge": None,
|
||||
"is_milestone": False,
|
||||
"new_knowledge": None,
|
||||
"step_summary": None,
|
||||
"hint": {
|
||||
"gui_operation_error": False,
|
||||
"lack_of_tutorial": False,
|
||||
"code_error": False,
|
||||
"loop_detection": None,
|
||||
}
|
||||
}
|
||||
|
||||
reflection = None
|
||||
reflection_thought = None
|
||||
if len(self.trajectory) == 0:
|
||||
step_behavior = StepBehavior(
|
||||
True,
|
||||
"The initial screen is provided. No action has been taken yet.",
|
||||
"The initial screen is provided. No action has been taken yet.",
|
||||
cur_obs,
|
||||
action_dict
|
||||
)
|
||||
step_behavior._update_phash_ssim(self.trajectory)
|
||||
self._update_trajectory(step_behavior)
|
||||
reflection_info = {
|
||||
"reflection": reflection,
|
||||
"reflection_thoughts": reflection_thought,
|
||||
"existing_knowledge": "\n".join(self.knowledge_base),
|
||||
"is_milestone": True,
|
||||
"new_knowledge": "",
|
||||
"step_summary": "",
|
||||
"loop_detection": None
|
||||
}
|
||||
else:
|
||||
### Step Summary
|
||||
prev_obs = self.trajectory[-1].obs
|
||||
enhanced_obs = None
|
||||
if coordinates:
|
||||
enhanced_obs, _, _, _, _ = enhance_observation(
|
||||
prev_obs["screenshot"],
|
||||
coordinates,
|
||||
draw=True
|
||||
)
|
||||
|
||||
# generate step behavior
|
||||
step_behavior, last_gui_check = self._summarize_step_behavior(
|
||||
generator_output,
|
||||
cur_obs,
|
||||
enhanced_obs,
|
||||
False,
|
||||
mode,
|
||||
code_exec_summary,
|
||||
action_dict
|
||||
)
|
||||
step_behavior._update_phash_ssim(self.trajectory)
|
||||
|
||||
### make additional hints
|
||||
additional_hints = []
|
||||
if not last_gui_check:
|
||||
additional_hints.append(f"\t- Warning: The last GUI operation is unsuccessful. Careful review is required to avoid GUI Operation Error.")
|
||||
|
||||
code_error_hint = False
|
||||
|
||||
if self.last_code_step_idx != -1 and len(self.trajectory) - self.last_code_step_idx < 0:
|
||||
code_error_hint = True
|
||||
additional_hints.append(f"\t- Warning: The Computer Use Agent might in the verification stage of Code Agent. Careful review is required to avoid Code Error.")
|
||||
|
||||
# loop detection
|
||||
from mm_agents.os_symphony.utils.loop_detection import detect_loop
|
||||
is_loop, loop_details = detect_loop(full_trajectory=self.trajectory, N=3)
|
||||
if is_loop and loop_details:
|
||||
match_sequence_indices = loop_details["match_sequence_indices"]
|
||||
loop_hint_message = f"\t- Warning: A potential LOOP has been detected between Step {match_sequence_indices[0]} and Step {match_sequence_indices[-1]}. Careful review is required to avoid Repetitive Behavior Error."
|
||||
additional_hints.append(loop_hint_message)
|
||||
|
||||
self.reflection_agent.reset()
|
||||
|
||||
updated_sys_prompt = (
|
||||
PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT + "\n\n" +
|
||||
f"---\n- **user instruction**: {self.instruction}\n" +
|
||||
"- **existing knowledge**: \n" + "\n".join(self.knowledge_base) +
|
||||
"\n- **additional_hints**: " + "\n".join(additional_hints) + "\n---"
|
||||
)
|
||||
|
||||
# update system prompt
|
||||
self.reflection_agent.add_system_prompt(updated_sys_prompt)
|
||||
|
||||
|
||||
for i, step in enumerate(self.trajectory):
|
||||
text_content = f"""### (Step {i}) history:\nsummary: '''\n{step.summary}\n'''"""
|
||||
if i in self.active_img_idx:
|
||||
if i == 0:
|
||||
text_content += f"\ninitial screenshot:"
|
||||
else:
|
||||
text_content += f"\nscreenshot (after executing action): (attached below)"
|
||||
|
||||
self.reflection_agent.add_message(
|
||||
text_content=text_content,
|
||||
image_content=step.obs['screenshot'] if i in self.active_img_idx else None,
|
||||
role="user",
|
||||
put_text_last=False
|
||||
)
|
||||
|
||||
text_content = f"""### (Last Step) CUA's output (has been finished):\n---\n{generator_output}\n---\nStep Summary:\n---\n{step_behavior.summary}\n---\nlatest_screenshot: (attached below)"""
|
||||
self.reflection_agent.add_message(
|
||||
text_content=text_content,
|
||||
image_content=cur_obs['screenshot'],
|
||||
role="user",
|
||||
put_text_last=False
|
||||
)
|
||||
|
||||
required_fields = ["is_milestone", "reflection", "knowledge"]
|
||||
|
||||
format_checkers = [
|
||||
partial(JSON_ANSWER_FORMATTER, required_fields)
|
||||
]
|
||||
|
||||
full_response = call_llm_formatted(
|
||||
self.reflection_agent,
|
||||
format_checkers
|
||||
)
|
||||
|
||||
|
||||
reflection_thought = full_response
|
||||
|
||||
response = parse_code_from_string(full_response)
|
||||
|
||||
try:
|
||||
data = json.loads(response)
|
||||
reflection = data['reflection']
|
||||
is_milestone = data["is_milestone"]
|
||||
knowledge = data['knowledge']
|
||||
except Exception as e:
|
||||
print("[RMA] Errors in dealing with reflection: ", e)
|
||||
logger.info("Response is not a JSON object or miss required keys!")
|
||||
reflection = response
|
||||
is_milestone = False
|
||||
knowledge = ""
|
||||
|
||||
if len(knowledge) > 0:
|
||||
self.knowledge_base.append(knowledge)
|
||||
|
||||
if isinstance(is_milestone, str):
|
||||
is_milestone = True if "true" in is_milestone.lower() else False
|
||||
|
||||
# update trajectory and is_milestone
|
||||
self._update_trajectory(step_behavior)
|
||||
if mode == "gui": # only gui opration can be considered as milestone
|
||||
self.trajectory[-1].is_milestone = is_milestone
|
||||
|
||||
|
||||
reflection_info = {
|
||||
"reflection": reflection,
|
||||
"reflection_thoughts": reflection_thought,
|
||||
"existing_knowledge": "\n".join(self.knowledge_base),
|
||||
"is_milestone": is_milestone,
|
||||
"new_knowledge": knowledge,
|
||||
"step_summary": step_behavior.summary,
|
||||
"hint": {
|
||||
"gui_operation_error": not last_gui_check,
|
||||
"lack_of_tutorial": is_loop,
|
||||
"code_error": code_error_hint,
|
||||
"loop_detection": loop_details,
|
||||
}
|
||||
}
|
||||
|
||||
return reflection_info
|
||||
|
||||
|
||||
178
mm_agents/os_symphony/agents/ocr.py
Executable file
178
mm_agents/os_symphony/agents/ocr.py
Executable file
@@ -0,0 +1,178 @@
|
||||
import re
|
||||
from io import BytesIO
|
||||
from typing import Tuple, List, Dict
|
||||
from PIL import Image, ImageDraw, ImageFont
|
||||
import numpy as np
|
||||
import pytesseract
|
||||
from pytesseract import Output
|
||||
import easyocr
|
||||
|
||||
|
||||
class OCRProcessor:
    """
    OCR Processor supports Tesseract and EasyOCR
    """
    def __init__(self, use_gpu: bool = False, languages: List[str] = ['en']):
        """
        Initialize processor

        Args:
            use_gpu (bool): whether EasyOCR should use the GPU
            languages (List[str]): language list for EasyOCR, e.g. ['en', 'ch_sim'].
        """
        self.use_gpu = use_gpu
        # BUG FIX: the default argument is a shared mutable list; copy it so
        # mutations on one instance can never leak into other instances (or
        # into the default itself).
        self.languages = list(languages)
        self.reader = None  # lazy-load EasyOCR Reader

    def _get_easyocr_reader(self):
        # Build the EasyOCR reader on first use only; model load is expensive.
        if self.reader is None:
            print(f"Loading EasyOCR model (GPU={self.use_gpu})...")
            self.reader = easyocr.Reader(self.languages, gpu=self.use_gpu)
        return self.reader

    def get_ocr_elements(self, bytes_image_data: bytes, mode: str = 'tesseract') -> Tuple[str, List[Dict]]:
        """
        Executes OCR recognition.

        Args:
            bytes_image_data (bytes): raw encoded image bytes (e.g. a PNG screenshot)
            mode (str): 'tesseract' (faster) or 'easyocr' (more precise).

        Returns:
            Tuple[str, List]: (textual table string, list of element details);
            ("", []) when the image cannot be decoded.

        Raises:
            ValueError: if *mode* is not one of the supported backends.
        """
        try:
            image = Image.open(BytesIO(bytes_image_data))
        except Exception as e:
            # Best-effort: a broken screenshot should not crash the agent loop.
            print(f"Error decoding or opening image: {e}")
            return "", []

        if mode == 'tesseract':
            return self._process_tesseract(image)
        elif mode == 'easyocr':
            return self._process_easyocr(image)
        else:
            raise ValueError(f"Unknown mode: {mode}. Use 'tesseract' or 'easyocr'.")

    def _process_tesseract(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """Tesseract processing"""
        data = pytesseract.image_to_data(image, output_type=Output.DICT)

        ocr_elements = []
        ocr_table = "Text Table (Tesseract):\nWord id\tText\n"
        ocr_id = 0

        num_boxes = len(data['text'])
        for i in range(num_boxes):
            # filter text with low confidence
            # BUG FIX: pytesseract reports confidences as strings that may be
            # floats (e.g. '96.33'); int() raised ValueError on those. float()
            # accepts both, and '-1' (non-text block) still fails the > 0 test.
            if float(data['conf'][i]) > 0 and data['text'][i].strip():
                # Strip leading/trailing symbols that are usually OCR noise.
                clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", data['text'][i])
                if not clean_text: continue

                ocr_table += f"{ocr_id}\t{clean_text}\n"

                ocr_elements.append({
                    "id": ocr_id,
                    "text": clean_text,
                    "mode": "tesseract",
                    "left": data["left"][i],
                    "top": data["top"][i],
                    "width": data["width"][i],
                    "height": data["height"][i],
                    "conf": data["conf"][i]
                })
                ocr_id += 1

        return ocr_table, ocr_elements

    def _process_easyocr(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """EasyOCR processing"""
        reader = self._get_easyocr_reader()

        image_np = np.array(image)

        # detail=1 means returning (bbox, text, conf)
        results = reader.readtext(image_np, detail=1, paragraph=False, width_ths=0.1)

        ocr_elements = []
        ocr_table = "Text Table (EasyOCR):\nWord id\tText\n"
        ocr_id = 0

        for (bbox, text, conf) in results:
            clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", text)
            if not clean_text.strip(): continue

            # EasyOCR returns [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            # we convert them into left, top, width, height
            (tl, tr, br, bl) = bbox
            tl = [int(v) for v in tl]
            br = [int(v) for v in br]

            left = min(tl[0], bl[0])
            top = min(tl[1], tr[1])
            right = max(tr[0], br[0])
            bottom = max(bl[1], br[1])

            width = right - left
            height = bottom - top

            ocr_table += f"{ocr_id}\t{clean_text}\n"

            ocr_elements.append({
                "id": ocr_id,
                "text": clean_text,
                "mode": "easyocr",
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "conf": float(conf)
            })
            ocr_id += 1

        return ocr_table, ocr_elements

    @staticmethod
    def visualize_ocr_results(image_path: str, ocr_elements: List[Dict], output_path: str):
        """
        Draw bounding boxes and IDs on the original image.
        """
        try:
            image = Image.open(image_path).convert("RGB")
            draw = ImageDraw.Draw(image)

            try:
                font = ImageFont.truetype("arial.ttf", 16)
            except IOError:
                # Fall back to Pillow's built-in font when arial is absent.
                font = ImageFont.load_default()

            for element in ocr_elements:
                left, top = element["left"], element["top"]
                width, height = element["width"], element["height"]

                # Color-code by backend so mixed results are distinguishable.
                color = "green" if element.get("mode") == "easyocr" else "red"

                draw.rectangle([(left, top), (left + width, top + height)], outline=color, width=2)

                text_str = str(element["id"])

                # textbbox is the modern API; textsize is the pre-Pillow-10 fallback.
                if hasattr(draw, "textbbox"):
                    bbox = draw.textbbox((0, 0), text_str, font=font)
                    text_w, text_h = bbox[2]-bbox[0], bbox[3]-bbox[1]
                else:
                    text_w, text_h = draw.textsize(text_str, font=font)

                # Label background sits just above the box's top-left corner.
                label_bg = [left, top - text_h - 4, left + text_w + 4, top]
                draw.rectangle(label_bg, fill=color)

                draw.text((left + 2, top - text_h - 4), text_str, fill="white", font=font)

            image.save(output_path)
            print(f"Visualization saved to: {output_path}")

        except FileNotFoundError:
            print(f"Error: Image {image_path} not found.")
        except Exception as e:
            print(f"Visualization error: {e}")
|
||||
|
||||
575
mm_agents/os_symphony/agents/os_aci.py
Executable file
575
mm_agents/os_symphony/agents/os_aci.py
Executable file
@@ -0,0 +1,575 @@
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe
|
||||
from mm_agents.os_symphony.agents.coder_agent import CoderAgent
|
||||
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
|
||||
from mm_agents.os_symphony.agents.searcher_agent import SearcherAgent
|
||||
import logging
|
||||
from mm_agents.os_symphony.agents.ocr import OCRProcessor
|
||||
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
# Agent action decorator
|
||||
def agent_action(func):
    """Mark *func* as an exposed agent action and return it unchanged.

    The flag is inspected elsewhere to collect the callable actions of a class.
    """
    setattr(func, "is_agent_action", True)
    return func
|
||||
|
||||
# GrounderAgent primitives are parameterized by description, and coordinate generation uses a pretrained grounding model
|
||||
class OSACI:
|
||||
def __init__(
    self,
    env,
    search_env,
    platform: str,
    client_password: str,
    engine_params_for_ocr: Dict,
    engine_params_for_grounder: Dict,
    engine_params_for_coder: Dict,
    engine_params_for_searcher: Dict,
    screen_width: int = 1920,
    screen_height: int = 1080
):
    """Wire up the OSACI facade and its sub-agents.

    Args:
        env: desktop environment handle; its ``controller`` runs commands.
        search_env: environment handed to the searcher agent.
        platform: target OS string ("linux", "macos", or "windows").
        client_password: sudo password used by generated shell commands.
        engine_params_for_ocr: LLM engine config for the text-span agent.
        engine_params_for_grounder: engine config for the grounding model.
        engine_params_for_coder: engine config for the code agent.
        engine_params_for_searcher: engine config for the search agent.
        screen_width: screen width in pixels passed to the grounder.
        screen_height: screen height in pixels passed to the grounder.
    """
    self.env = env
    self.platform = platform
    self.client_password = client_password

    self.result_dir = ""

    # Grounder must exist before SearcherAgent.create below, which receives it.
    self.grounder_agent = GrounderAgent(engine_params=engine_params_for_grounder, screen_width=screen_width, screen_height=screen_height)

    # Configure text grounding agent (OCR table + LLM span picker).
    self.ocr_processor = OCRProcessor()
    self.text_span_agent = LMMAgent(
        engine_params=engine_params_for_ocr,
        system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,
    )

    # Configure code agent
    self.coder_agent = CoderAgent(
        engine_params=engine_params_for_coder,
        platform=self.platform,
        client_password=client_password
    )

    # Configure search agent
    self.searcher_agent = SearcherAgent.create(
        engine_params=engine_params_for_searcher,
        search_env=search_env,
        grounder_agent=self.grounder_agent,
        platform=self.platform,
        client_password=self.client_password
    )

    # Store task instruction for code agent
    self.current_task_instruction = None
    # Most recent results from sub-agent calls, readable by the worker.
    self.last_code_agent_result = None
    self.last_search_agent_result = None
    self.notes: List[str] = []
    # Tutorial should be a global info, not a local context, so how to add it to the global info
    self.tutorials = []
|
||||
|
||||
|
||||
def assign_screenshot(self, obs):
|
||||
self.obs = obs
|
||||
|
||||
# Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
|
||||
# Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
def generate_text_coords(
    self, phrase: str, obs: Dict, alignment: str = ""
) -> List[int]:
    """Resolve *phrase* to pixel coordinates via OCR plus an LLM word picker.

    Args:
        phrase: the on-screen text to locate.
        obs: observation dict; ``obs["screenshot"]`` holds raw image bytes.
        alignment: "start" targets the first word of the phrase, "end" the
            last; any other value (including the default "") falls back to
            the matched word's center.

    Returns:
        ``[x, y]`` integer screen coordinates.

    Raises:
        ValueError: if OCR finds no text elements at all.
    """
    screenshot, global_offset_x, global_offset_y = obs["screenshot"], 0, 0

    ocr_table, ocr_elements = self.ocr_processor.get_ocr_elements(screenshot, "easyocr")

    alignment_prompt = ""
    if alignment == "start":
        alignment_prompt = "**Important**: Output the word id of the FIRST word in the provided phrase.\n"
    elif alignment == "end":
        alignment_prompt = "**Important**: Output the word id of the LAST word in the provided phrase.\n"

    # Load LLM prompt
    self.text_span_agent.reset()
    self.text_span_agent.add_message(
        alignment_prompt + "Phrase: " + phrase + "\n" + ocr_table, role="user"
    )
    self.text_span_agent.add_message(
        "Screenshot:\n", image_content=screenshot, role="user"
    )

    # Obtain the target element; the last integer in the reply is taken as the word id.
    response = call_llm_safe(self.text_span_agent)
    print("TEXT SPAN AGENT RESPONSE:", response)
    numericals = re.findall(r"\d+", response)
    text_id = int(numericals[-1]) if numericals else 0

    if not ocr_elements:
        raise ValueError("OCR found no text elements on the screenshot.")
    # BUG FIX: the LLM may emit an id beyond the OCR table; clamp it instead
    # of letting the subscript raise IndexError.
    text_id = min(text_id, len(ocr_elements) - 1)
    elem = ocr_elements[text_id]

    # Compute the element coordinates
    # Note: 0.15 * elem["height"] nudges right of the box so the trailing character is selected.
    if alignment == "start":
        coords = [elem["left"], elem["top"] + (elem["height"] // 2)]
    elif alignment == "end":
        coords = [elem["left"] + elem["width"] + 0.15 * elem["height"], elem["top"] + (elem["height"] // 2)]
    else:
        # BUG FIX: previously any other alignment (including the default "")
        # left `coords` unbound and raised UnboundLocalError. Fall back to
        # the center of the matched word.
        coords = [elem["left"] + elem["width"] // 2, elem["top"] + (elem["height"] // 2)]

    print(f'[OCR] output coordinates: {[coords[0] + global_offset_x, coords[1] + global_offset_y]}')
    return [int(coords[0] + global_offset_x), int(coords[1] + global_offset_y)]
|
||||
|
||||
def set_task_instruction(self, task_instruction: str):
|
||||
"""Set the current task instruction for the code agent."""
|
||||
self.current_task_instruction = task_instruction
|
||||
|
||||
@agent_action
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
    hold_keys: List = []
):
    """Click on the element
    Args:
        element_description:str, a detailed descriptions of which element to click on. This description needs to be VERY unambiguous. If the page contains many similar elements, ensure the description uniquely identifies the target element.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
        hold_keys:List, list of keys to hold while clicking
    """
    # Resolve the natural-language description to screen coordinates.
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)

    command = "import pyautogui; "

    # Press modifier keys down, click, then release in the same order.
    for k in hold_keys:
        command += f"pyautogui.keyDown({repr(k)}); "
    # BUG FIX: removed the second, redundant `import pyautogui;` that the
    # original embedded inside the click statement.
    command += f"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); "
    for k in hold_keys:
        command += f"pyautogui.keyUp({repr(k)}); "
    # Return pyautoguicode to click on the element

    action = {"function": "click", "args": {"x": x, "y": y, "button": button_type, "clicks": num_clicks}}
    return (command, action)
|
||||
|
||||
@agent_action
|
||||
def open(self, app_or_filename: str):
|
||||
"""Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.
|
||||
Args:
|
||||
app_or_filename:str, the name of the application or filename to open
|
||||
|
||||
**Important**:
|
||||
Provide only the name of the application or file. Do not include the full path (e.g., "/home/user/Desktop/my_report.docx"). The function works by searching for the name, not by accessing a file path directly.
|
||||
"""
|
||||
action = {"function": "open", "args": {"name": app_or_filename}}
|
||||
if self.platform == "linux":
|
||||
return (f"import pyautogui; pyautogui.hotkey('win'); time.sleep(1.0); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(1.0)", action)
|
||||
elif self.platform == "macos":
|
||||
return (f"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)", action)
|
||||
elif self.platform == "windows":
|
||||
return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.press('enter'); time.sleep(0.5)", action)
|
||||
else:
|
||||
assert (
|
||||
False
|
||||
), f"Unsupported platform: {self.platform}. Supported platforms are: darwin, linux, windows."
|
||||
|
||||
def _paste(self, is_terminal):
|
||||
if self.platform == 'macos':
|
||||
return "pyautogui.hotkey('command', 'v');"
|
||||
|
||||
elif self.platform == 'linux':
|
||||
if is_terminal:
|
||||
return "pyautogui.hotkey('ctrl', 'shift', 'v');"
|
||||
else:
|
||||
return "pyautogui.hotkey('ctrl', 'v');"
|
||||
|
||||
elif self.platform == 'windows':
|
||||
return "pyautogui.hotkey('ctrl', 'v');"
|
||||
|
||||
return ""
|
||||
|
||||
def _clear_all(self, is_terminal):
|
||||
"""
|
||||
Clean the content of current line
|
||||
"""
|
||||
# common apps in GUI
|
||||
if not is_terminal:
|
||||
if self.platform == 'macos':
|
||||
# macOS GUI: Command + A -> Backspace
|
||||
return "pyautogui.hotkey('command', 'a'); pyautogui.press('backspace');"
|
||||
else:
|
||||
# Windows/Linux GUI: Ctrl + A -> Backspace
|
||||
return "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace');"
|
||||
|
||||
# terminal
|
||||
else:
|
||||
if self.platform == 'windows':
|
||||
return "pyautogui.press('esc');"
|
||||
else:
|
||||
return "pyautogui.hotkey('ctrl', 'e'); pyautogui.hotkey('ctrl', 'u');"
|
||||
|
||||
def _type(
|
||||
self,
|
||||
text: str,
|
||||
is_terminal: bool
|
||||
):
|
||||
"""
|
||||
use copy and paste to input Chinese, otherwise type normally
|
||||
"""
|
||||
commands = ""
|
||||
has_unicode = any(ord(char) > 127 for char in text)
|
||||
if has_unicode and self.platform != "macos":
|
||||
commands += (
|
||||
"original_clipboard = pyperclip.paste();"
|
||||
f"pyperclip.copy({repr(text)});"
|
||||
"time.sleep(0.1);"
|
||||
)
|
||||
commands += self._paste(is_terminal=is_terminal)
|
||||
commands += "pyperclip.copy(original_clipboard);"
|
||||
else:
|
||||
commands += f"pyautogui.write({repr(text)}, interval=0.1);"
|
||||
|
||||
return commands
|
||||
|
||||
@agent_action
def type(
    self,
    element_description: str,
    text: str = "",
    overwrite: bool = False,
    enter: bool = False,
    is_terminal = False
):
    """Type text/unicode into a specific element
    Args:
        element_description: str, a detailed description of which element to enter text in. If provided, the agent will click on this element before typing.
        text:str, the text to type
        overwrite:bool, Default is False, assign it to True if the text should overwrite the whole existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing all the text, otherwise assign it to False.
        is_terminal:bool, (MANDATORY) You MUST set this to True whenever the target you will type into is a terminal.
    """
    # The whole interaction is assembled as one python one-liner executed
    # inside the target VM, so every module it needs is imported up front.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
    )


    if self.platform == "linux":
        # pyperclip needs xclip/xsel on linux; install them best-effort,
        # forwarding whatever proxy the remote environment defines.
        # NOTE(review): this runs apt-get on every call — presumably cheap
        # when already installed, but worth confirming.
        commands += (
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
        )

    x, y = None, None
    if element_description is not None:
        # Ground the description, then double-click + click to focus the field.
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        commands += (
            f"pyautogui.click({x}, {y}, clicks=2);"
            f"time.sleep(1.0);"
            f"pyautogui.click({x}, {y});"
        )

    if overwrite:
        # Clear the existing content before typing the replacement.
        commands += self._clear_all(is_terminal=is_terminal)

    commands += self._type(text=text, is_terminal=is_terminal)

    if enter:
        commands += "pyautogui.press('enter');"

    # The recorded action includes coordinates only when an element was grounded.
    if element_description is not None:
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "type", "args": {"text": text}}
    return (commands, action)
|
||||
|
||||
@agent_action
def drag_and_drop(
    self, starting_description: str, ending_description: str, hold_keys: List = []
):
    """Drag from the starting description to the ending description
    Args:
        starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
        ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
        hold_keys:List list of keys to hold while dragging
    """
    # Ground both endpoints before emitting any command text.
    x1, y1 = self.grounder_agent.generate_coords(starting_description, self.obs)
    x2, y2 = self.grounder_agent.generate_coords(ending_description, self.obs)

    # Assemble the script as segments, then join: move, press modifiers,
    # drag with the mouse held, release modifiers.
    segments = ["import pyautogui; ", f"pyautogui.moveTo({x1}, {y1}); "]
    segments.extend(f"pyautogui.keyDown({repr(k)}); " for k in hold_keys)
    segments.append(
        f"pyautogui.dragTo({x2}, {y2}, duration=3., button='left'); pyautogui.mouseUp(); "
    )
    segments.extend(f"pyautogui.keyUp({repr(k)}); " for k in hold_keys)

    action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
    return ("".join(segments), action)
|
||||
|
||||
@agent_action
def highlight_text_span(
    self,
    starting_phrase: str,
    ending_phrase: str,
    button: str = "left",
    text: Optional[str|None] = None
):
    """Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.
    Args:
        starting_phrase: str, the sequence of words that marks the beginning of the text span. Provide a unique sequence of 5 to 10 words.
        ending_phrase: str, the sequence of words that marks the end of the text span. Provide a unique sequence of 5 to 10 words.
        button:str, the button to use to highlight the text span. Defaults to "left". Can be "left", "right", or "middle".
        text: str | None, The text to overwrite the highlighted span with. Providing text here ensures the replacement happens immediately after selection, preventing focus loss.
    """
    # OCR-ground the first word of the start phrase and last word of the end phrase.
    x1, y1 = self.generate_text_coords(
        starting_phrase, self.obs, alignment="start"
    )
    x2, y2 = self.generate_text_coords(
        ending_phrase, self.obs, alignment="end"
    )

    # BUG FIX: the emitted script used subprocess/pyperclip/os without
    # importing them (NameError in the target VM when `text` was given);
    # import everything up front.
    command = "import pyautogui; import time; import os; import subprocess; import pyperclip;"
    command += f"pyautogui.moveTo({x1}, {y1}); "
    # Click in advance to simulate selecting the text box.
    command += (
        f"pyautogui.click({x1}, {y1}, clicks=2);"
        f"time.sleep(1.0); pyautogui.click({x1}, {y1}); time.sleep(1.0);"
    )
    command += f"pyautogui.dragTo({x2}, {y2}, duration=5., button='{button}'); time.sleep(0.5); pyautogui.mouseUp(); "

    if text:
        if self.platform == "linux":
            # BUG FIX: was a hard-coded sudo password ("password") and a
            # hard-coded proxy address; use the configured client password
            # and the host's proxy env vars, consistent with `type`.
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )

        # Replace the selection via clipboard paste, then restore the clipboard.
        command += (
            "original_clipboard = pyperclip.paste();"
            f"pyperclip.copy({repr(text)});"
        )
        command += self._paste(is_terminal=False)
        command += "pyperclip.copy(original_clipboard);"

    # The recorded action reports the drag endpoints of the selection.
    action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
    return (command, action)
|
||||
|
||||
@agent_action
def locate_cursor(
    self,
    phrase: str,
    start_or_end: str="start",
    text: Optional[str|None] = None
):
    """Click at the beginning or end of a specific text phrase to precisely control cursor positioning. Please prefer using the "click" action in general situations, and use this action only in text-intensive software such as libreoffice_writer, impress, etc.

    Args:
        phrase: str, The text phrase where you want to position the cursor. Provide a unique sequence of 5 to 10 words. Do NOT use single words unless the total text is extremely short.
        start_or_end: str, Whether to click at the "start" (beginning) or "end" (trailing edge) of the identified text phrase. Use "start" to position before the text, "end" to position after it.
        text: str | None, The text to enter immediately after positioning the cursor. Use this parameter instead of a separate 'type' action to ensure precise input.
    """
    # OCR-ground the phrase to an exact pixel position.
    x, y = self.generate_text_coords(
        phrase, self.obs, alignment=start_or_end
    )
    # Double-click then click to focus the text widget at that position.
    command = (
        "import os;"
        "import pyautogui;"
        "import time;"
        "import subprocess;"
        "import pyperclip;"
        f"pyautogui.click({x}, {y}, button='left', clicks=2);"
        "time.sleep(1.0);"
        f"pyautogui.click({x}, {y}, button='left');"
    )
    if text:
        if self.platform == "linux":
            # BUG FIX: was a hard-coded sudo password ("password") and a
            # hard-coded proxy address; use the configured client password
            # and the host's proxy env vars, consistent with `type`.
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )

        command += self._type(text=text, is_terminal=False)

    # Report a "type" action when text was entered, otherwise a plain click.
    if text:
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "click", "args": {"x": x, "y": y, "clicks": 1, "button": "left"}}
    return (command, action)
|
||||
|
||||
|
||||
@agent_action
def call_code_agent(self, task: str):
    """Calls the code agent to execute a well-defined, self-contained goal that can be completed with code.

    Args:
        task: str, A specific, self-contained goal that the code agent can work on until completion.

    **🚨 CRITICAL GUIDELINES:**

    **Decompose the Main Objective into Logical Goals:**
    - You **MUST** break down the overall mission into distinct, logical goals or stages.
    - Your role is to define *what* needs to be done for a specific stage. The code agent's role is to figure out *how* to do it with code.
    - Pass only one logical goal at a time. The `task` parameter is **REQUIRED**.

    **Define a Self-Contained, Continuous Goal:**
    - The `task` you provide should be a single, continuous goal. The code agent is capable of handling a multi-step process internally (e.g., opening a file, processing its data, and then saving it) to achieve this one goal.
    - **Crucially, do not pass a task that combines multiple distinct objectives.** For example, instead of passing "Analyze the sales data, AND email the result," you should first pass the self-contained goal: "Analyze the sales data." After that goal is complete, you can proceed with the next logical goal (e.g., emailing the result) in a subsequent step.
    - **If unsure, err on the side of caution.** If a task feels like it has two separate parts, break it down and pass only the first part.
    - Your instruction must describe the desired end-state, NOT the recipe to get there. Do not specify any solution!

    **Goal Purity is Essential:**
    - **NEVER** rephrase, paraphrase, or modify the subtask instruction you have decided on. Pass the exact, original wording of the subtask to prevent instruction drift and hallucination.

    Use this for tasks that can be fully accomplished through code execution, particularly for:
    - Spreadsheet applications: data processing, filtering, sorting, calculations, formulas, data analysis
    - Document editors: text processing, content editing, formatting, document manipulation
    - Code editors: code editing, file processing, text manipulation, configuration
    - Data analysis tools: statistical analysis, data transformation, reporting
    - File management: bulk operations, file processing, content extraction
    - System utilities: configuration, setup, automation
    """
    logger.info("=" * 50)
    logger.info("ACI: Calling Code Agent")
    logger.info("=" * 50)
    task_to_execute = task
    logger.info(f"Executing SUBTASK: {task_to_execute}")

    # BUG FIX: the old debug print dereferenced self.obs.keys()
    # unconditionally, crashing when no observation was assigned yet,
    # even though the very next line guards self.obs for None.
    logger.debug("obs keys: %s", list(self.obs.keys()) if self.obs else None)
    screenshot = self.obs.get("screenshot", "") if self.obs else ""
    logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}")

    logger.info("Executing code agent...")

    result = self.coder_agent.execute(
        task_to_execute, screenshot, self.env.controller
    )

    # Store the result for the worker to access
    self.last_code_agent_result = result

    logger.info("Code agent execution completed")
    logger.info(f"Result - Completion reason: {result['completion_reason']}")
    logger.info(f"Steps executed: {result['steps_executed']}")
    logger.info(f"Summary: {result['summary']}")

    logger.info("=" * 50)
    logger.info("GROUNDING AGENT: Code Agent Call Finished")
    logger.info("=" * 50)

    # Idiom fix: direct comparison instead of `True if ... else False`.
    action = {"function": "call_code_agent", "args": {"query": task, "result": result["completion_reason"] == "DONE"}}
    # Return a short no-op sleep so the environment still executes one step.
    return ("import time; time.sleep(2.222)", action)
|
||||
|
||||
@agent_action
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction
    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    # Ground the element, hover over it, then scroll (hscroll when shift is set).
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    action = {"function": "scroll", "args": {"x": x, "y": y, "clicks": clicks, "shift": shift}}
    scroll_call = "hscroll" if shift else "vscroll"
    command = (
        f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); "
        f"time.sleep(0.5); pyautogui.{scroll_call}({clicks})"
    )
    return (command, action)
|
||||
|
||||
@agent_action
def hotkey(self, keys: List):
    """Press a hotkey combination (can press a single key as well)
    Args:
        keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # Quote the keys with repr() — consistent with hold_and_press and safe
    # even if a key name ever contains a quote character (f"'{key}'" was not).
    # For ordinary key names repr() produces the same 'key' form as before.
    quoted_keys = [repr(key) for key in keys]
    keys_string = " ".join(quoted_keys)
    action = {"function": "key", "args": {"keys": keys_string}}
    # Emit pyautogui code that presses all keys together as one combination.
    return (f"import pyautogui; pyautogui.hotkey({', '.join(quoted_keys)});", action)
|
||||
|
||||
@agent_action
def hold_and_press(self, hold_keys: List, press_keys: List):
    """Hold a list of keys and press a list of keys
    Args:
        hold_keys:List, list of keys to hold
        press_keys:List, list of keys to press in a sequence
    """
    # Quote press keys with repr() — consistent with the keyDown/keyUp lines
    # below and safe for key names containing quotes (f"'{key}'" was not).
    # For ordinary key names repr() yields the same 'key' form as before.
    press_keys_str = "[" + ", ".join(repr(key) for key in press_keys) + "]"
    command = "import pyautogui; "
    # Press and hold every modifier first...
    for k in hold_keys:
        command += f"pyautogui.keyDown({repr(k)}); "
    # ...tap the sequence of keys while the modifiers are held...
    command += f"pyautogui.press({press_keys_str}); "
    # ...then release the modifiers in the same order.
    for k in hold_keys:
        command += f"pyautogui.keyUp({repr(k)}); "

    # Action record uses a "held;pressed" string for logging/telemetry.
    hold_keys_string = " ".join(hold_keys)
    press_keys_string = " ".join(press_keys)
    action = {"function": "key", "args": {"keys": hold_keys_string + ";" + press_keys_string}}
    return (command, action)
|
||||
|
||||
@agent_action
def wait(self, time: float):
    """Wait for a specified amount of time
    Args:
        time:float, the amount of time to wait in seconds
    """
    # NOTE: the parameter is named `time`, shadowing the stdlib module here;
    # harmless because the emitted code string imports `time` in the target
    # environment and only interpolates the float value.
    return (f"""import time; time.sleep({time});""", {"function": "wait", "args": {}})
|
||||
|
||||
@agent_action
def done(
    self,
):
    """
    End the current task with a success. Use this when you believe the entire task has been fully completed. You must ensure all visual information aligns with the user's true intent.
    """
    # "DONE" is a sentinel string (not runnable code) — presumably consumed
    # by the execution loop to terminate the episode; verify against caller.
    return ("""DONE""", {"function": "done", "args": {}})
|
||||
|
||||
@agent_action
def fail(self):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete."""
    # "FAIL" is a sentinel string (not runnable code) — presumably consumed
    # by the execution loop to terminate the episode; verify against caller.
    return ("""FAIL""", {"function": "fail", "args": {}})
|
||||
|
||||
@agent_action
def call_search_agent(
    self,
    query: str,
):
    """
    Calls a specialized 'Searcher Agent' to find a detailed, step-by-step tutorial on the internet for a specific GUI action.
    Args:
        query:str, the search phrase or question for the tutorial. The formulation of this query is critical for success and must follow the guidelines below.

    **Query Formulation Guidelines:**

    Your query must be a well-defined question targeting a **single, specific action** within a **specific application**. To get the best results, adhere to these rules:

    1. **Start with "How to":** Your query must begin with the phrase "How to" to frame it as a request for instructions.
    2. **Include the Application Name:** Always specify the name of the software you are working in (e.g., "GIMP", "Google Chrome", "Libreoffice Writer").
    3. **Focus on a Single Intent:** The query should represent one clear goal. Do not combine multiple steps or tasks into one query.
    4. **Be Specific, Not Abstract:** Ask a concrete question. Avoid repeating the user's high-level or abstract instructions.
    5. **Decompose Complex Tasks:** If the user's overall instruction involves multiple actions (e.g., "download a file and then email it"), and you are stuck on one part, search *only for that specific part*.

    **Examples:**

    * **User's Overall Instruction:** "Please help me download my latest bank statement and then send it to my accountant."
    * **Correct Query (if stuck on downloading):** "How to download a bank statement from the Bank of America website?"
    * **Correct Query (if stuck on attaching a file):** "How to attach a file to an email in Gmail?"
    * **Incorrect Query:** "Download my bank statement and email it to my accountant" *(This query is too broad, contains multiple sub-tasks, and does not start with "How to".)*
    """
    logger.info("=" * 50)
    logger.info(f"ACI: Calling Search Agent(query={query})")
    logger.info("=" * 50)
    # Point the searcher at this task's result directory so its artifacts
    # (screenshots, trajectories) land alongside the main agent's output.
    self.searcher_agent.result_dir = self.result_dir
    # Pass the current screenshot so the searcher understands the main
    # agent's visual context when judging tutorial relevance.
    result = self.searcher_agent.search(query=query, main_obs=self.obs)
    self.last_search_agent_result = result
    # Only a successful search ("DONE") contributes a tutorial for reuse.
    if result["completion_reason"] == "DONE":
        self.tutorials.append(result["final_answer"])
    action = {"function": "call_search_agent", "args": {"query": query, "result": True if result["completion_reason"] == "DONE" else False}}
    # Return a short sleep as a placeholder action for the environment loop.
    return ("import time; time.sleep(2.222)", action)
|
||||
|
||||
61
mm_agents/os_symphony/agents/os_symphony.py
Executable file
61
mm_agents/os_symphony/agents/os_symphony.py
Executable file
@@ -0,0 +1,61 @@
|
||||
import logging
|
||||
import platform
|
||||
from typing import Dict, List, Tuple
|
||||
from mm_agents.os_symphony.agents.os_aci import OSACI
|
||||
from mm_agents.os_symphony.agents.searcher_agent import VLMSearcherAgent
|
||||
from mm_agents.os_symphony.agents.worker import Worker
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
class OSSymphony:
    """Top-level agent facade: wires the worker (orchestrator + reflection)
    to the grounding interface and exposes the predict() loop entry point."""

    def __init__(
        self,
        engine_params_for_orchestrator: Dict,
        engine_params_for_memoryer: Dict,
        os_aci: OSACI,
        platform: str = platform.system().lower(),
        client_password: str = "",
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """
        Args:
            engine_params_for_orchestrator: LLM engine configuration for the orchestrator (worker) agent.
            engine_params_for_memoryer: LLM engine configuration for the reflection/memory agent.
            os_aci: Instance of the ACI class for UI interaction (grounding agent).
            platform: Operating system platform (darwin, linux, windows).
                NOTE: the default is evaluated once at import time, on the host machine.
            client_password: sudo password for the client VM, forwarded to the worker.
            max_trajectory_length: Maximum number of image turns to keep.
            enable_reflection: Creates a reflection agent to assist the worker agent.
        """
        self.engine_params_for_orchestrator = engine_params_for_orchestrator
        self.engine_params_for_memoryer = engine_params_for_memoryer
        self.os_aci: OSACI = os_aci
        self.platform = platform
        self.client_password = client_password
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection

    def reset(self, result_dir) -> None:
        """Reset agent state and initialize components"""
        # Reset the search time per task
        self.os_aci.result_dir = result_dir
        # A fresh Worker is built per task so no conversation state leaks
        # across episodes.
        self.executor = Worker(
            engine_params_for_orchestrator=self.engine_params_for_orchestrator,
            engine_params_for_memoryer=self.engine_params_for_memoryer,
            os_aci=self.os_aci,
            platform=self.platform,
            client_password=self.client_password,
            max_trajectory_length=self.max_trajectory_length,
            enable_reflection=self.enable_reflection,
        )

    def predict(self, instruction: str, observation: Dict, is_last_step: bool) -> Tuple[Dict, List[str]]:
        """Generate the next batch of actions for the current observation.

        Returns:
            (info, actions): the worker's info dict (never None) and the
            list of environment actions to execute.
        """
        executor_info, actions = self.executor.generate_next_action(
            instruction=instruction, obs=observation, is_last_step=is_last_step
        )

        # Normalize a possibly-None info dict into a fresh dict copy.
        info = dict(executor_info or {})

        return info, actions
|
||||
478
mm_agents/os_symphony/agents/searcher_agent.py
Executable file
478
mm_agents/os_symphony/agents/searcher_agent.py
Executable file
@@ -0,0 +1,478 @@
|
||||
import logging
|
||||
import urllib.parse
|
||||
from typing import Any, Dict, List, Optional
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.utils.common_utils import (
|
||||
draw_coordinates,
|
||||
call_llm_formatted,
|
||||
parse_code_from_string,
|
||||
create_pyautogui_code
|
||||
)
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
|
||||
|
||||
logger = logging.getLogger("desktopenv.searcher_agent")
|
||||
|
||||
# Agent action decorator
|
||||
def searcher_agent_action(func):
    """Mark *func* as an action exposed by the searcher agent.

    The function object itself is returned unchanged; only the
    ``is_searcher_agent_action`` flag is attached for later discovery.
    """
    setattr(func, "is_searcher_agent_action", True)
    return func
|
||||
|
||||
|
||||
# --- Abstract Base Class and Factory ---
|
||||
class SearcherAgent:
    """Base class for tutorial-searching agents.

    Holds shared state (result directory, collected tutorial notes, budget)
    and defines the abstract search() contract implemented by subclasses.
    """

    def __init__(self, engine_params: Dict, platform: str):
        self.engine_params = engine_params
        self.result_dir = ""
        # Final tutorial (on success) or failure hint, set by done()/fail().
        self.tutorial_or_hint = ""
        # Intermediate notes accumulated during a search run.
        self.tutorial_notes = []
        self.max_trajectory_length = 8
        self.platform = platform
        # Maximum number of search steps per invocation.
        self.budget = engine_params.get("budget", 20)

    @staticmethod
    def create(engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str = "password"):
        """Factory: build the searcher implementation named by engine_params['type']."""
        searcher_type = engine_params.get("type", "vlm")
        if searcher_type == "vlm":
            return VLMSearcherAgent(engine_params=engine_params, search_env=search_env, grounder_agent=grounder_agent, platform=platform, client_password=client_password)
        else:
            raise NotImplementedError

    def _get_search_time(self) -> int:
        """for the name of result directory"""
        # Scan existing "search_<n>" subdirectories and return max(n) + 1 so
        # each search run gets a fresh, monotonically increasing directory.
        if not self.result_dir:
            return 1
        search_times: list[int] = []
        try:
            if not os.path.exists(self.result_dir):
                return 1
            for item_name in os.listdir(self.result_dir):
                full_path = os.path.join(self.result_dir, item_name)
                if os.path.isdir(full_path) and item_name.startswith("search_"):
                    try:
                        time_val = int(item_name.split('_', 1)[1])
                        search_times.append(time_val)
                    except (ValueError, IndexError):
                        # Ignore directories whose suffix is not an integer.
                        continue
        except Exception:
            # Best-effort: fall back to 1 on any filesystem error.
            return 1
        if not search_times:
            return 1
        return max(search_times) + 1

    def search(self, query: str, main_obs) -> Dict:
        """
        Args:
            query: Format like "How to xxxx?", must be a detailed subtask
            main_obs: Current observation (screenshot) of the main agent
        """
        # Parameter renamed obs -> main_obs to match the subclass signature
        # and the keyword call sites (search(query=..., main_obs=...)).
        raise NotImplementedError("Subclasses must implement the 'search' method")
|
||||
|
||||
class VLMSearcherAgent(SearcherAgent):
    """
    Start a new, isolated vm, and open chrome in advance
    """

    def __init__(self, engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str):
        SearcherAgent.__init__(self, engine_params=engine_params, platform=platform)

        self.grounder_agent = grounder_agent
        self.client_password = client_password
        self.env = search_env

        # Extended-thinking mode is only enabled for these Claude models.
        self.use_thinking = engine_params.get("model", "") in [
            "claude-opus-4-20250514",
            "claude-sonnet-4-20250514",
            "claude-3-7-sonnet-20250219",
            "claude-sonnet-4-5-20250929",
        ]

        self.engine = engine_params.get("engine", "google")

        # Reuse OSWorld's initialization script to set up Chrome, then directly perform a
        # search using the query — "GOOGLE_SEARCH_URL" is a placeholder patched in reset().
        self.task_config = {
            "id": "searcher",
            "instruction": "searcher",
            "config": [
                {
                    "type": "launch",
                    "parameters": {
                        "command": [
                            "google-chrome",
                            "--remote-debugging-port=1337"
                        ]
                    }
                },
                {
                    "type": "launch",
                    "parameters": {
                        # Forward port 9222 to Chrome's debugging port inside the VM.
                        "command": [
                            "socat",
                            "tcp-listen:9222,fork",
                            "tcp:localhost:1337"
                        ]
                    }
                },
                {
                    "type": "chrome_open_tabs",
                    "parameters": {
                        "urls_to_open": [
                            "GOOGLE_SEARCH_URL"
                        ]
                    }
                },
                {
                    "type": "activate_window",
                    "parameters": {
                        "window_name": "Google Chrome"
                    }
                }
            ],
            "proxy": True
        }
        self.obs = None

    def reset(self, query):
        """Reset per-search state and (re)initialize the search VM on the query's results page."""
        # When the search function is invoked, a new agent is created; the environment is
        # instantiated only upon the first call, but it must be reset on every invocation.
        self.tutorial_notes = []
        self.tutorial_or_hint = ""
        self.system_prompt = PROCEDURAL_MEMORY.construct_vlm_searcher_procedural_memory(
            agent_class=type(self)
        ).replace("CURRENT_OS", self.platform).replace("QUERY", query)
        self.searcher_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=self.system_prompt
        )
        self.env.start()
        # Build the search-results URL for the configured engine (google/duckduckgo)
        # and patch it into the chrome_open_tabs step of the task config.
        base_url = "https://www.google.com/search?q=" if self.engine == "google" else "https://www.duckduckgo.com/?q="
        search_url = base_url + urllib.parse.quote_plus(query)
        self.task_config["config"][2]["parameters"]["urls_to_open"][0] = search_url

        self.env.reset(task_config=self.task_config)
        print("[Searcher] sleeping...")
        time.sleep(5)

    def flush_messages(self):
        """Flush messages based on the model's context limits.

        This method ensures that the agent's message history does not exceed the maximum trajectory length.

        Side Effects:
            - Modifies the messages of the searcher agent to fit within the context limits.
        """
        engine_type = self.engine_params.get("engine_type", "")

        # Flush strategy for long-context models: keep all text, only keep latest images
        if engine_type in ["anthropic", "openai", "gemini"]:
            max_images = self.max_trajectory_length
            for agent in [self.searcher_agent]:
                if agent is None:
                    continue
                # Keep only the latest k images; iterate newest-to-oldest and stop
                # before index 2 so the first (main-agent context) image survives.
                img_count = 0
                for i in range(len(agent.messages) - 1, 1, -1):
                    for j in range(len(agent.messages[i]["content"]) - 1, -1, -1):
                        if "image" in agent.messages[i]["content"][j].get("type", ""):
                            img_count += 1
                            if img_count > max_images:
                                del agent.messages[i]["content"][j]

        # Flush strategy for non-long-context models: drop full turns
        else:
            # searcher msgs are alternating [user, assistant], so 2 per round
            if len(self.searcher_agent.messages) > 2 * self.max_trajectory_length + 1:
                self.searcher_agent.messages.pop(1)
                self.searcher_agent.messages.pop(1)

    def assign_screenshot(self, obs):
        # Cache the latest observation so grounding calls can reference it.
        self.obs = obs

    def search(self, query: str, main_obs):
        """Run the search loop in the isolated VM and return a result summary dict."""
        # only create vm when search is called
        self.reset(query=query)
        search_result_dir = os.path.join(self.result_dir, f"search_{self._get_search_time()}")
        os.makedirs(search_result_dir, exist_ok=True)

        obs = self.env._get_obs()  # Get the initial observation
        step_idx = 0
        initial_state_text = (
            "This screenshot shows the current visual context of the main GUI Agent you are assisting. "
            "Use this image to understand the application, the current view, and the overall environment. "
            "Your primary goal is to find a tutorial that is highly relevant and well-aligned with this specific context, "
            "ensuring the instructions you find are applicable to what the main agent is currently seeing."
        )
        self.searcher_agent.add_message(
            text_content=initial_state_text,
            image_content=main_obs["screenshot"],
            role="user"
        )
        execution_history = []
        completion_reason = ""
        final_answer = ""

        while step_idx < self.budget:
            # Update the system prompt dynamically with accumulated tutorial notes.
            tutorial_notes_str = ""
            if len(self.tutorial_notes) > 0:
                for i, note in enumerate(self.tutorial_notes, 1):
                    tutorial_notes_str += f"Tutorial Note {i}: {note}\n\n"

            if step_idx == self.budget - 1:
                # Eager mode: the last step must decide done/fail.
                self.system_prompt = PROCEDURAL_MEMORY.construct_searcher_eager_mode_procedural_memory(
                    agent_class=type(self)
                ).replace("CURRENT_OS", self.platform).replace("QUERY", query)

            system_prompt = self.system_prompt.replace("TUTORIAL_PLACEHOLDER", tutorial_notes_str)
            self.searcher_agent.add_system_prompt(system_prompt=system_prompt)

            # Start a new turn with the current screenshot.
            self.assign_screenshot(obs=obs)
            generator_message = ""

            self.searcher_agent.add_message(
                generator_message, image_content=obs["screenshot"], role="user"
            )
            format_checkers = []

            # Predict the next action. NOTE: reads "temperature" (the original
            # "temperture" typo is kept as a backward-compatible fallback key).
            plan = call_llm_formatted(
                self.searcher_agent,
                format_checkers,
                temperature=self.engine_params.get(
                    "temperature", self.engine_params.get("temperture", 0.1)
                ),
                use_thinking=self.use_thinking,
            )

            self.searcher_agent.add_message(plan, role="assistant")
            execution_history.append(plan)
            logger.info("SEARCHER PLAN:\n %s", plan)

            plan_code = parse_code_from_string(plan)
            # Initialize coords so a failed parse below cannot leave it unbound
            # (previously caused a NameError at the draw_coordinates check).
            coords = None
            try:
                if not plan_code:
                    raise ValueError("Plan code should not be empty")
                # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
                exec_code, coords = create_pyautogui_code(self, plan_code, obs)
            except Exception as e:
                logger.error(
                    f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
                )
                exec_code = self.wait(
                    1.333
                )  # Skip a turn if the code cannot be evaluated

            self.flush_messages()

            # execute action
            action = exec_code
            logger.info("Step %d: %s", step_idx + 1, action)

            # Save screenshot and trajectory information
            with open(os.path.join(search_result_dir, f"step_{step_idx + 1}.png"),
                      "wb") as _f:
                _f.write(obs['screenshot'])

            if coords is not None and isinstance(coords, list):
                draw_coordinates(
                    image_bytes=obs['screenshot'],
                    coordinates=coords,
                    save_path=os.path.join(search_result_dir, f"step_{step_idx + 1}_draw.png")
                )

            step_record = {
                "query": query,
                "step_num": step_idx + 1,
                "action": action,
                "response": {
                    "plan": plan,
                    "plan_code": plan_code,
                    "coordinates": coords
                },
                "screenshot_file": f"step_{step_idx + 1}.png"
            }
            # Append to the rolling jsonl log and also write a per-step json file.
            with open(os.path.join(search_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
                f.write(json.dumps(step_record, ensure_ascii=False))
                f.write("\n")

            with open(os.path.join(search_result_dir, f"traj_{step_idx + 1}.json"), "w", encoding="utf-8") as f:
                json.dump(step_record, f, indent=4, ensure_ascii=False)

            if exec_code in ["DONE", "FAIL"]:
                # terminate loop
                completion_reason = exec_code
                final_answer = self.tutorial_or_hint
                break
            else:
                obs, _, _, _ = self.env.step(action, 5)

            step_idx += 1

        if completion_reason == "":
            completion_reason = "BUDGET_EXHAUSTED"
            final_answer = "Sorry, can't get the useful tutorial about the GUI task you provided."

        return {
            "query": query,
            "completion_reason": completion_reason,
            "tutorial_notes": self.tutorial_notes,
            "execution_history": execution_history,
            "steps_executed": step_idx,
            "budget": self.budget,
            "final_answer": final_answer,
        }

    @searcher_agent_action
    def click(
        self,
        element_description: str,
        num_clicks: int = 1,
        button_type: str = "left",
    ):
        """Click on the element
        Args:
            element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.
            num_clicks:int, number of times to click the element
            button_type:str, which mouse button to press can be "left", "middle", or "right"
        """
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        # Single import prefix (the original concatenated "import pyautogui; " twice).
        command = f"""import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); """

        # Return pyautogui code to click on the element
        return (command, [x, y])

    @searcher_agent_action
    def type(
        self,
        element_description: Optional[str] = None,
        text: str = "",
        overwrite: bool = True,
        enter: bool = False
    ):
        """Type text/unicode into a specific element
        Args:
            element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.
            text:str, the text to type
            overwrite:bool, Default is True, assign it to False if the text should not overwrite the existing text. Using this argument clears all text in an element.
            enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.
        """
        # Ensure xclip/xsel exist in the VM (pyperclip backend), honoring any proxy env vars.
        commands = (
            "import os;"
            "import pyautogui;"
            "import pyperclip;"
            "import subprocess;"
            "import time;"
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
        )

        # Focus the target element first when a description is given.
        click_coords = None
        if element_description is not None:
            x, y = self.grounder_agent.generate_coords(element_description, self.obs)
            click_coords = [x, y]

            commands += f"pyautogui.click({x}, {y});"

        if overwrite:
            commands += (
                f"pyautogui.hotkey('ctrl', 'a');"
                "pyautogui.press('backspace');"
            )

        # use paste to input (handles unicode reliably); restore the clipboard after.
        commands += (
            "original_clipboard = pyperclip.paste();"
            f"pyperclip.copy({repr(text)});"
            "pyautogui.hotkey('ctrl', 'v');"
            "pyperclip.copy(original_clipboard);"
        )

        if enter:
            commands += "pyautogui.press('enter');"

        if click_coords is not None:
            return (commands, click_coords)
        else:
            return commands

    @searcher_agent_action
    def scroll(self, element_description: str, clicks: int, shift: bool = False):
        """Scroll the element in the specified direction
        Args:
            element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
            clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
            shift:bool, whether to use shift+scroll for horizontal scrolling
        """
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)

        if shift:
            return (f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.hscroll({clicks})", [x, y])
        else:
            return (f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.vscroll({clicks})", [x, y])

    @searcher_agent_action
    def hotkey(self, keys: List):
        """Press a hotkey combination (can press a single key as well)
        Args:
            keys: List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
        """
        # add quotes around the keys
        keys = [f"'{key}'" for key in keys]
        return f"import pyautogui; pyautogui.hotkey({', '.join(keys)})"

    @searcher_agent_action
    def save_to_tutorial_notes(self, text: str):
        """Save high quality and useful information to a long-term knowledge bank for reuse during this search task.
        Args:
            text:str, the text to save to the tutorial notes
        """
        self.tutorial_notes.append(text)
        # "WAIT" sentinel: no environment action; the note is kept agent-side.
        return """WAIT"""

    @searcher_agent_action
    def wait(self, time: float):
        """Wait for a specified amount of time
        Args:
            time:float the amount of time to wait in seconds
        """
        return f"""import time; time.sleep({time})"""

    @searcher_agent_action
    def done(
        self,
        tutorial: str
    ):
        """End the current task with a success. Use this when you believe the entire task has been fully completed.
        Args:
            tutorial:str, A detailed, step-by-step tutorial compiled from the search results to be passed to the main agent.
        """
        self.tutorial_or_hint = tutorial
        return """DONE"""

    @searcher_agent_action
    def fail(
        self,
        hint: str
    ):
        """End the current task with a failure. Use this when you believe the entire task is impossible to complete.
        Args:
            hint:str, A hint or reason explaining why the search failed, or what kind of information was missing.
        """
        self.tutorial_or_hint = hint
        return """FAIL"""
|
||||
|
||||
|
||||
|
||||
340
mm_agents/os_symphony/agents/worker.py
Executable file
340
mm_agents/os_symphony/agents/worker.py
Executable file
@@ -0,0 +1,340 @@
|
||||
from functools import partial
|
||||
import logging
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from mm_agents.os_symphony.agents.memoryer_agent import ReflectionMemoryAgent
|
||||
from mm_agents.os_symphony.agents.os_aci import OSACI
|
||||
from mm_agents.os_symphony.core.module import BaseModule
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.utils.common_utils import (
|
||||
call_llm_formatted,
|
||||
extract_coords_from_action_dict,
|
||||
parse_action_from_string,
|
||||
parse_code_from_string,
|
||||
create_pyautogui_code,
|
||||
)
|
||||
from mm_agents.os_symphony.utils.formatters import (
|
||||
SINGLE_ACTION_FORMATTER,
|
||||
CODE_VALID_FORMATTER,
|
||||
)
|
||||
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class Worker(BaseModule):
|
||||
def __init__(
    self,
    engine_params_for_orchestrator: Dict,
    engine_params_for_memoryer: Dict,
    os_aci: OSACI,
    platform: str,
    client_password: str,
    max_trajectory_length: int = 8,
    enable_reflection: bool = True,
):
    """
    Worker receives the main task and generates actions, without the need of hierarchical planning
    Args:
        engine_params_for_orchestrator: Dict
            LLM engine parameters for the orchestrator agent
        engine_params_for_memoryer: Dict
            LLM engine parameters for the reflection/memory agent
        os_aci: OSACI
            The grounding agent to use
        platform: str
            OS platform the agent runs on (darwin, linux, windows)
        client_password: str
            sudo password for the client VM
        max_trajectory_length: int
            The amount of images turns to keep
        enable_reflection: bool
            Whether to enable reflection
    """
    super().__init__(platform=platform)
    self.client_password = client_password

    self.temperature = engine_params_for_orchestrator.get("temperature", 0.0)
    self.tool_config = engine_params_for_orchestrator.get("tool_config", "")
    # Extended-thinking mode is only enabled for these Claude models.
    self.use_thinking = engine_params_for_orchestrator.get("model", "") in [
        "claude-opus-4-20250514",
        "claude-sonnet-4-20250514",
        "claude-3-7-sonnet-20250219",
        "claude-sonnet-4-5-20250929",
    ]
    self.engine_params_for_orchestrator = engine_params_for_orchestrator
    self.engine_params_for_memoryer = engine_params_for_memoryer
    self.os_aci: OSACI = os_aci

    # When keep_first_image is set, one image slot is permanently reserved for
    # the initial screenshot, so the rolling window shrinks by one.
    self.max_trajectory_length = max_trajectory_length if not self.engine_params_for_orchestrator.get("keep_first_image", False) else max_trajectory_length - 1
    self.enable_reflection = enable_reflection
    self.reset()
|
||||
|
||||
def reset(self):
    """Rebuild the system prompt and the orchestrator/memory agents for a fresh task."""
    # set_cell_values only occurs in linux; meanwhile there is no fail option in the other benchmarks
    if self.platform in ["windows", "macos"]:
        skipped_actions = ["set_cell_values", "fail"]
    else:
        skipped_actions = []

    # Hide code agent action entirely if no env/controller is available
    if not getattr(self.os_aci, "env", None) or not getattr(
        getattr(self.os_aci, "env", None), "controller", None
    ):
        skipped_actions.append("call_code_agent")

    # Build the orchestrator system prompt from the ACI's action set, then
    # substitute the runtime placeholders (OS name, sudo password).
    self.orchestrator_sys_prompt = PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(
        agent_class=type(self.os_aci),
        skipped_actions=skipped_actions,
        tool_config=self.tool_config,
        platform=self.platform
    ).replace("CURRENT_OS", self.platform).replace("CLIENT_PASSWORD", self.client_password)

    # Worker contains orchestrator and reflection agent
    self.orchestrator_agent = self._create_agent(
        engine_params=self.engine_params_for_orchestrator,
        system_prompt=self.orchestrator_sys_prompt

    )
    self.memoryer_agent = ReflectionMemoryAgent(self.engine_params_for_memoryer)

    # Per-task conversational state.
    self.instruction = None
    self.turn_count = 0
    self.worker_history = []
    self.coords_history = []

    # For loop detection
    self.action_dict_history = []
|
||||
|
||||
def flush_messages(self):
    """Flush messages based on the model's context limits.

    This method ensures that the agent's message history does not exceed the maximum trajectory length.

    Side Effects:
        - Modifies the messages of generator, reflection, and bon_judge agents to fit within the context limits.
    """
    engine_type = self.engine_params_for_orchestrator.get("engine_type", "")

    # Flush strategy for long-context models: keep all text, only keep latest images
    if engine_type in ["anthropic", "openai", "gemini", "vllm"]:
        max_images = self.max_trajectory_length
        for agent in [self.orchestrator_agent]:
            if agent is None:
                continue
            # keep latest k images; iterate newest-to-oldest so the first
            # `max_images` images encountered are the ones retained.
            img_count = 0
            # When keep_first_image is set, stop before index 2 so the initial
            # screenshot message is never stripped.
            stop_idx = 1 if self.engine_params_for_orchestrator.get("keep_first_image", False) else -1
            for i in range(len(agent.messages) - 1, stop_idx, -1):
                # Iterate content parts in reverse so deletions don't shift
                # the indices still to be visited.
                for j in range(len(agent.messages[i]["content"]) - 1, -1, -1):
                    if "image" in agent.messages[i]["content"][j].get("type", ""):
                        img_count += 1
                        if img_count > max_images:
                            del agent.messages[i]["content"][j]

    # Flush strategy for non-long-context models: drop full turns
    else:
        # generator msgs are alternating [user, assistant], so 2 per round
        if len(self.orchestrator_agent.messages) > 2 * self.max_trajectory_length + 1:
            self.orchestrator_agent.messages.pop(1)
            self.orchestrator_agent.messages.pop(1)
|
||||
|
||||
|
||||
def generate_next_action(self, instruction: str, obs: Dict, is_last_step: bool) -> Tuple[Dict, List]:
    """Predict the next action based on the current observation.

    Builds the orchestrator prompt for this turn (system prompt, reflection,
    code/search sub-agent results), queries the LLM for a plan, converts the
    plan into executable pyautogui code, and records per-turn history.

    Args:
        instruction: The (possibly refined) task instruction for this turn.
        obs: Current observation; must contain a "screenshot" entry.
        is_last_step: When True, "eager mode" forces a done/fail decision.

    Returns:
        A tuple ``(executor_info, [exec_code])`` where ``executor_info`` is a
        dict of this turn's artifacts (plan, parsed code, coordinates,
        reflection and sub-agent outputs) and ``exec_code`` is the executable
        action string.
    """
    print("=" * 30, f"Turn {self.turn_count + 1}", "=" * 30)

    print("=" * 10)
    print(instruction)
    print("=" * 10)

    # Make the fresh screenshot / task visible to the ACI helpers.
    self.os_aci.assign_screenshot(obs)
    self.os_aci.set_task_instruction(instruction)

    generator_message = (
        ""
        if self.turn_count > 0
        else "The initial screen is provided. No action has been taken yet."
    )

    # Load the task into the system prompt.
    if is_last_step:
        # Eager mode: the model must decide done / fail this very step.
        prompt_with_instructions = PROCEDURAL_MEMORY.construct_eager_mode_procedural_memory(agent_class=type(self.os_aci)).replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "CURRENT_OS", self.platform
        )
        print(f'Eager Mode Started, Instruction: {prompt_with_instructions}')
        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
        generator_message += "Note: 'EAGER MODE' is enabled. You must determine whether the task is done or fail in this step!!!"
    else:
        # Normal mode: inject any tutorials gathered so far into the prompt.
        tutorials = ""
        for idx, t in enumerate(self.os_aci.tutorials, start=1):
            tutorials += f"### Tutorial {idx}:\n {t}\n"

        prompt_with_instructions = self.orchestrator_sys_prompt.replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "TUTORIAL_PLACEHOLDER", tutorials
        )

        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)

    ### Reflection Part
    reflection_info = {}
    if self.enable_reflection:
        # Hand the instruction to the memory agent before asking for reflection.
        self.memoryer_agent.add_instruction(instruction)
        reflection = None
        # Determine what kind of step the PREVIOUS turn was (gui/code/search)
        # so the memory agent can judge it with the right context.
        last_code_summary = ""
        mode = "gui"
        if (
            hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
        ):
            # Code agent ran last step: use its execution result as the step behavior.
            code_result = self.os_aci.last_code_agent_result
            mode = "code"
            last_code_summary += f"Subtask Instruction: {code_result['task_instruction']}\nSteps Completed: {code_result['steps_executed']}\nCompletion Reason: {code_result['completion_reason']}\nExec Summary: {code_result['summary']}\n"

        if (
            hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
        ):
            mode = "search"
        # Retrieve the reflection on the latest action.
        reflection_info = self.memoryer_agent.get_reflection(
            cur_obs=obs,
            # Only use the string after "(next action)" in orchestrator's output.
            generator_output=parse_action_from_string(self.worker_history[-1]) if self.turn_count != 0 else "",
            coordinates=self.coords_history[-1] if self.turn_count != 0 else [],
            mode=mode,
            code_exec_summary=last_code_summary,
            action_dict=self.action_dict_history[-1] if self.turn_count != 0 else {}
        )
        reflection = reflection_info['reflection']
        logger.info(f'[Reflection]: {reflection}')
        if reflection:
            generator_message += f"REFLECTION: You MUST use this reflection on the latest action:\n{reflection}\n"
        else:
            generator_message += "You should go on with your plan.\n"
    else:
        generator_message += "You should go on with your plan.\n"

    # Add code agent result from previous step if available (from full task or subtask execution).
    if (
        hasattr(self.os_aci, "last_code_agent_result")
        and self.os_aci.last_code_agent_result is not None
    ):
        code_result = self.os_aci.last_code_agent_result
        generator_message += "\nCODE AGENT RESULT:\n"
        generator_message += (
            f"Task/Subtask Instruction: {code_result['task_instruction']}\n"
        )
        generator_message += f"Steps Completed: {code_result['steps_executed']}\n"
        generator_message += f"Max Steps: {code_result['budget']}\n"
        generator_message += (
            f"Completion Reason: {code_result['completion_reason']}\n"
        )
        generator_message += f"Summary: {code_result['summary']}\n"
        generator_message += "\n"
        # Reset the code agent result so it is not re-added next turn.
        self.os_aci.last_code_agent_result = None

    if (
        hasattr(self.os_aci, "last_search_agent_result")
        and self.os_aci.last_search_agent_result is not None
    ):
        # Retrieve the result dictionary.
        search_result = self.os_aci.last_search_agent_result

        # Add a clear, distinct header for this section in the prompt.
        generator_message += "\nSEARCH AGENT RESULT:\n"

        # Add contextual metadata from the search task.
        generator_message += f"Search Query: {search_result['query']}\n"
        generator_message += f"Search Completion Reason: {search_result['completion_reason']}\n"
        generator_message += "Search Result: "
        # On success the tutorial was already injected into the system prompt
        # (see the tutorials loop above); on failure surface the hint inline.
        if search_result["completion_reason"] == "DONE":
            generator_message += 'Search is completed, the tutorial it found has been already added to your system prompt.\n'
        elif search_result["completion_reason"] == "FAIL":
            generator_message += f"Search is fail, the failure reason or the hint is as follow: {search_result['final_answer']}\n"

        # CRITICAL: reset the search agent result after adding it to the
        # context, so it is not added to the prompt again next turn.
        self.os_aci.last_search_agent_result = None

    # Finalize the generator message.
    self.orchestrator_agent.add_message(
        generator_message, image_content=obs["screenshot"], role="user", put_text_last=True
    )

    # Generate the plan and next action, enforcing the output format.
    format_checkers = [
        SINGLE_ACTION_FORMATTER,
        partial(CODE_VALID_FORMATTER, self.tool_config),
    ]
    plan = call_llm_formatted(
        self.orchestrator_agent,
        format_checkers,
        # BUG FIX: the original read only the misspelled key "temperture";
        # prefer the correctly-spelled "temperature", keeping the legacy
        # misspelling as a backward-compatible fallback.
        temperature=self.engine_params_for_orchestrator.get(
            "temperature",
            self.engine_params_for_orchestrator.get("temperture", 0.1),
        ),
        use_thinking=self.use_thinking,
    )
    self.worker_history.append(plan)
    self.orchestrator_agent.add_message(plan, role="assistant")
    logger.info("PLAN:\n %s", plan)

    # Extract the next action from the plan.
    # At this point the plan code looks like e.g. agent.click('xxxxx', 1)
    plan_code = parse_code_from_string(plan)
    action_dict, coordinates = None, None
    try:
        assert plan_code, "Plan code should not be empty"
        # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
        exec_code, action_dict = create_pyautogui_code(self.os_aci, plan_code, obs)
        coordinates = extract_coords_from_action_dict(action_dict)
    except Exception as e:
        logger.error(
            f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
        )
        exec_code, action_dict = self.os_aci.wait(
            1.333
        )  # Skip a turn if the code cannot be evaluated

    self.action_dict_history.append(action_dict)

    executor_info = {
        "refined_instruction": self.instruction,
        "plan": plan,
        "plan_code": plan_code,
        "exec_code": exec_code,
        "coordinates": coordinates,
        "reflection": reflection_info,
        "code_agent_output": (
            self.os_aci.last_code_agent_result
            if hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
            else None
        ),
        "search_agent_output": (
            self.os_aci.last_search_agent_result
            if hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
            else None
        )
    }
    self.turn_count += 1
    self.coords_history.append(coordinates)
    self.flush_messages()
    return executor_info, [exec_code]
Reference in New Issue
Block a user