add_os_symphony (#399)

This commit is contained in:
Bowen Yang
2025-12-23 14:30:44 +08:00
committed by GitHub
parent ac31778ee3
commit f593f35b1c
26 changed files with 6674 additions and 0 deletions

View File

View File

@@ -0,0 +1,350 @@
import logging
from typing import Dict, List, Tuple, Optional
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, parse_code_from_string
from mm_agents.os_symphony.core.mllm import LMMAgent
logger = logging.getLogger("desktopenv.coder_agent")
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Locate the first fenced code block inside *action*.

    Returns:
        A ``(code_type, code)`` pair. ``code_type`` is ``"python"``,
        ``"bash"``, or ``None`` for an unlabelled fence; ``code`` is the
        stripped fence contents, or ``None`` when no fence exists at all.
    """
    code_type: Optional[str] = None
    code: Optional[str] = None
    # Labelled fences take priority over a bare ``` fence.
    for fence, kind in (("```python", "python"), ("```bash", "bash"), ("```", None)):
        if fence in action:
            code_type = kind
            code = action.split(fence)[1].split("```")[0].strip()
            break
    logger.debug(
        f"Extracted code block: type={code_type}, length={len(code) if code else 0}"
    )
    return code_type, code
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Run *code* through the environment controller and return its result dict.

    ``code_type`` selects the runner (``"bash"`` with a 30s timeout, or
    ``"python"``); any other value yields an error dict. Exceptions raised by
    the controller are caught and converted into an error dict as well.
    """
    # Log the full code being executed (untruncated)
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")
    try:
        if code_type == "bash":
            return env_controller.run_bash_script(code, timeout=30)
        if code_type == "python":
            return env_controller.run_python_script(code)
        return {"status": "error", "error": f"Unknown code type: {code_type}"}
    except Exception as e:
        logger.error(f"Error executing {code_type} code: {e}")
        return {"status": "error", "error": str(e)}
def format_result(result: Dict, step_count: int) -> str:
    """Render an execution-result dict as the context string fed back to the LLM.

    Handles both bash responses (``"returncode"`` key, stdout/stderr merged
    into ``"output"``) and python responses (``"return_code"``, stdout in
    ``"output"``, stderr in ``"error"``); both expose the same fields, so a
    single rendering path suffices.
    """
    step_no = step_count + 1
    if not result:
        logger.warning(f"Step {step_no}: No result returned from execution")
        return f"""
Step {step_no} Error:
Error: No result returned from execution
"""
    status = result.get("status", "unknown")
    return_code = result.get("returncode", result.get("return_code", -1))
    output = result.get("output", "")
    error = result.get("error", "")
    logger.debug(f"Step {step_no}: Status={status}, Return Code={return_code}")
    # Assemble the sections, then join — keeps multi-line outputs readable.
    sections = [
        f"Step {step_no} Result:",
        f"Status: {status}",
        f"Return Code: {return_code}",
    ]
    if output:
        sections.append(f"Output:\n{output}")
    if error:
        sections.append(f"Error:\n{error}")
    return "\n".join(sections) + "\n"
class CoderAgent:
    """A dedicated agent for executing code with a budget of steps.

    Wraps an LMMAgent that alternates between proposing code (python/bash
    fenced blocks) and observing execution results from the environment
    controller, until it signals DONE/FAIL/INFEASIBLE or the step budget is
    exhausted. A final natural-language summary of the session is produced
    by a separate summary LLM.
    """
    def __init__(self, engine_params: Dict, client_password: str, platform: str = "linux"):
        """Initialize the CodeAgent.

        Args:
            engine_params: LLM engine configuration; also consulted for
                "budget" (max loop steps, default 20) and "temperature"
                (default 0.1).
            client_password: forwarded into the coder system prompt
                (presumably for scripts that need sudo — TODO confirm).
            platform: target OS flavor baked into the system prompt.

        Raises:
            ValueError: if engine_params is None or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")
        self.engine_params = engine_params
        self.budget = engine_params.get("budget", 20)
        self.temperature = engine_params.get("temperature", 0.1)
        self.agent = None
        self.platform = platform
        self.client_password = client_password
        logger.info(f"CodeAgent initialized with budget={self.budget} and platform={self.platform}")
        self.reset()
    def reset(self):
        """Reset the code agent state: fresh LMMAgent with the coder system prompt."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.construct_coder_procedural_memory(platform=self.platform, client_password=self.client_password)
        )
    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Runs a propose/execute/observe loop: the LLM emits thoughts plus a
        fenced code block, the block is executed via *env_controller*, and the
        formatted result is fed back as the next user message.

        Args:
            task_instruction: natural-language task for the coder LLM.
            screenshot: current screen image attached as initial context.
            env_controller: object exposing run_bash_script / run_python_script.

        Returns:
            Dict with task_instruction, completion_reason, summary,
            execution_history, execution_result_history, steps_executed, budget.

        Raises:
            ValueError: if env_controller is None.
            RuntimeError: if the LLM returns an empty response mid-loop.
        """
        if env_controller is None:
            raise ValueError("env_controller is required for code execution")
        print(f"\n🚀 STARTING CODE EXECUTION")
        print("=" * 60)
        print(f"Task: {task_instruction}")
        print(f"Budget: {self.budget} steps")
        print("=" * 60)
        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")
        self.reset()
        # Add initial task instruction and screenshot context as user message
        context = (
            f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        )
        self.agent.add_message(context, image_content=screenshot, role="user")
        step_count = 0
        execution_history = []
        execution_result_history = []
        while step_count < self.budget:
            logger.info(f"Step {step_count + 1}/{self.budget}")
            # Get assistant response (thoughts and code)
            response = call_llm_safe(self.agent, temperature=self.temperature)
            # Log the latest message from the coding agent (untruncated)
            logger.info(
                f"CODING_AGENT_LATEST_MESSAGE - Step {step_count + 1}:\n{response}"
            )
            # Check if response is None or empty
            if not response or response.strip() == "":
                error_msg = f"Step {step_count + 1}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)
            # Parse the response to extract action
            action = parse_code_from_string(response)
            thoughts = response
            execution_history.append(
                {"step": step_count + 1, "action": action, "thoughts": thoughts}
            )
            # Check for completion signals emitted instead of code
            action_upper = action.upper().strip()
            if action_upper == "DONE":
                print(f"\n✅ TASK COMPLETED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task completion")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task completed successfully")
                completion_reason = "DONE"
                break
            elif action_upper == "FAIL":
                print(f"\n❌ TASK FAILED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task failure")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task failed by agent request")
                completion_reason = "FAIL"
                break
            elif action_upper == 'INFEASIBLE':
                print(f"\n❌ TASK INFEASIBLE - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task infeasible")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task infeasible by agent request")
                completion_reason = "INFEASIBLE"
                break
            # Extract and execute code; only the text after the last
            # "(Answer)" marker is scanned for a fenced block.
            code_type, code = extract_code_block(response.split("(Answer)")[-1])
            if code:
                result = execute_code(code_type, code, env_controller)
                execution_result_history.append(
                    {"step": step_count + 1, "result": result}
                )
                # Prepare formatted output and error for logging
                output = result.get("output", "")
                error = result.get("error", "")
                message = result.get("message", "")
                status = result.get("status", "")
                # Print execution result to terminal for immediate visibility
                print(f"\n⚡ CODE EXECUTION RESULT - Step {step_count + 1}")
                print("-" * 50)
                print(f"Status: {status}")
                if output:
                    print(f"Output:\n{output}")
                if error:
                    print(f"Error:\n{error}")
                if message and not output and not error:
                    print(f"Message:\n{message}")
                print("-" * 50)
                log_lines = [
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:",
                    f"Status: {status}" if status else None,
                ]
                if output:
                    log_lines.append(
                        "Output:\n" + ("-" * 40) + f"\n{output}\n" + ("-" * 40)
                    )
                if error:
                    log_lines.append(
                        "Error:\n" + ("!" * 40) + f"\n{error}\n" + ("!" * 40)
                    )
                if message and not output and not error:
                    log_lines.append(
                        "Message:\n" + ("-" * 40) + f"\n{message}\n" + ("-" * 40)
                    )
                # Remove None entries and join
                formatted_log = "\n".join([line for line in log_lines if line])
                logger.info(formatted_log)
            else:
                print(f"\n⚠️ NO CODE BLOCK FOUND - Step {step_count + 1}")
                print("-" * 50)
                print("Action did not contain executable code")
                print("-" * 50)
                logger.warning(f"Step {step_count + 1}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
                logger.info(
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\n"
                    f"Status: skipped\n"
                    f"Message:\n{'-' * 40}\n{result['message']}\n{'-' * 40}"
                )
            # Add assistant's thoughts and code to message history
            self.agent.add_message(response, role="assistant")
            # Process result and add formatted environment results as user message
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")
            step_count += 1
        # Handle budget exhaustion: completion_reason is only bound when the
        # loop broke on DONE/FAIL/INFEASIBLE, hence the locals() probe.
        if "completion_reason" not in locals():
            print(f"\n⏰ BUDGET EXHAUSTED - {step_count} steps completed")
            print("=" * 60)
            print(f"Maximum budget of {self.budget} steps reached")
            print("=" * 60)
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"
        # Generate final summary
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)
        result = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "execution_result_history": execution_result_history,
            "steps_executed": step_count,
            "budget": self.budget
        }
        logger.info(f"Code execution completed: steps={step_count}")
        return result
    def _generate_summary(
        self, execution_history: List[Dict], task_instruction: str
    ) -> str:
        """Generate a <150-word factual summary of the session via a fresh LLM.

        Returns a failure message string (never raises) when the summary LLM
        errors out or returns nothing.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."
        logger.info(f"Generated summary for {len(execution_history)} steps")
        # Build detailed execution context for summary agent
        execution_context = f"Task: {task_instruction}\n\nExecution Steps:\n"
        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")
            execution_context += f"\nStep {step_num}:\n"
            if thoughts:
                execution_context += f"Thoughts: {thoughts}\n"
            execution_context += f"Code: {action}\n"
        # Create summary prompt with same context as coding agent
        summary_prompt = f"""
{execution_context}
Please provide a concise summary of the code execution session. Focus on:
1. The code logic implemented at each step
2. The outputs and results produced by each code execution
3. The progression of the solution approach
Do not make judgments about success or failure. Simply describe what was attempted and what resulted.
Keep the summary under 150 words and use clear, factual language.
"""
        # Generate summary using LLM with dedicated summary system prompt
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=self.temperature)
            if not summary or summary.strip() == "":
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")
        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")
        return summary

View File

@@ -0,0 +1,109 @@
import re
from typing import Any, Dict, List
import pytesseract
from PIL import Image
import io
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, smart_resize
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import logging
logger = logging.getLogger("desktopenv.agent")
class GrounderAgent:
    """
    Class designed for interacting with the GUI, serving the Grounding Agent
    and VLMSearcher.

    Given a referring expression and a screenshot, queries a grounding model
    for pixel coordinates and rescales them into actual screen coordinates.
    """
    def __init__(self, engine_params: Dict, screen_width: int, screen_height: int):
        # engine_params must carry "model", "grounding_width",
        # "grounding_height" and "grounding_smart_resize".
        self.engine_params_for_grounder = engine_params # grounder_params
        system_prompt, self.user_message = PROCEDURAL_MEMORY.construct_grounder_procedural_memory(model_name=engine_params["model"])
        self.grounding_model = LMMAgent(engine_params, system_prompt=system_prompt)
        # Width and height of the grounding model's coordinate canvas.
        self.width = engine_params['grounding_width']
        self.height = engine_params['grounding_height']
        print(f"[Grounder]: initialized width is {self.width}, height is {self.height}")
        # Width and height of the actual screen.
        self.screen_width = screen_width
        self.screen_height = screen_height
    # Given the state and worker's referring expression, use the grounding model to generate (x,y)
    def generate_coords(self, ref_expr: str, obs: Dict, detail=False, expansion_pixels=400, **kwargs) -> List:
        """Ground *ref_expr* on obs["screenshot"] and return [x, y] in screen space.

        When detail=True, returns [screenshot, offset_x, offset_y] instead of
        the coordinates. NOTE(review): expansion_pixels is unused and the
        global offsets are always 0 in this version — presumably hooks for a
        cropped/iterative grounding flow; confirm before relying on them.

        Raises:
            AssertionError: if fewer than two numbers can be parsed from the
                grounding model's response.
        """
        cur_screenshot = obs["screenshot"]
        # store global offset (always 0 here; see NOTE above)
        global_offset_x = 0
        global_offset_y = 0
        # final coordinates for output
        final_global_x = 0
        final_global_y = 0
        cur_width, cur_height = self.screen_width, self.screen_height
        print(f"[Grounder] start to ground!")
        self.grounding_model.reset()
        # Configure the context
        prompt = self.user_message.replace("REF_EXPR", ref_expr)
        # consistent with the system prompt presented in the paper of GTA-1
        if 'gta' in self.engine_params_for_grounder['model']:
            self.grounding_model.add_system_prompt("You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.")
        self.grounding_model.add_message(
            text_content=prompt, image_content=cur_screenshot, put_text_last=True, role="user"
        )
        # Generate and parse coordinates
        response = call_llm_safe(self.grounding_model, temperature=0.05, **kwargs)
        print(f"[Grounder] prompt: {prompt}\nmodel: {self.engine_params_for_grounder['model']}, \nresponse: {response}")
        # 1. highest priority: (x1="...", y1="...", x="...", y="...")
        numericals = re.findall(r'(?:x1|y1|x|y)=["\']?(\d+)["\']?', response)
        # 2. second highest priority: bare numbers, e.g. <points>653 42</points> or [653, 42]
        if len(numericals) < 2:
            clean_response = re.sub(r'[xXyY]\d', '', response)
            numericals = re.findall(r'\d+', clean_response)
        assert len(numericals) >= 2
        print(f"[Grounder] the parsed coordinates: {numericals}")
        local_x, local_y = self._resize_coordinates([int(numericals[0]), int(numericals[1])], width=cur_width, height=cur_height)
        # current global coordinates = local coordinates + global offset
        final_global_x = local_x + global_offset_x
        final_global_y = local_y + global_offset_y
        if detail:
            return [cur_screenshot, global_offset_x, global_offset_y]
        else:
            return [final_global_x, final_global_y]
    def dynamic_set_width_height(self, width: int, height: int):
        # Override the grounding-model canvas size at runtime.
        self.width = width
        self.height = height
    # Resize from grounding model dim into OSWorld dim (1920 * 1080)
    def _resize_coordinates(self, coordinates: List[int], width:int, height:int) -> List[int]:
        """
        Rescale model-space coordinates into observation space.

        width, height: for current observation
        grounding_width, grounding_height: width and height of the grounding
        model canvas (e.g. 1000x1000 or 1280x800)
        """
        grounding_width = self.engine_params_for_grounder["grounding_width"]
        grounding_height = self.engine_params_for_grounder["grounding_height"]
        grounding_smart_resize = self.engine_params_for_grounder["grounding_smart_resize"]
        if not grounding_smart_resize:
            return [
                round(coordinates[0] * width / grounding_width),
                round(coordinates[1] * height / grounding_height),
            ]
        else:
            # smart_resize returns (height, width) — note the order.
            smart_height, smart_width = smart_resize(height, width)
            return [
                round(coordinates[0] * width / smart_width),
                round(coordinates[1] * height / smart_height)
            ]

View File

@@ -0,0 +1,428 @@
from ast import parse
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
from mm_agents.os_symphony.utils.common_utils import (
call_llm_formatted,
enhance_observation,
parse_code_from_string
)
from functools import partial
from mm_agents.os_symphony.utils.formatters import JSON_ANSWER_FORMATTER
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import imagehash
import io
import os
from PIL import Image
import numpy as np
from skimage.metrics import structural_similarity as ssim
logger = logging.getLogger("desktopenv.agent")
class StepBehavior:
    """
    Narrative step behavior.

    Describes one step of the trajectory: the generative (main) agent's raw
    output, the observation (screenshot) after the step, a textual summary of
    what the agent thought/did and how the state changed, and the parsed
    action dict. ``is_milestone`` marks steps whose screenshots are retained
    for long-term reflection.
    """
    def __init__(self, is_milestone: bool, gen_output: str, summary: str, obs: Dict, action_dict: Dict):
        self.is_milestone = is_milestone
        self.gen_output = gen_output
        self.obs = obs
        self.summary = summary
        self.action_dict = action_dict
        # Cached perceptual features for optimizing loop-detection time:
        # --- 1. pHash of this step's screenshot ---
        self.phash = None
        # --- 2. SSIM scores of this screenshot vs. each earlier step ---
        self.ssim_list = []
    def _update_phash_ssim(self, history: List):
        """Compute this step's pHash and its SSIM against every screenshot in
        *history* (list of earlier StepBehavior objects).
        """
        cur_img = Image.open(io.BytesIO(self.obs["screenshot"]))
        cur_img_gray = cur_img.convert('L')
        cur_img_np = np.array(cur_img_gray)
        self.phash = imagehash.phash(cur_img)
        for hs in history:
            compare_img = Image.open(io.BytesIO(hs.obs["screenshot"]))
            compare_img_gray = compare_img.convert('L')
            compare_img_np = np.array(compare_img_gray)
            # BUGFIX: data_range previously mixed statistics of the two
            # different images (cur.max() - compare.min()), which skews the
            # SSIM score. Both images are 8-bit grayscale ("L" mode), so the
            # true data range is 255.
            self.ssim_list.append(ssim(cur_img_np, compare_img_np, data_range=255))
class ReflectionMemoryAgent:
    """
    Reflection Memory Agent (RMA).

    Responsible for maintaining long-term memory, extracting narratives from
    trajectories, providing reflections to the Main Agent, and validating
    task completion status.
    """
    def __init__(self, engine_params: Dict):
        """
        Initialize the RMA.

        Args:
            engine_params: LLM engine configuration, e.g.
                {
                    "engine_type": args.provider,
                    "model": args.model,
                    "base_url": args.model_url,
                    "api_key": args.model_api_key,
                    "temperature": getattr(args, "model_temperature", None),
                }
                Optionally also "max_images" (max screenshots used during
                reflection, default 8) and "memoryer_level" (0 disables
                reflection entirely, default 3).
        """
        self.engine_params = engine_params
        self.max_images = engine_params.get('max_images', 8)
        self.memoryer_level = engine_params.get('memoryer_level', 3)
        self.reset()
        logger.info(f"ReflectionMemoryAgent initialized with:\n {self.engine_params}")
    def _llm_temperature(self) -> float:
        # BUGFIX: the key was previously misspelled "temperture", so the
        # configured temperature was silently ignored and 0.1 always used.
        # Also guard against an explicit None value in engine_params.
        temperature = self.engine_params.get("temperature")
        return 0.1 if temperature is None else temperature
    def reset(self):
        """Reset all per-task state (trajectory, knowledge base, sub-agents)."""
        logger.debug("Resetting RMA state")
        self.instruction = None
        self.trajectory: List[StepBehavior] = []
        self.knowledge_base: List[str] = []
        # Index of the last step handled by the Code Agent (-1 = none yet).
        self.last_code_step_idx = -1
        '''
        Control the number of active screenshots: at most max_images are used.
        Update logic: the 0-th screenshot is always retained. If the total
        number of screenshots is less than max_images, all are kept;
        otherwise, starting from index 1, milestone screenshots are managed
        via FIFO.
        '''
        self.active_img_idx = []
        self.reflection_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT,
        )
        self.behavior_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT
        )
    def add_instruction(self, instruction):
        """
        [Interface] Main -> RMA
        Main agent sets the task instruction on the RMA.
        """
        self.instruction = instruction
    def _update_trajectory(self, step_behavior):
        """Append a step and maintain the bounded active screenshot window."""
        self.trajectory.append(step_behavior)
        if len(self.active_img_idx) >= self.max_images:
            if step_behavior.is_milestone:
                self.active_img_idx.append(len(self.trajectory) - 1)  # over max_images: only milestone images
                del self.active_img_idx[1]  # FIFO starts from index 1 (index 0 is pinned)
        else:
            self.active_img_idx.append(len(self.trajectory) - 1)  # fewer than max_images: keep all images
        assert len(self.active_img_idx) <= self.max_images, "[RMA] Errors in updating StepBehavior!!"
    def _summarize_step_behavior(
        self,
        generator_output: str,
        cur_obs: Dict,
        enhanced_obs: bytes | None,
        is_milestone: bool,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Tuple[StepBehavior, bool]:
        """
        [Interface] Main -> RMA
        The Main Agent (MA) calls this method to "feed" the information of the
        just-completed step to the RMA, which processes and stores it.

        Returns:
            (step_behavior, ok) where ok is False only when the summarizer
            judged the last GUI operation unsuccessful.
        """
        # BUGFIX: avoid a shared mutable default argument.
        if action_dict is None:
            action_dict = {}
        if mode == "search":
            is_success = "successful"
            # summary is fixed
            step_behavior = StepBehavior(
                False,
                generator_output,
                "Search Agent was called last step, and a tutorial has been generated.",
                cur_obs,
                action_dict
            )
        elif mode == "code":
            self.last_code_step_idx = len(self.trajectory)
            is_success = "successful"
            # the summary returned by the code agent
            step_behavior = StepBehavior(
                False,
                generator_output,
                f"Code Agent was called last step, and the summary of its trajectory is: \n---\n{code_exec_summary}\n---",
                cur_obs,
                action_dict
            )
        else:  # common gui operation, use LLM to summarize
            prev_obs = self.trajectory[-1].obs
            text_content = f"""Computer Use Agent's Output: \n{generator_output}"""
            self.behavior_agent.reset()  # don't need history messages
            updated_sys_prompt = (
                self.behavior_agent.system_prompt + "\n" + text_content
            )
            self.behavior_agent.add_system_prompt(updated_sys_prompt)
            self.behavior_agent.add_message(
                text_content="This is the observation before executing action (attached below).",
                image_content=prev_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            self.behavior_agent.add_message(
                text_content="This is the zoom-in view, which may help you to identify the operational region (attached below).",
                image_content=enhanced_obs,
                role="user",
                put_text_last=False
            )
            self.behavior_agent.add_message(
                text_content="This is the observation after executing action (attached below).",
                image_content=cur_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            required_fields = ["summary", "evaluation"]
            format_checkers = [
                partial(JSON_ANSWER_FORMATTER, required_fields)
            ]
            full_response = call_llm_formatted(
                self.behavior_agent,
                format_checkers,
                temperature=self._llm_temperature(),
            )
            response = parse_code_from_string(full_response)
            try:
                data = json.loads(response)
                behavior_summary = data['summary']
                is_success = data["evaluation"]
            except Exception as e:
                # Fall back to using the raw response as the summary.
                print("[RMA] Errors in generating step summary: ", e)
                logger.info("Response is not a JSON object or miss required keys!")
                behavior_summary = response
                is_success = "successful"
            step_behavior = StepBehavior(is_milestone, generator_output, behavior_summary, cur_obs, action_dict)
        return step_behavior, is_success == "successful"
    def get_reflection(
        self,
        cur_obs: Dict,
        generator_output: str,
        coordinates: List,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Dict:
        """
        [Interface] RMA -> Main
        The Main Agent (MA) calls this method to get RMA's reflection before
        deciding the next action.

        Args:
            cur_obs (Dict): The Main Agent's current observation (o_k).
            generator_output (str): The thoughts, screen analysis and action of Main Agent.
            coordinates (List): coordinates in the last operation step of Main Agent.
            mode (str): one of [gui, code, search]; which agent the main agent called last step.
            code_exec_summary: execution summary for code agent.
            action_dict: extracted action from generator output.
        Returns:
            reflection_info (Dict): all the info related to reflection; always
            contains a "hint" sub-dict.
        """
        # BUGFIX: avoid a shared mutable default argument.
        if action_dict is None:
            action_dict = {}
        if self.memoryer_level == 0:
            # Reflection disabled: return an empty, hint-complete payload.
            return {
                "reflection": None,
                "reflection_thoughts": None,
                "existing_knowledge": None,
                "is_milestone": False,
                "new_knowledge": None,
                "step_summary": None,
                "hint": {
                    "gui_operation_error": False,
                    "lack_of_tutorial": False,
                    "code_error": False,
                    "loop_detection": None,
                }
            }
        reflection = None
        reflection_thought = None
        if len(self.trajectory) == 0:
            # First call: record the initial screen, no reflection yet.
            step_behavior = StepBehavior(
                True,
                "The initial screen is provided. No action has been taken yet.",
                "The initial screen is provided. No action has been taken yet.",
                cur_obs,
                action_dict
            )
            step_behavior._update_phash_ssim(self.trajectory)
            self._update_trajectory(step_behavior)
            reflection_info = {
                "reflection": reflection,
                "reflection_thoughts": reflection_thought,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": True,
                "new_knowledge": "",
                "step_summary": "",
                # Kept for backward compatibility with earlier consumers.
                "loop_detection": None,
                # BUGFIX/consistency: the other branches always expose a
                # "hint" dict; without it, callers reading
                # reflection_info["hint"] crashed on the very first step.
                "hint": {
                    "gui_operation_error": False,
                    "lack_of_tutorial": False,
                    "code_error": False,
                    "loop_detection": None,
                }
            }
        else:
            ### Step Summary
            prev_obs = self.trajectory[-1].obs
            enhanced_obs = None
            if coordinates:
                enhanced_obs, _, _, _, _ = enhance_observation(
                    prev_obs["screenshot"],
                    coordinates,
                    draw=True
                )
            # generate step behavior
            step_behavior, last_gui_check = self._summarize_step_behavior(
                generator_output,
                cur_obs,
                enhanced_obs,
                False,
                mode,
                code_exec_summary,
                action_dict
            )
            step_behavior._update_phash_ssim(self.trajectory)
            ### make additional hints
            additional_hints = []
            if not last_gui_check:
                additional_hints.append(f"\t- Warning: The last GUI operation is unsuccessful. Careful review is required to avoid GUI Operation Error.")
            code_error_hint = False
            # NOTE(review): len(self.trajectory) - self.last_code_step_idx is
            # always >= 0 once a code step exists, so this condition can never
            # be true; it probably intends a small window (e.g. "< 3") after a
            # Code Agent step — confirm intent before changing.
            if self.last_code_step_idx != -1 and len(self.trajectory) - self.last_code_step_idx < 0:
                code_error_hint = True
                additional_hints.append(f"\t- Warning: The Computer Use Agent might in the verification stage of Code Agent. Careful review is required to avoid Code Error.")
            # loop detection
            from mm_agents.os_symphony.utils.loop_detection import detect_loop
            is_loop, loop_details = detect_loop(full_trajectory=self.trajectory, N=3)
            if is_loop and loop_details:
                match_sequence_indices = loop_details["match_sequence_indices"]
                loop_hint_message = f"\t- Warning: A potential LOOP has been detected between Step {match_sequence_indices[0]} and Step {match_sequence_indices[-1]}. Careful review is required to avoid Repetitive Behavior Error."
                additional_hints.append(loop_hint_message)
            self.reflection_agent.reset()
            updated_sys_prompt = (
                PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT + "\n\n" +
                f"---\n- **user instruction**: {self.instruction}\n" +
                "- **existing knowledge**: \n" + "\n".join(self.knowledge_base) +
                "\n- **additional_hints**: " + "\n".join(additional_hints) + "\n---"
            )
            # update system prompt
            self.reflection_agent.add_system_prompt(updated_sys_prompt)
            # Replay the trajectory; only active indices get their screenshot.
            for i, step in enumerate(self.trajectory):
                text_content = f"""### (Step {i}) history:\nsummary: '''\n{step.summary}\n'''"""
                if i in self.active_img_idx:
                    if i == 0:
                        text_content += f"\ninitial screenshot:"
                    else:
                        text_content += f"\nscreenshot (after executing action): (attached below)"
                self.reflection_agent.add_message(
                    text_content=text_content,
                    image_content=step.obs['screenshot'] if i in self.active_img_idx else None,
                    role="user",
                    put_text_last=False
                )
            text_content = f"""### (Last Step) CUA's output (has been finished):\n---\n{generator_output}\n---\nStep Summary:\n---\n{step_behavior.summary}\n---\nlatest_screenshot: (attached below)"""
            self.reflection_agent.add_message(
                text_content=text_content,
                image_content=cur_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            required_fields = ["is_milestone", "reflection", "knowledge"]
            format_checkers = [
                partial(JSON_ANSWER_FORMATTER, required_fields)
            ]
            full_response = call_llm_formatted(
                self.reflection_agent,
                format_checkers
            )
            reflection_thought = full_response
            response = parse_code_from_string(full_response)
            try:
                data = json.loads(response)
                reflection = data['reflection']
                is_milestone = data["is_milestone"]
                knowledge = data['knowledge']
            except Exception as e:
                # Fall back to using the raw response as the reflection.
                print("[RMA] Errors in dealing with reflection: ", e)
                logger.info("Response is not a JSON object or miss required keys!")
                reflection = response
                is_milestone = False
                knowledge = ""
            if len(knowledge) > 0:
                self.knowledge_base.append(knowledge)
            if isinstance(is_milestone, str):
                is_milestone = "true" in is_milestone.lower()
            # update trajectory and is_milestone
            self._update_trajectory(step_behavior)
            if mode == "gui":  # only gui operation can be considered as milestone
                self.trajectory[-1].is_milestone = is_milestone
            reflection_info = {
                "reflection": reflection,
                "reflection_thoughts": reflection_thought,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": is_milestone,
                "new_knowledge": knowledge,
                "step_summary": step_behavior.summary,
                "hint": {
                    "gui_operation_error": not last_gui_check,
                    "lack_of_tutorial": is_loop,
                    "code_error": code_error_hint,
                    "loop_detection": loop_details,
                }
            }
        return reflection_info

View File

@@ -0,0 +1,178 @@
import re
from io import BytesIO
from typing import Tuple, List, Dict
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import pytesseract
from pytesseract import Output
import easyocr
class OCRProcessor:
    """
    OCR processor supporting both Tesseract and EasyOCR backends.
    """
    def __init__(self, use_gpu: bool = False, languages: List[str] = ['en']):
        """
        Initialize processor.

        Args:
            use_gpu (bool): whether EasyOCR should use the GPU
            languages (List[str]): language list for EasyOCR, e.g. ['en', 'ch_sim']

        NOTE(review): ``languages=['en']`` is a mutable default argument; it is
        never mutated here, but a tuple would be safer.
        """
        self.use_gpu = use_gpu
        self.languages = languages
        self.reader = None # lazy-load EasyOCR Reader
    def _get_easyocr_reader(self):
        # Lazily construct (and cache) the EasyOCR Reader — model loading is slow.
        if self.reader is None:
            print(f"Loading EasyOCR model (GPU={self.use_gpu})...")
            self.reader = easyocr.Reader(self.languages, gpu=self.use_gpu)
        return self.reader
    def get_ocr_elements(self, bytes_image_data: bytes, mode: str = 'tesseract') -> Tuple[str, List[Dict]]:
        """
        Executes OCR recognition.

        Args:
            bytes_image_data (bytes): raw encoded image bytes
            mode (str): 'tesseract' (faster) or 'easyocr' (more precise)
        Returns:
            Tuple[str, List]: (textual table string, list of element details);
            ("", []) when the image cannot be decoded.
        Raises:
            ValueError: for an unknown mode.
        """
        try:
            image = Image.open(BytesIO(bytes_image_data))
        except Exception as e:
            print(f"Error decoding or opening image: {e}")
            return "", []
        if mode == 'tesseract':
            return self._process_tesseract(image)
        elif mode == 'easyocr':
            return self._process_easyocr(image)
        else:
            raise ValueError(f"Unknown mode: {mode}. Use 'tesseract' or 'easyocr'.")
    def _process_tesseract(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """Tesseract processing: returns (id/text table, element dicts with boxes)."""
        data = pytesseract.image_to_data(image, output_type=Output.DICT)
        ocr_elements = []
        ocr_table = "Text Table (Tesseract):\nWord id\tText\n"
        ocr_id = 0
        num_boxes = len(data['text'])
        for i in range(num_boxes):
            # filter text with low confidence
            if int(data['conf'][i]) > 0 and data['text'][i].strip():
                # strip leading/trailing symbols outside the allowed charset
                clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", data['text'][i])
                if not clean_text: continue
                ocr_table += f"{ocr_id}\t{clean_text}\n"
                ocr_elements.append({
                    "id": ocr_id,
                    "text": clean_text,
                    "mode": "tesseract",
                    "left": data["left"][i],
                    "top": data["top"][i],
                    "width": data["width"][i],
                    "height": data["height"][i],
                    "conf": data["conf"][i]
                })
                ocr_id += 1
        return ocr_table, ocr_elements
    def _process_easyocr(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """EasyOCR processing: returns (id/text table, element dicts with boxes)."""
        reader = self._get_easyocr_reader()
        image_np = np.array(image)
        # detail=1 means returning (bbox, text, conf)
        results = reader.readtext(image_np, detail=1, paragraph=False, width_ths=0.1)
        ocr_elements = []
        ocr_table = "Text Table (EasyOCR):\nWord id\tText\n"
        ocr_id = 0
        for (bbox, text, conf) in results:
            clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", text)
            if not clean_text.strip(): continue
            # EasyOCR returns [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            # we convert them into left, top, width, height
            (tl, tr, br, bl) = bbox
            tl = [int(v) for v in tl]
            br = [int(v) for v in br]
            left = min(tl[0], bl[0])
            top = min(tl[1], tr[1])
            right = max(tr[0], br[0])
            bottom = max(bl[1], br[1])
            width = right - left
            height = bottom - top
            # ---------------
            ocr_table += f"{ocr_id}\t{clean_text}\n"
            ocr_elements.append({
                "id": ocr_id,
                "text": clean_text,
                "mode": "easyocr",
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "conf": float(conf)
            })
            ocr_id += 1
        return ocr_table, ocr_elements
    @staticmethod
    def visualize_ocr_results(image_path: str, ocr_elements: List[Dict], output_path: str):
        """
        Draw bounding boxes and IDs on the original image and save it.

        Green boxes mark EasyOCR elements, red boxes Tesseract ones. Errors
        are reported on stdout rather than raised.
        """
        try:
            image = Image.open(image_path).convert("RGB")
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("arial.ttf", 16)
            except IOError:
                font = ImageFont.load_default()
            for element in ocr_elements:
                left, top = element["left"], element["top"]
                width, height = element["width"], element["height"]
                color = "green" if element.get("mode") == "easyocr" else "red"
                draw.rectangle([(left, top), (left + width, top + height)], outline=color, width=2)
                text_str = str(element["id"])
                # textbbox exists on modern Pillow; textsize is the legacy fallback
                if hasattr(draw, "textbbox"):
                    bbox = draw.textbbox((0, 0), text_str, font=font)
                    text_w, text_h = bbox[2]-bbox[0], bbox[3]-bbox[1]
                else:
                    text_w, text_h = draw.textsize(text_str, font=font)
                label_bg = [left, top - text_h - 4, left + text_w + 4, top]
                draw.rectangle(label_bg, fill=color)
                draw.text((left + 2, top - text_h - 4), text_str, fill="white", font=font)
            image.save(output_path)
            print(f"Visualization saved to: {output_path}")
        except FileNotFoundError:
            print(f"Error: Image {image_path} not found.")
        except Exception as e:
            print(f"Visualization error: {e}")

View File

@@ -0,0 +1,575 @@
import re
from typing import Any, Dict, List, Optional, Tuple
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.utils.common_utils import call_llm_safe
from mm_agents.os_symphony.agents.coder_agent import CoderAgent
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
from mm_agents.os_symphony.agents.searcher_agent import SearcherAgent
import logging
from mm_agents.os_symphony.agents.ocr import OCRProcessor
logger = logging.getLogger("desktopenv.agent")
# Agent action decorator
def agent_action(func):
    """Mark *func* as an exposed agent action by tagging it with ``is_agent_action``."""
    setattr(func, "is_agent_action", True)
    return func
# GrounderAgent primitives are parameterized by description, and coordinate generation uses a pretrained grounding model
class OSACI:
    """Agent-computer interface that turns high-level actions into pyautogui code.

    Wires together the grounding, OCR, coding and searching sub-agents and
    exposes @agent_action methods, each returning a (command, action) pair:
    the python/pyautogui command string to run in the environment plus a
    structured action record for logging.
    """

    def __init__(
        self,
        env,
        search_env,
        platform: str,
        client_password: str,
        engine_params_for_ocr: Dict,
        engine_params_for_grounder: Dict,
        engine_params_for_coder: Dict,
        engine_params_for_searcher: Dict,
        screen_width: int = 1920,
        screen_height: int = 1080
    ):
        """Create the ACI and its sub-agents.

        Args:
            env: Desktop environment under control (its .controller is used by
                the code agent).
            search_env: Isolated environment handed to the searcher agent.
            platform: "linux", "macos" or "windows".
            client_password: sudo password of the controlled machine.
            engine_params_for_ocr: LLM config for the text-span agent.
            engine_params_for_grounder: LLM config for the grounding model.
            engine_params_for_coder: LLM config for the code agent.
            engine_params_for_searcher: LLM config for the searcher agent.
            screen_width: Screen width passed to the grounder.
            screen_height: Screen height passed to the grounder.
        """
        self.env = env
        self.platform = platform
        self.client_password = client_password
        self.result_dir = ""
        self.grounder_agent = GrounderAgent(engine_params=engine_params_for_grounder, screen_width=screen_width, screen_height=screen_height)
        # Configure text grounding agent
        self.ocr_processor = OCRProcessor()
        self.text_span_agent = LMMAgent(
            engine_params=engine_params_for_ocr,
            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,
        )
        # Configure code agent
        self.coder_agent = CoderAgent(
            engine_params=engine_params_for_coder,
            platform=self.platform,
            client_password=client_password
        )
        # Configure search agent
        self.searcher_agent = SearcherAgent.create(
            engine_params=engine_params_for_searcher,
            search_env=search_env,
            grounder_agent=self.grounder_agent,
            platform=self.platform,
            client_password=self.client_password
        )
        # Store task instruction for code agent
        self.current_task_instruction = None
        # Results of the most recent sub-agent invocations, read by the worker.
        self.last_code_agent_result = None
        self.last_search_agent_result = None
        self.notes: List[str] = []
        # Tutorial should be a global info, not a local context, so how to add it to the global info
        self.tutorials = []
def assign_screenshot(self, obs):
    """Cache the most recent observation for subsequent grounding calls."""
    setattr(self, "obs", obs)
# Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
def generate_text_coords(
    self, phrase: str, obs: Dict, alignment: str = ""
) -> List[int]:
    """Locate a text phrase on screen and return pixel coordinates for it.

    Runs EasyOCR over the screenshot, asks the text-span LLM which OCR word
    matches *phrase*, then converts that word's bounding box into a point.

    Args:
        phrase: Text to locate on the screenshot.
        obs: Observation dict; only obs["screenshot"] is used.
        alignment: "start" -> left edge of the word, "end" -> right edge
            (nudged right by 0.15 * height so the cursor lands after the last
            glyph), anything else -> horizontal centre of the word.

    Returns:
        [x, y] integer screen coordinates.
    """
    screenshot, global_offset_x, global_offset_y = obs["screenshot"], 0, 0
    ocr_table, ocr_elements = self.ocr_processor.get_ocr_elements(screenshot, "easyocr")
    alignment_prompt = ""
    if alignment == "start":
        alignment_prompt = "**Important**: Output the word id of the FIRST word in the provided phrase.\n"
    elif alignment == "end":
        alignment_prompt = "**Important**: Output the word id of the LAST word in the provided phrase.\n"
    # Load LLM prompt
    self.text_span_agent.reset()
    self.text_span_agent.add_message(
        alignment_prompt + "Phrase: " + phrase + "\n" + ocr_table, role="user"
    )
    self.text_span_agent.add_message(
        "Screenshot:\n", image_content=screenshot, role="user"
    )
    # Obtain the target element: the model answers with a word id; take the
    # last number in the reply and fall back to word 0 when none is found.
    response = call_llm_safe(self.text_span_agent)
    print("TEXT SPAN AGENT RESPONSE:", response)
    numericals = re.findall(r"\d+", response)
    text_id = int(numericals[-1]) if numericals else 0
    # Guard against hallucinated ids outside the OCR table.
    if not 0 <= text_id < len(ocr_elements):
        text_id = 0
    elem = ocr_elements[text_id]
    # Compute the element coordinates.
    # Note: 0.15 * elem["height"] nudges the point right so the cursor lands
    # after the last character of the word.
    if alignment == "start":
        coords = [elem["left"], elem["top"] + (elem["height"] // 2)]
    elif alignment == "end":
        coords = [elem["left"] + elem["width"] + 0.15 * elem["height"], elem["top"] + (elem["height"] // 2)]
    else:
        # Bug fix: previously any other alignment (including the default "")
        # left `coords` unbound and raised NameError. Use the word centre.
        coords = [elem["left"] + elem["width"] // 2, elem["top"] + (elem["height"] // 2)]
    print(f'[OCR] output coordinates: {[coords[0] + global_offset_x, coords[1] + global_offset_y]}')
    return [int(coords[0] + global_offset_x), int(coords[1] + global_offset_y)]
def set_task_instruction(self, task_instruction: str):
    """Record the task instruction that will later be forwarded to the code agent."""
    setattr(self, "current_task_instruction", task_instruction)
@agent_action
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
    hold_keys: List = []
):
    """Click on the element
    Args:
        element_description:str, a detailed descriptions of which element to click on. This description needs to be VERY unambiguous. If the page contains many similar elements, ensure the description uniquely identifies the target element.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
        hold_keys:List, list of keys to hold while clicking
    """
    # Resolve the description to screen coordinates via the grounding model.
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    command = "import pyautogui; "
    for k in hold_keys:
        command += f"pyautogui.keyDown({repr(k)}); "
    # Fix: this statement previously repeated "import pyautogui;".
    command += f"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); "
    for k in hold_keys:
        command += f"pyautogui.keyUp({repr(k)}); "
    # Return pyautogui code to click on the element
    action = {"function": "click", "args": {"x": x, "y": y, "button": button_type, "clicks": num_clicks}}
    return (command, action)
@agent_action
def open(self, app_or_filename: str):
    """Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.
    Args:
        app_or_filename:str, the name of the application or filename to open
    **Important**:
    Provide only the name of the application or file. Do not include the full path (e.g., "/home/user/Desktop/my_report.docx"). The function works by searching for the name, not by accessing a file path directly.
    """
    action = {"function": "open", "args": {"name": app_or_filename}}
    if self.platform == "linux":
        # Fix: the generated script calls time.sleep but previously never
        # imported time, so it crashed with NameError in the environment.
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(1.0); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(1.0)", action)
    elif self.platform == "macos":
        return (f"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)", action)
    elif self.platform == "windows":
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.press('enter'); time.sleep(0.5)", action)
    else:
        # Fix: `assert False` disappears under `python -O`; raise explicitly.
        # (The old message also listed "darwin" while the check uses "macos".)
        raise ValueError(
            f"Unsupported platform: {self.platform}. Supported platforms are: macos, linux, windows."
        )
def _paste(self, is_terminal):
    """Return the pyautogui snippet that pastes the clipboard on this platform.

    Linux terminals need ctrl+shift+v; an unknown platform yields "".
    """
    if self.platform == 'macos':
        return "pyautogui.hotkey('command', 'v');"
    if self.platform == 'linux':
        return (
            "pyautogui.hotkey('ctrl', 'shift', 'v');"
            if is_terminal
            else "pyautogui.hotkey('ctrl', 'v');"
        )
    if self.platform == 'windows':
        return "pyautogui.hotkey('ctrl', 'v');"
    return ""
def _clear_all(self, is_terminal):
    """
    Return a snippet that clears the current input: select-all + delete in
    GUI apps, line-kill shortcuts in terminals.
    """
    if is_terminal:
        # Terminal: Windows clears with Esc, POSIX shells with ctrl+e/ctrl+u.
        if self.platform == 'windows':
            return "pyautogui.press('esc');"
        return "pyautogui.hotkey('ctrl', 'e'); pyautogui.hotkey('ctrl', 'u');"
    # GUI application: select everything, then delete it.
    if self.platform == 'macos':
        return "pyautogui.hotkey('command', 'a'); pyautogui.press('backspace');"
    return "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace');"
def _type(
    self,
    text: str,
    is_terminal: bool
):
    """
    Build the snippet that enters *text*: a clipboard round-trip for
    non-ASCII text on Windows/Linux, plain pyautogui.write otherwise.
    """
    uses_clipboard = any(ord(ch) > 127 for ch in text) and self.platform != "macos"
    if not uses_clipboard:
        return f"pyautogui.write({repr(text)}, interval=0.1);"
    snippet = (
        "original_clipboard = pyperclip.paste();"
        f"pyperclip.copy({repr(text)});"
        "time.sleep(0.1);"
    )
    snippet += self._paste(is_terminal=is_terminal)
    # Restore whatever the user had on the clipboard before we borrowed it.
    snippet += "pyperclip.copy(original_clipboard);"
    return snippet
@agent_action
def type(
    self,
    element_description: Optional[str],
    text: str = "",
    overwrite: bool = False,
    enter: bool = False,
    is_terminal = False
):
    """Type text/unicode into a specific element
    Args:
        element_description: str, a detailed description of which element to enter text in. If provided, the agent will click on this element before typing.
        text:str, the text to type
        overwrite:bool, Default is False, assign it to True if the text should overwrite the whole existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing all the text, otherwise assign it to False.
        is_terminal:bool, (MANDATORY) You MUST set this to True whenever the target you will type into is a terminal.
    """
    # Imports for the generated script; pyperclip backs the clipboard-paste
    # path that _type() uses for non-ASCII text.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
    )
    if self.platform == "linux":
        # pyperclip needs xclip/xsel on Linux; install them first, forwarding
        # any http(s) proxy configured in the VM's environment to apt-get.
        commands += (
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
        )
    x, y = None, None
    if element_description is not None:
        # Ground the description, then double-click + click to focus the field.
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        commands += (
            f"pyautogui.click({x}, {y}, clicks=2);"
            f"time.sleep(1.0);"
            f"pyautogui.click({x}, {y});"
        )
    if overwrite:
        # Clear the existing content before typing the new text.
        commands += self._clear_all(is_terminal=is_terminal)
    commands += self._type(text=text, is_terminal=is_terminal)
    if enter:
        commands += "pyautogui.press('enter');"
    # Only include coordinates in the action record when an element was grounded.
    if element_description is not None:
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "type", "args": {"text": text}}
    return (commands, action)
@agent_action
def drag_and_drop(
    self, starting_description: str, ending_description: str, hold_keys: List = []
):
    """Drag from the starting description to the ending description
    Args:
        starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
        ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
        hold_keys:List list of keys to hold while dragging
    """
    # Ground both endpoints before assembling the script.
    start_x, start_y = self.grounder_agent.generate_coords(starting_description, self.obs)
    end_x, end_y = self.grounder_agent.generate_coords(ending_description, self.obs)
    parts = ["import pyautogui; ", f"pyautogui.moveTo({start_x}, {start_y}); "]
    # TODO: specified duration?
    parts.extend(f"pyautogui.keyDown({repr(key)}); " for key in hold_keys)
    parts.append(f"pyautogui.dragTo({end_x}, {end_y}, duration=3., button='left'); pyautogui.mouseUp(); ")
    parts.extend(f"pyautogui.keyUp({repr(key)}); " for key in hold_keys)
    # Return pyautoguicode to drag and drop the elements
    action = {"function": "drag", "args": {"x1": start_x, "y1": start_y, "x2": end_x, "y2": end_y}}
    return ("".join(parts), action)
@agent_action
def highlight_text_span(
    self,
    starting_phrase: str,
    ending_phrase: str,
    button: str = "left",
    text: Optional[str] = None
):
    """Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.
    Args:
        starting_phrase: str, the sequence of words that marks the beginning of the text span. Provide a unique sequence of 5 to 10 words.
        ending_phrase: str, the sequence of words that marks the end of the text span. Provide a unique sequence of 5 to 10 words.
        button:str, the button to use to highlight the text span. Defaults to "left". Can be "left", "right", or "middle".
        text: str | None, The text to overwrite the highlighted span with. Providing text here ensures the replacement happens immediately after selection, preventing focus loss.
    """
    # Resolve both phrase anchors to pixel coordinates via OCR.
    x1, y1 = self.generate_text_coords(
        starting_phrase, self.obs, alignment="start"
    )
    x2, y2 = self.generate_text_coords(
        ending_phrase, self.obs, alignment="end"
    )
    # Fix: subprocess/pyperclip/os are used below when `text` is given but
    # were not imported by the generated script (NameError at run time).
    command = "import pyautogui; import time; import os; import subprocess; import pyperclip;"
    command += f"pyautogui.moveTo({x1}, {y1}); "
    # Click in advance to simulate selecting the text box.
    command += (
        f"pyautogui.click({x1}, {y1}, clicks=2);"
        f"time.sleep(1.0); pyautogui.click({x1}, {y1}); time.sleep(1.0);"
    )
    command += f"pyautogui.dragTo({x2}, {y2}, duration=5., button='{button}'); time.sleep(0.5); pyautogui.mouseUp(); "
    if text:
        if self.platform == "linux":
            # Fix: use the configured client password and the environment's
            # proxy settings instead of the hard-coded "password" and proxy
            # address (now consistent with the `type` action).
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )
        # Clipboard round-trip: paste the replacement, then restore the clipboard.
        command += (
            "original_clipboard = pyperclip.paste();"
            f"pyperclip.copy({repr(text)});"
        )
        command += self._paste(is_terminal=False)
        command += "pyperclip.copy(original_clipboard);"
    # Return pyautoguicode to drag and drop the elements
    action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
    return (command, action)
@agent_action
def locate_cursor(
    self,
    phrase: str,
    start_or_end: str = "start",
    text: Optional[str] = None
):
    """Click at the beginning or end of a specific text phrase to precisely control cursor positioning. Please prefer using the "click" action in general situations, and use this action only in text-intensive software such as libreoffice_writer, impress, etc.
    Args:
        phrase: str, The text phrase where you want to position the cursor. Provide a unique sequence of 5 to 10 words. Do NOT use single words unless the total text is extremely short.
        start_or_end: str, Whether to click at the "start" (beginning) or "end" (trailing edge) of the identified text phrase. Use "start" to position before the text, "end" to position after it.
        text: str | None, The text to enter immediately after positioning the cursor. Use this parameter instead of a separate 'type' action to ensure precise input.
    """
    # OCR-ground the phrase to a precise pixel position.
    x, y = self.generate_text_coords(
        phrase, self.obs, alignment=start_or_end
    )
    # Double-click then single-click to focus the target before positioning.
    command = (
        "import pyautogui;"
        "import time;"
        "import os;"
        "import subprocess;"
        "import pyperclip;"
        f"pyautogui.click({x}, {y}, button='left', clicks=2);"
        "time.sleep(1.0);"
        f"pyautogui.click({x}, {y}, button='left');"
    )
    if text:
        if self.platform == "linux":
            # Fix: use the configured client password and the environment's
            # proxy settings instead of the hard-coded "password" and proxy
            # address (now consistent with the `type` action).
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )
        command += self._type(text=text, is_terminal=False)
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "click", "args": {"x": x, "y": y, "clicks": 1, "button": "left"}}
    return (command, action)
@agent_action
def call_code_agent(self, task: str):
    """Calls the code agent to execute a well-defined, self-contained goal that can be completed with code.
    Args:
        task: str, A specific, self-contained goal that the code agent can work on until completion.
    **🚨 CRITICAL GUIDELINES:**
    **Decompose the Main Objective into Logical Goals:**
    - You **MUST** break down the overall mission into distinct, logical goals or stages.
    - Your role is to define *what* needs to be done for a specific stage. The code agent's role is to figure out *how* to do it with code.
    - Pass only one logical goal at a time. The `task` parameter is **REQUIRED**.
    **Define a Self-Contained, Continuous Goal:**
    - The `task` you provide should be a single, continuous goal. The code agent is capable of handling a multi-step process internally (e.g., opening a file, processing its data, and then saving it) to achieve this one goal.
    - **Crucially, do not pass a task that combines multiple distinct objectives.** For example, instead of passing "Analyze the sales data, AND email the result," you should first pass the self-contained goal: "Analyze the sales data." After that goal is complete, you can proceed with the next logical goal (e.g., emailing the result) in a subsequent step.
    - **If unsure, err on the side of caution.** If a task feels like it has two separate parts, break it down and pass only the first part.
    - Your instruction must describe the desired end-state, NOT the recipe to get there. Do not specify any solution!
    **Goal Purity is Essential:**
    - **NEVER** rephrase, paraphrase, or modify the subtask instruction you have decided on. Pass the exact, original wording of the subtask to prevent instruction drift and hallucination.
    Use this for tasks that can be fully accomplished through code execution, particularly for:
    - Spreadsheet applications: data processing, filtering, sorting, calculations, formulas, data analysis
    - Document editors: text processing, content editing, formatting, document manipulation
    - Code editors: code editing, file processing, text manipulation, configuration
    - Data analysis tools: statistical analysis, data transformation, reporting
    - File management: bulk operations, file processing, content extraction
    - System utilities: configuration, setup, automation
    """
    logger.info("=" * 50)
    logger.info("ACI: Calling Code Agent")
    logger.info("=" * 50)
    task_to_execute = task
    logger.info(f"Executing SUBTASK: {task_to_execute}")
    print("obs keys: ", self.obs.keys())
    # Hand the coder the current screenshot (may be empty if no obs yet).
    screenshot = self.obs.get("screenshot", "") if self.obs else ""
    logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}")
    logger.info("Executing code agent...")
    result = self.coder_agent.execute(
        task_to_execute, screenshot, self.env.controller
    )
    # Store the result for the worker to access
    self.last_code_agent_result = result
    logger.info("Code agent execution completed")
    logger.info(f"Result - Completion reason: {result['completion_reason']}")
    logger.info(f"Steps executed: {result['steps_executed']}")
    logger.info(f"Summary: {result['summary']}")
    logger.info("=" * 50)
    logger.info("GROUNDING AGENT: Code Agent Call Finished")
    logger.info("=" * 50)
    action = {"function": "call_code_agent", "args": {"query": task, "result": True if result["completion_reason"] == "DONE" else False}}
    # Return code to be executed in the environment: a no-op sleep, since the
    # coder already ran its scripts directly via env.controller.
    return ("import time; time.sleep(2.222)", action)
@agent_action
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction
    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    action = {"function": "scroll", "args": {"x": x, "y": y, "clicks": clicks, "shift": shift}}
    # Hover over the target first, then emit a horizontal or vertical scroll.
    scroll_call = f"pyautogui.hscroll({clicks})" if shift else f"pyautogui.vscroll({clicks})"
    command = f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); {scroll_call}"
    return (command, action)
@agent_action
def hotkey(self, keys: List):
    """Press a hotkey combination (can press a single key as well)
    Args:
        keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # Quote each key for the generated script; repr also escapes embedded
    # quotes, which the previous manual f"'{key}'" wrapping did not.
    quoted = [repr(key) for key in keys]
    # Fix: log the raw key names (consistent with hold_and_press) rather than
    # their quoted forms.
    action = {"function": "key", "args": {"keys": " ".join(keys)}}
    return (f"import pyautogui; pyautogui.hotkey({', '.join(quoted)});", action)
@agent_action
def hold_and_press(self, hold_keys: List, press_keys: List):
    """Hold a list of keys and press a list of keys
    Args:
        hold_keys:List, list of keys to hold
        press_keys:List, list of keys to press in a sequence
    """
    press_keys_str = "[" + ", ".join([f"'{key}'" for key in press_keys]) + "]"
    # Assemble: key-downs, the press sequence, then matching key-ups.
    fragments = ["import pyautogui; "]
    fragments.extend(f"pyautogui.keyDown({repr(held)}); " for held in hold_keys)
    fragments.append(f"pyautogui.press({press_keys_str}); ")
    fragments.extend(f"pyautogui.keyUp({repr(held)}); " for held in hold_keys)
    action = {"function": "key", "args": {"keys": " ".join(hold_keys) + ";" + " ".join(press_keys)}}
    return ("".join(fragments), action)
@agent_action
def wait(self, time: float):
    """Wait for a specified amount of time
    Args:
        time:float, the amount of time to wait in seconds
    """
    # NOTE: the parameter is deliberately named `time` (agent-facing spec);
    # the generated snippet imports the time module in its own namespace.
    return (f"import time; time.sleep({time});", {"function": "wait", "args": {}})
@agent_action
def done(
    self,
):
    """
    End the current task with a success. Use this when you believe the entire task has been fully completed. You must ensure all visual information aligns with the user's true intent.
    """
    # "DONE" is the sentinel the executor loop recognises as success.
    return ("DONE", {"function": "done", "args": {}})
@agent_action
def fail(self):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete."""
    # "FAIL" is the sentinel the executor loop recognises as giving up.
    return ("FAIL", {"function": "fail", "args": {}})
@agent_action
def call_search_agent(
    self,
    query: str,
):
    """
    Calls a specialized 'Searcher Agent' to find a detailed, step-by-step tutorial on the internet for a specific GUI action.
    Args:
        query:str, the search phrase or question for the tutorial. The formulation of this query is critical for success and must follow the guidelines below.
    **Query Formulation Guidelines:**
    Your query must be a well-defined question targeting a **single, specific action** within a **specific application**. To get the best results, adhere to these rules:
    1. **Start with "How to":** Your query must begin with the phrase "How to" to frame it as a request for instructions.
    2. **Include the Application Name:** Always specify the name of the software you are working in (e.g., "GIMP", "Google Chrome", "Libreoffice Writer").
    3. **Focus on a Single Intent:** The query should represent one clear goal. Do not combine multiple steps or tasks into one query.
    4. **Be Specific, Not Abstract:** Ask a concrete question. Avoid repeating the user's high-level or abstract instructions.
    5. **Decompose Complex Tasks:** If the user's overall instruction involves multiple actions (e.g., "download a file and then email it"), and you are stuck on one part, search *only for that specific part*.
    **Examples:**
    * **User's Overall Instruction:** "Please help me download my latest bank statement and then send it to my accountant."
    * **Correct Query (if stuck on downloading):** "How to download a bank statement from the Bank of America website?"
    * **Correct Query (if stuck on attaching a file):** "How to attach a file to an email in Gmail?"
    * **Incorrect Query:** "Download my bank statement and email it to my accountant" *(This query is too broad, contains multiple sub-tasks, and does not start with "How to".)*
    """
    logger.info("=" * 50)
    logger.info(f"ACI: Calling Search Agent(query={query})")
    logger.info("=" * 50)
    # Let the searcher write artifacts under the current task's result dir.
    self.searcher_agent.result_dir = self.result_dir
    result = self.searcher_agent.search(query=query, main_obs=self.obs)
    self.last_search_agent_result = result
    if result["completion_reason"] == "DONE":
        # Successful searches contribute a tutorial to the global pool.
        self.tutorials.append(result["final_answer"])
    action = {"function": "call_search_agent", "args": {"query": query, "result": True if result["completion_reason"] == "DONE" else False}}
    # The search ran out-of-band; return a no-op sleep for the environment.
    return ("import time; time.sleep(2.222)", action)

View File

@@ -0,0 +1,61 @@
import logging
import platform
from typing import Dict, List, Tuple
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.agents.searcher_agent import VLMSearcherAgent
from mm_agents.os_symphony.agents.worker import Worker
logger = logging.getLogger("desktopenv.agent")
class OSSymphony:
    """Top-level agent: delegates each prediction step to a Worker executor."""

    def __init__(
        self,
        engine_params_for_orchestrator: Dict,
        engine_params_for_memoryer: Dict,
        os_aci: OSACI,
        platform: str = platform.system().lower(),
        client_password: str = "",
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """
        Args:
            engine_params_for_orchestrator: LLM engine config for the worker/orchestrator agent.
            engine_params_for_memoryer: LLM engine config for the memory agent.
            os_aci: Instance of the OSACI class for UI interaction.
            platform: Operating system platform (darwin, linux, windows);
                note the default is evaluated once, at import time.
            client_password: sudo password of the controlled machine.
            max_trajectory_length: Maximum number of image turns to keep.
            enable_reflection: Creates a reflection agent to assist the worker agent.
        """
        self.engine_params_for_orchestrator = engine_params_for_orchestrator
        self.engine_params_for_memoryer = engine_params_for_memoryer
        self.os_aci: OSACI = os_aci
        self.platform = platform
        self.client_password = client_password
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection
def reset(self, result_dir) -> None:
    """Reset agent state and initialize components.

    Args:
        result_dir: Directory for this task's artifacts (e.g. search
            results); forwarded to the ACI.
    """
    # Reset the search time per task
    self.os_aci.result_dir = result_dir
    # Fresh Worker per task: discards any prior trajectory/reflection state.
    self.executor = Worker(
        engine_params_for_orchestrator=self.engine_params_for_orchestrator,
        engine_params_for_memoryer=self.engine_params_for_memoryer,
        os_aci=self.os_aci,
        platform=self.platform,
        client_password=self.client_password,
        max_trajectory_length=self.max_trajectory_length,
        enable_reflection=self.enable_reflection,
    )
def predict(self, instruction: str, observation: Dict, is_last_step: bool) -> Tuple[Dict, List[str]]:
    """Run one worker step and return its info dict plus the actions to execute.

    Args:
        instruction: Natural-language task instruction.
        observation: Current environment observation (screenshot etc.).
        is_last_step: True when this is the final allowed step.

    Returns:
        (info, actions): a copy of the worker's metadata dict and the
        action strings to run in the environment.
    """
    executor_info, actions = self.executor.generate_next_action(
        instruction=instruction, obs=observation, is_last_step=is_last_step
    )
    # Copy the worker info into a fresh dict (the previous one-liner rebuilt
    # it through a needlessly nested dict/comprehension merge).
    info = dict(executor_info or {})
    return info, actions

View File

@@ -0,0 +1,478 @@
import logging
import urllib.parse
from typing import Any, Dict, List, Optional
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import (
draw_coordinates,
call_llm_formatted,
parse_code_from_string,
create_pyautogui_code
)
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
import os
import time
import json
logger = logging.getLogger("desktopenv.searcher_agent")
# Agent action decorator
def searcher_agent_action(func):
    """Tag *func* as a searcher-agent action via the ``is_searcher_agent_action`` flag."""
    setattr(func, "is_searcher_agent_action", True)
    return func
# --- Abstract Base Class and Factory ---
class SearcherAgent:
    """Base class + factory for agents that search the web for GUI tutorials."""

    def __init__(self, engine_params: Dict, platform: str):
        """
        Args:
            engine_params: LLM engine config; may carry "budget" (max search
                steps, default 20) and "type" (implementation selector).
            platform: Operating system platform string (e.g. "linux").
        """
        self.engine_params = engine_params
        self.result_dir = ""  # set by the caller before search()
        self.tutorial_or_hint = ""
        self.tutorial_notes = []
        self.max_trajectory_length = 8
        self.platform = platform
        # Maximum number of search steps before the eager-mode wrap-up.
        self.budget = engine_params.get("budget", 20)
@staticmethod
def create(engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str="password"):
    """Factory: build the searcher implementation selected by engine_params["type"]."""
    searcher_type = engine_params.get("type", "vlm")
    if searcher_type != "vlm":
        # Only the VLM-based searcher exists so far.
        raise NotImplementedError
    return VLMSearcherAgent(engine_params=engine_params, search_env=search_env, grounder_agent=grounder_agent, platform=platform, client_password=client_password)
def _get_search_time(self) -> int:
    """Next 1-based index for a search_<n> result directory under result_dir."""
    if not self.result_dir:
        return 1
    indices: list[int] = []
    try:
        if not os.path.exists(self.result_dir):
            return 1
        for entry in os.listdir(self.result_dir):
            if not entry.startswith("search_"):
                continue
            if not os.path.isdir(os.path.join(self.result_dir, entry)):
                continue
            try:
                indices.append(int(entry.split('_', 1)[1]))
            except (ValueError, IndexError):
                # Ignore directories like "search_foo" without a numeric index.
                continue
    except Exception:
        # Best effort: any filesystem hiccup just restarts the numbering.
        return 1
    return max(indices) + 1 if indices else 1
def search(self, query: str, obs) -> str:
    """
    Find a tutorial answering *query*; implemented by subclasses.

    Args:
        query: Format like "How to xxxx?", must be a detailed subtask
        obs: Current screenshot
    """
    raise NotImplementedError("Subclasses must implement the 'search' method")
class VLMSearcherAgent(SearcherAgent):
    """
    Start a new, isolated vm, and open chrome in advance
    """

    def __init__(self, engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str):
        """
        Args:
            engine_params: LLM engine config; recognises "model", "engine"
                ("google"/"duckduckgo") and "budget".
            search_env: Dedicated OSWorld-style environment used for browsing.
            grounder_agent: Shared grounding model for clicking search results.
            platform: Operating system platform string.
            client_password: sudo password of the search VM.
        """
        SearcherAgent.__init__(self, engine_params=engine_params, platform=platform)
        self.grounder_agent = grounder_agent
        self.client_password = client_password
        self.env = search_env
        # These Claude models support an explicit extended-thinking mode.
        self.use_thinking = engine_params.get("model", "") in [
            "claude-opus-4-20250514",
            "claude-sonnet-4-20250514",
            "claude-3-7-sonnet-20250219",
            "claude-sonnet-4-5-20250929",
        ]
        self.engine = engine_params.get("engine", "google")
        # Reuse OSWorld's initialization script to set up Chrome, then directly perform a Google search using the query—currently, the query can be substituted by a placeholder field.
        self.task_config = {
            "id": "searcher",
            "instruction": "searcher",
            "config": [
                {
                    # Launch Chrome with remote debugging enabled.
                    "type": "launch",
                    "parameters": {
                        "command": [
                            "google-chrome",
                            "--remote-debugging-port=1337"
                        ]
                    }
                },
                {
                    # Forward port 9222 to Chrome's debugging port 1337.
                    "type": "launch",
                    "parameters": {
                        "command": [
                            "socat",
                            "tcp-listen:9222,fork",
                            "tcp:localhost:1337"
                        ]
                    }
                },
                {
                    # Placeholder URL; reset() swaps in the real search URL.
                    "type": "chrome_open_tabs",
                    "parameters": {
                        "urls_to_open": [
                            "GOOGLE_SEARCH_URL"
                        ]
                    }
                },
                {
                    "type": "activate_window",
                    "parameters": {
                        "window_name": "Google Chrome"
                    }
                }
            ],
            "proxy": True
        }
        self.obs = None
def reset(self, query):
    """Prepare a fresh search session for *query*.

    Clears per-search state, rebuilds the searcher LMM with a query-specific
    system prompt, starts the isolated VM and opens the search-results page
    for the query in Chrome.
    """
    # When the search function is invoked, a new agent is created; the environment is instantiated only upon the first call, but it must be reset on every invocation.
    self.tutorial_notes = []
    self.tutorial_or_hint = ""
    self.system_prompt = PROCEDURAL_MEMORY.construct_vlm_searcher_procedural_memory(
        agent_class=type(self)
    ).replace("CURRENT_OS", self.platform).replace("QUERY", query)
    self.searcher_agent = LMMAgent(
        engine_params=self.engine_params,
        system_prompt=self.system_prompt
    )
    self.env.start()
    # Build the results-page URL for the configured engine (google/duckduckgo).
    # (The previous f-string prefixes were no-ops: the literals had no placeholders.)
    base = (
        "https://www.google.com/search?q="
        if self.engine == "google"
        else "https://www.duckduckgo.com/?q="
    )
    search_url = base + urllib.parse.quote_plus(query)
    self.task_config["config"][2]["parameters"]["urls_to_open"][0] = search_url
    self.env.reset(task_config=self.task_config)
    print("[Searcher] sleeping...")
    time.sleep(5)
def flush_messages(self):
    """Flush messages based on the model's context limits.
    This method ensures that the agent's message history does not exceed the maximum trajectory length.
    Side Effects:
        - Modifies the messages of generator, reflection, and bon_judge agents to fit within the context limits.
    """
    engine_type = self.engine_params.get("engine_type", "")
    # Flush strategy for long-context models: keep all text, only keep latest images
    if engine_type in ["anthropic", "openai", "gemini"]:
        max_images = self.max_trajectory_length
        for agent in [self.searcher_agent]:
            if agent is None:
                continue
            # keep latest k images
            # @Yang: keep the first main agent image
            # Walk newest-to-oldest, skipping messages 0-1 (the pinned initial
            # context); delete image parts once the image budget is exhausted.
            img_count = 0
            for i in range(len(agent.messages) - 1, 1, -1):
                for j in range(len(agent.messages[i]["content"]) - 1, -1, -1):
                    if "image" in agent.messages[i]["content"][j].get("type", ""):
                        img_count += 1
                        if img_count > max_images:
                            del agent.messages[i]["content"][j]
    # Flush strategy for non-long-context models: drop full turns
    else:
        # generator msgs are alternating [user, assistant], so 2 per round
        if len(self.searcher_agent.messages) > 2 * self.max_trajectory_length + 1:
            # Drop the oldest user+assistant pair (index 0 stays pinned).
            self.searcher_agent.messages.pop(1)
            self.searcher_agent.messages.pop(1)
def assign_screenshot(self, obs):
    """Cache the latest observation so grounding calls can reference it."""
    setattr(self, "obs", obs)
def search(self, query: str, main_obs):
    """Run a budgeted browser-search loop to find a tutorial for the main agent.

    Args:
        query: The search query describing what the main agent needs to learn.
        main_obs: Observation dict from the main agent; its "screenshot" is shown
            to the searcher so the found tutorial matches the current context.

    Returns:
        Dict with the query, completion reason ("DONE" / "FAIL" /
        "BUDGET_EXHAUSTED"), accumulated tutorial notes, the per-step execution
        history, steps executed, the step budget, and the final answer text.
    """
    # only create vm when search is called
    self.reset(query=query)  # reset
    search_result_dir = os.path.join(self.result_dir, f"search_{self._get_search_time()}")
    os.makedirs(search_result_dir, exist_ok=True)
    obs = self.env._get_obs()  # Get the initial observation
    step_idx = 0
    initial_state_text = (
        "This screenshot shows the current visual context of the main GUI Agent you are assisting. "
        "Use this image to understand the application, the current view, and the overall environment. "
        "Your primary goal is to find a tutorial that is highly relevant and well-aligned with this specific context, "
        "ensuring the instructions you find are applicable to what the main agent is currently seeing."
    )
    self.searcher_agent.add_message(
        text_content=initial_state_text,
        image_content=main_obs["screenshot"],
        role="user"
    )
    execution_history = []
    completion_reason = ""
    final_answer = ""
    while step_idx < self.budget:
        # update system_prompt dynamically with the notes gathered so far
        tutorial_notes_str = ""
        if len(self.tutorial_notes) > 0:
            for i, note in enumerate(self.tutorial_notes, 1):
                tutorial_notes_str += f"Tutorial Note {i}: {note}\n\n"
        if step_idx == self.budget - 1:
            # eager mode: on the final budgeted step, force a done/fail decision
            self.system_prompt = PROCEDURAL_MEMORY.construct_searcher_eager_mode_procedural_memory(
                agent_class=type(self)
            ).replace("CURRENT_OS", self.platform).replace("QUERY", query)
        system_prompt = self.system_prompt.replace("TUTORIAL_PLACEHOLDER", tutorial_notes_str)
        self.searcher_agent.add_system_prompt(system_prompt=system_prompt)
        # start a new turn
        self.assign_screenshot(obs=obs)
        generator_message = ""
        self.searcher_agent.add_message(
            generator_message, image_content=obs["screenshot"], role="user"
        )
        format_checkers = []
        # predict action
        plan = call_llm_formatted(
            self.searcher_agent,
            format_checkers,
            # BUG FIX: the key was misspelled "temperture", so a configured
            # temperature was silently ignored and 0.1 was always used.
            temperature=self.engine_params.get("temperature", 0.1),
            use_thinking=self.use_thinking,
        )
        self.searcher_agent.add_message(plan, role="assistant")
        execution_history.append(plan)
        logger.info("SEARCHER PLAN:\n %s", plan)
        plan_code = parse_code_from_string(plan)
        # BUG FIX: initialize coords each iteration. Previously, if plan
        # evaluation failed on the first step, `coords` was unbound (NameError);
        # on later steps a stale value from the previous iteration was reused.
        coords = None
        try:
            assert plan_code, "Plan code should not be empty"
            # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
            exec_code, coords = create_pyautogui_code(self, plan_code, obs)
        except Exception as e:
            logger.error(
                f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
            )
            exec_code = self.wait(
                1.333
            )  # Skip a turn if the code cannot be evaluated
        self.flush_messages()
        # execute action
        action = exec_code
        logger.info("Step %d: %s", step_idx + 1, action)
        # Save screenshot and trajectory information
        with open(os.path.join(search_result_dir, f"step_{step_idx + 1}.png"),
                  "wb") as _f:
            _f.write(obs['screenshot'])
        if coords is not None and isinstance(coords, list):
            draw_coordinates(
                image_bytes=obs['screenshot'],
                coordinates=coords,
                save_path=os.path.join(search_result_dir, f"step_{step_idx + 1}_draw.png")
            )
        with open(os.path.join(search_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
            f.write(json.dumps({
                "query": query,
                "step_num": step_idx + 1,
                "action": action,
                "response": {
                    "plan": plan,
                    "plan_code": plan_code,
                    "coordinates": coords
                },
                "screenshot_file": f"step_{step_idx + 1}.png"
            }, ensure_ascii=False))
            f.write("\n")
        with open(os.path.join(search_result_dir, f"traj_{step_idx+1}.json"), "w", encoding="utf-8") as f:
            json.dump({
                "query": query,
                "step_num": step_idx + 1,
                "action": action,
                "response": {
                    "plan": plan,
                    "plan_code": plan_code,
                    "coordinates": coords
                },
                "screenshot_file": f"step_{step_idx + 1}.png"
            }, f, indent=4, ensure_ascii=False)
        if exec_code in ["DONE", "FAIL"]:
            # terminate loop
            completion_reason = exec_code
            final_answer = self.tutorial_or_hint
            break
        else:
            obs, _, _, _ = self.env.step(action, 5)
        step_idx += 1
    if completion_reason == "":
        completion_reason = "BUDGET_EXHAUSTED"
        final_answer = "Sorry, can't get the useful tutorial about the GUI task you provided."
    return {
        "query": query,
        "completion_reason": completion_reason,
        "tutorial_notes": self.tutorial_notes,
        "execution_history": execution_history,
        "steps_executed": step_idx,
        "budget": self.budget,
        "final_answer": final_answer,
    }
@searcher_agent_action
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
):
    """Click on the element

    Args:
        element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    # BUG FIX: the "import pyautogui; " prefix was emitted twice (once in the
    # initializer and again inside the f-string); build the command once.
    command = (
        f"import pyautogui; "
        f"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); "
    )
    # Return pyautogui code to click on the element, plus the grounded coords
    return (command, [x, y])
@searcher_agent_action
def type(
    self,
    element_description: Optional[str] = None,
    text: str = "",
    overwrite: bool = True,
    enter: bool = False
):
    """Type text/unicode into a specific element
    Args:
        element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.
        text:str, the text to type
        overwrite:bool, Default is True, assign it to False if the text should not overwrite the existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.
    """
    # Text is injected via clipboard paste (ctrl+v) rather than keystrokes so
    # unicode input works; xclip/xsel are installed first because pyperclip
    # needs one of them on Linux. The {{proxy_prefix}} braces are escaped here
    # so they are interpolated by the *remote* f-string, not this one.
    # NOTE(review): this runs `sudo apt-get install` on every call and embeds
    # the client password in a shell string — confirm this is intended.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
        "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
        "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
        "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
        f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
    )
    click_coords = None
    # Optionally ground and focus the target element before typing.
    if element_description is not None:
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        click_coords = [x, y]
        commands += f"pyautogui.click({x}, {y});"
    if overwrite:
        # Select-all + backspace clears any existing content first.
        commands += (
            f"pyautogui.hotkey('ctrl', 'a');"
            "pyautogui.press('backspace');"
        )
    # use paste to input; the original clipboard contents are restored afterwards
    commands += (
        "original_clipboard = pyperclip.paste();"
        f"pyperclip.copy({repr(text)});"
        "pyautogui.hotkey('ctrl', 'v');"
        "pyperclip.copy(original_clipboard);"
    )
    if enter:
        commands += "pyautogui.press('enter');"
    # Return coordinates alongside the command only when an element was grounded.
    if click_coords is not None:
        return (commands, click_coords)
    else:
        return commands
@searcher_agent_action
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction

    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    # Hover over the target first, pause briefly, then scroll; `shift` selects
    # horizontal (hscroll) instead of vertical (vscroll) scrolling.
    scroll_call = f"pyautogui.hscroll({clicks})" if shift else f"pyautogui.vscroll({clicks})"
    command = f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); {scroll_call}"
    return (command, [x, y])
@searcher_agent_action
def hotkey(self, keys: List[str]):
    """Press a hotkey combination (can press a single key as well)

    Args:
        keys: List[str], the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # repr() quotes each key safely for the generated snippet (identical output
    # to the previous f"'{key}'" for ordinary keys, but robust to quotes).
    quoted_keys = [repr(key) for key in keys]
    return f"import pyautogui; pyautogui.hotkey({', '.join(quoted_keys)})"
@searcher_agent_action
def save_to_tutorial_notes(self, text: str):
    """Save high quality and useful information to a long-term knowledge bank for reuse during this search task.

    Args:
        text:str, the text to save to the tutorial notes
    """
    self.tutorial_notes.append(text)
    # Saving a note consumes the turn without acting on the environment.
    return "WAIT"
@searcher_agent_action
def wait(self, time: float):
    """Wait for a specified amount of time

    Args:
        time:float the amount of time to wait in seconds
    """
    # `time` here is the float argument; the emitted snippet imports the stdlib
    # time module in the environment where it is executed.
    return f"import time; time.sleep({time})"
@searcher_agent_action
def done(
    self,
    tutorial: str
):
    """End the current task with a success. Use this when you believe the entire task has been fully completed.

    Args:
        tutorial:str, A detailed, step-by-step tutorial compiled from the search results to be passed to the main agent.
    """
    # Stash the compiled tutorial so the search loop can return it as the answer.
    self.tutorial_or_hint = tutorial
    return "DONE"
@searcher_agent_action
def fail(
    self,
    hint: str
):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete.

    Args:
        hint:str, A hint or reason explaining why the search failed, or what kind of information was missing.
    """
    # Stash the failure hint so the search loop can surface it to the caller.
    self.tutorial_or_hint = hint
    return "FAIL"

View File

@@ -0,0 +1,340 @@
from functools import partial
import logging
from typing import Dict, List, Tuple
from mm_agents.os_symphony.agents.memoryer_agent import ReflectionMemoryAgent
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.core.module import BaseModule
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import (
call_llm_formatted,
extract_coords_from_action_dict,
parse_action_from_string,
parse_code_from_string,
create_pyautogui_code,
)
from mm_agents.os_symphony.utils.formatters import (
SINGLE_ACTION_FORMATTER,
CODE_VALID_FORMATTER,
)
logger = logging.getLogger("desktopenv.agent")
class Worker(BaseModule):
def __init__(
    self,
    engine_params_for_orchestrator: Dict,
    engine_params_for_memoryer: Dict,
    os_aci: OSACI,
    platform: str,
    client_password: str,
    max_trajectory_length: int = 8,
    enable_reflection: bool = True,
):
    """
    Worker receives the main task and generates actions, without the need of hierarchical planning

    Args:
        engine_params_for_orchestrator: Dict
            Parameters for the orchestrator (worker) agent
        engine_params_for_memoryer: Dict
            Parameters for the reflection/memory agent
        os_aci: OSACI
            The grounding agent to use
        platform: str
            OS platform the agent runs on (darwin, linux, windows)
        client_password: str
            Password used for privileged operations on the client VM
        max_trajectory_length: int
            The amount of image turns to keep
        enable_reflection: bool
            Whether to enable reflection
    """
    super().__init__(platform=platform)
    self.client_password = client_password
    self.temperature = engine_params_for_orchestrator.get("temperature", 0.0)
    self.tool_config = engine_params_for_orchestrator.get("tool_config", "")
    # Extended-thinking mode is enabled only for this fixed set of Claude models.
    thinking_models = {
        "claude-opus-4-20250514",
        "claude-sonnet-4-20250514",
        "claude-3-7-sonnet-20250219",
        "claude-sonnet-4-5-20250929",
    }
    self.use_thinking = engine_params_for_orchestrator.get("model", "") in thinking_models
    self.engine_params_for_orchestrator = engine_params_for_orchestrator
    self.engine_params_for_memoryer = engine_params_for_memoryer
    self.os_aci: OSACI = os_aci
    # When the first screenshot is pinned, reserve one image slot for it.
    if self.engine_params_for_orchestrator.get("keep_first_image", False):
        self.max_trajectory_length = max_trajectory_length - 1
    else:
        self.max_trajectory_length = max_trajectory_length
    self.enable_reflection = enable_reflection
    self.reset()
def reset(self):
    """Re-initialize per-task state: system prompt, sub-agents, and histories.

    Side Effects:
        - Rebuilds the orchestrator and reflection/memory agents.
        - Clears turn counters and the worker/coords/action histories.
    """
    # set_cell_values only occurs in linux; meanwhile there is no fail option in the other benchmarks
    if self.platform in ["windows", "macos"]:
        skipped_actions = ["set_cell_values", "fail"]
    else:
        skipped_actions = []
    # Hide code agent action entirely if no env/controller is available
    env = getattr(self.os_aci, "env", None)
    if not env or not getattr(env, "controller", None):
        skipped_actions.append("call_code_agent")
    self.orchestrator_sys_prompt = (
        PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(
            agent_class=type(self.os_aci),
            skipped_actions=skipped_actions,
            tool_config=self.tool_config,
            platform=self.platform
        )
        .replace("CURRENT_OS", self.platform)
        .replace("CLIENT_PASSWORD", self.client_password)
    )
    # Worker owns an orchestrator agent plus a reflection/memory agent
    self.orchestrator_agent = self._create_agent(
        engine_params=self.engine_params_for_orchestrator,
        system_prompt=self.orchestrator_sys_prompt
    )
    self.memoryer_agent = ReflectionMemoryAgent(self.engine_params_for_memoryer)
    self.instruction = None
    self.turn_count = 0
    self.worker_history = []
    self.coords_history = []
    # Used by loop detection across turns
    self.action_dict_history = []
def flush_messages(self):
    """Trim the orchestrator's message history to fit the model's context limits.

    Long-context engines (anthropic/openai/gemini/vllm) keep all text but retain
    only the most recent ``max_trajectory_length`` images; any other engine drops
    the oldest full [user, assistant] turn instead.

    Side Effects:
        - Mutates ``self.orchestrator_agent.messages`` in place.
    """
    engine_type = self.engine_params_for_orchestrator.get("engine_type", "")
    if engine_type in ["anthropic", "openai", "gemini", "vllm"]:
        keep = self.max_trajectory_length
        for agent in [self.orchestrator_agent]:
            if agent is None:
                continue
            seen = 0
            # With keep_first_image, never visit indices 0/1 so the first image
            # survives; otherwise scan all the way down to index 0.
            stop_idx = 1 if self.engine_params_for_orchestrator.get("keep_first_image", False) else -1
            for msg_idx in range(len(agent.messages) - 1, stop_idx, -1):
                content = agent.messages[msg_idx]["content"]
                for part_idx in range(len(content) - 1, -1, -1):
                    if "image" not in content[part_idx].get("type", ""):
                        continue
                    seen += 1
                    if seen > keep:
                        del content[part_idx]
    else:
        # Messages alternate [user, assistant] (2 per round); drop the oldest
        # turn while preserving the element at index 0.
        history = self.orchestrator_agent.messages
        if len(history) > 2 * self.max_trajectory_length + 1:
            del history[1:3]
def generate_next_action(self, instruction: str, obs: Dict, is_last_step: bool) -> Tuple[Dict, List]:
    """
    Predict the next action(s) based on the current observation.

    Args:
        instruction: The task instruction for this turn.
        obs: Current observation; must contain a "screenshot" entry.
        is_last_step: When True, "eager mode" forces a done/fail decision.

    Returns:
        Tuple of (executor_info dict, [exec_code]) where exec_code is the
        generated pyautogui code string to execute in the environment.
    """
    print("=" * 30, f"Turn {self.turn_count + 1}", "=" * 30)
    print("=" * 10)
    print(instruction)
    print("=" * 10)
    self.os_aci.assign_screenshot(obs)
    self.os_aci.set_task_instruction(instruction)
    generator_message = (
        ""
        if self.turn_count > 0
        else "The initial screen is provided. No action has been taken yet."
    )
    # Load the task into the system prompt
    if is_last_step:
        # Eager mode: must decide done / fail
        prompt_with_instructions = PROCEDURAL_MEMORY.construct_eager_mode_procedural_memory(agent_class=type(self.os_aci)).replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "CURRENT_OS", self.platform
        )
        print(f'Eager Mode Started, Instruction: {prompt_with_instructions}')
        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
        generator_message += "Note: 'EAGER MODE' is enabled. You must determine whether the task is done or fail in this step!!!"
    else:
        tutorials = ""
        for idx, t in enumerate(self.os_aci.tutorials, start=1):
            tutorials += f"### Tutorial {idx}:\n {t}\n"
        prompt_with_instructions = self.orchestrator_sys_prompt.replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "TUTORIAL_PLACEHOLDER", tutorials
        )
        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
    ### Reflection Part
    reflection_info = {}
    if self.enable_reflection:
        # set instruction to memory agent
        self.memoryer_agent.add_instruction(instruction)
        reflection = None
        # Differentiate the operation mode of last step
        last_code_summary = ""
        mode = "gui"
        if (
            hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
        ):
            # If code agent is called last step, we use its execution result as step behavior.
            code_result = self.os_aci.last_code_agent_result
            mode = "code"
            last_code_summary += f"Subtask Instruction: {code_result['task_instruction']}\nSteps Completed: {code_result['steps_executed']}\nCompletion Reason: {code_result['completion_reason']}\nExec Summary: {code_result['summary']}\n"
        if (
            hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
        ):
            mode = "search"
        # Retrieve the reflection on the previous action
        reflection_info = self.memoryer_agent.get_reflection(
            cur_obs=obs,
            # only use the string after "(next action)" in orchestrator's output
            generator_output=parse_action_from_string(self.worker_history[-1]) if self.turn_count != 0 else "",
            coordinates=self.coords_history[-1] if self.turn_count != 0 else [],
            mode=mode,
            code_exec_summary=last_code_summary,
            action_dict=self.action_dict_history[-1] if self.turn_count != 0 else {}
        )
        reflection = reflection_info['reflection']
        logger.info(f'[Reflection]: {reflection}')
        if reflection:
            generator_message += f"REFLECTION: You MUST use this reflection on the latest action:\n{reflection}\n"
        else:
            generator_message += "You should go on with your plan.\n"
    else:
        generator_message += "You should go on with your plan.\n"
    # Add code agent result from previous step if available (from full task or subtask execution)
    if (
        hasattr(self.os_aci, "last_code_agent_result")
        and self.os_aci.last_code_agent_result is not None
    ):
        code_result = self.os_aci.last_code_agent_result
        generator_message += f"\nCODE AGENT RESULT:\n"
        generator_message += (
            f"Task/Subtask Instruction: {code_result['task_instruction']}\n"
        )
        generator_message += f"Steps Completed: {code_result['steps_executed']}\n"
        generator_message += f"Max Steps: {code_result['budget']}\n"
        generator_message += (
            f"Completion Reason: {code_result['completion_reason']}\n"
        )
        generator_message += f"Summary: {code_result['summary']}\n"
        generator_message += "\n"
        # Reset the code agent result after adding it to context
        self.os_aci.last_code_agent_result = None
    if (
        hasattr(self.os_aci, "last_search_agent_result")
        and self.os_aci.last_search_agent_result is not None
    ):
        # Retrieve the result dictionary
        search_result = self.os_aci.last_search_agent_result
        # Add a clear, distinct header for this section in the prompt
        generator_message += f"\nSEARCH AGENT RESULT:\n"
        # Add contextual metadata from the search task
        generator_message += f"Search Query: {search_result['query']}\n"
        generator_message += f"Search Completion Reason: {search_result['completion_reason']}\n"
        generator_message += "Search Result: "
        # Add the most important part: the tutorial found by the agent.
        # This is given a prominent sub-header so the LLM knows to pay close attention.
        if search_result["completion_reason"] == "DONE":
            generator_message += f'Search is completed, the tutorial it found has been already added to your system prompt.\n'
        elif search_result["completion_reason"] == "FAIL":
            generator_message += f"Search is fail, the failure reason or the hint is as follow: {search_result['final_answer']}\n"
        # CRITICAL: Reset the search agent result after adding it to the context.
        # This prevents it from being added to the prompt again in the next turn.
        self.os_aci.last_search_agent_result = None
    # Finalize the generator message
    self.orchestrator_agent.add_message(
        generator_message, image_content=obs["screenshot"], role="user", put_text_last=True
    )
    # Generate the plan and next action
    format_checkers = [
        SINGLE_ACTION_FORMATTER,
        partial(CODE_VALID_FORMATTER, self.tool_config),
    ]
    plan = call_llm_formatted(
        self.orchestrator_agent,
        format_checkers,
        # BUG FIX: the key was misspelled "temperture", so the configured
        # temperature was silently ignored and the 0.1 default always used
        # (note __init__ reads the correctly spelled "temperature" key).
        temperature=self.engine_params_for_orchestrator.get("temperature", 0.1),
        use_thinking=self.use_thinking,
    )
    self.worker_history.append(plan)
    self.orchestrator_agent.add_message(plan, role="assistant")
    logger.info("PLAN:\n %s", plan)
    # Extract the next action from the plan.
    # At this point plan_code looks like e.g. agent.click('xxxxx', 1)
    plan_code = parse_code_from_string(plan)
    action_dict, coordinates = None, None
    try:
        assert plan_code, "Plan code should not be empty"
        # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
        exec_code, action_dict = create_pyautogui_code(self.os_aci, plan_code, obs)
        coordinates = extract_coords_from_action_dict(action_dict)
    except Exception as e:
        logger.error(
            f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
        )
        exec_code, action_dict = self.os_aci.wait(
            1.333
        )  # Skip a turn if the code cannot be evaluated
    self.action_dict_history.append(action_dict)
    executor_info = {
        # NOTE(review): self.instruction is set to None in reset() and never
        # reassigned in this method — confirm whether a caller sets it.
        "refined_instruction": self.instruction,
        "plan": plan,
        "plan_code": plan_code,
        "exec_code": exec_code,
        "coordinates": coordinates,
        "reflection": reflection_info,
        "code_agent_output": (
            self.os_aci.last_code_agent_result
            if hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
            else None
        ),
        "search_agent_output": (
            self.os_aci.last_search_agent_result
            if hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
            else None
        )
    }
    self.turn_count += 1
    self.coords_history.append(coordinates)
    self.flush_messages()
    return executor_info, [exec_code]