import logging
from functools import partial
from typing import Dict, List, Tuple

from mm_agents.os_symphony.agents.memoryer_agent import ReflectionMemoryAgent
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.core.module import BaseModule
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import (
    call_llm_formatted,
    create_pyautogui_code,
    extract_coords_from_action_dict,
    parse_action_from_string,
    parse_code_from_string,
)
from mm_agents.os_symphony.utils.formatters import (
    CODE_VALID_FORMATTER,
    SINGLE_ACTION_FORMATTER,
)

logger = logging.getLogger("desktopenv.agent")


class Worker(BaseModule):
    def __init__(
        self,
        engine_params_for_orchestrator: Dict,
        engine_params_for_memoryer: Dict,
        os_aci: OSACI,
        platform: str,
        client_password: str,
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """
        Worker receives the main task and generates actions directly, without
        the need for hierarchical planning.

        Args:
            engine_params_for_orchestrator: Dict
                Engine parameters for the orchestrator agent
            engine_params_for_memoryer: Dict
                Engine parameters for the reflection/memory agent
            os_aci: OSACI
                The grounding agent to use
            platform: str
                OS platform the agent runs on (darwin, linux, windows)
            client_password: str
                Client machine password, substituted into the system prompt
            max_trajectory_length: int
                The number of image turns to keep in the message history
            enable_reflection: bool
                Whether to enable reflection
        """
        super().__init__(platform=platform)
        self.client_password = client_password

        self.temperature = engine_params_for_orchestrator.get("temperature", 0.0)
        self.tool_config = engine_params_for_orchestrator.get("tool_config", "")
        self.use_thinking = engine_params_for_orchestrator.get("model", "") in [
            "claude-opus-4-20250514",
            "claude-sonnet-4-20250514",
            "claude-3-7-sonnet-20250219",
            "claude-sonnet-4-5-20250929",
        ]
        self.engine_params_for_orchestrator = engine_params_for_orchestrator
        self.engine_params_for_memoryer = engine_params_for_memoryer
        self.os_aci: OSACI = os_aci

        # Reserve one slot for the pinned first image when keep_first_image is set.
        self.max_trajectory_length = (
            max_trajectory_length - 1
            if self.engine_params_for_orchestrator.get("keep_first_image", False)
            else max_trajectory_length
        )
        self.enable_reflection = enable_reflection
        self.reset()

    def reset(self):
        # set_cell_values is only available on linux; the other benchmarks also
        # have no fail option.
        if self.platform in ["windows", "macos"]:
            skipped_actions = ["set_cell_values", "fail"]
        else:
            skipped_actions = []

        # Hide the code agent action entirely if no env/controller is available.
        if not getattr(self.os_aci, "env", None) or not getattr(
            getattr(self.os_aci, "env", None), "controller", None
        ):
            skipped_actions.append("call_code_agent")

        self.orchestrator_sys_prompt = (
            PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(
                agent_class=type(self.os_aci),
                skipped_actions=skipped_actions,
                tool_config=self.tool_config,
                platform=self.platform,
            )
            .replace("CURRENT_OS", self.platform)
            .replace("CLIENT_PASSWORD", self.client_password)
        )

        # The worker contains an orchestrator agent and a reflection agent.
        self.orchestrator_agent = self._create_agent(
            engine_params=self.engine_params_for_orchestrator,
            system_prompt=self.orchestrator_sys_prompt,
        )
        self.memoryer_agent = ReflectionMemoryAgent(self.engine_params_for_memoryer)

        self.instruction = None
        self.turn_count = 0
        self.worker_history = []
        self.coords_history = []

        # For loop detection
        self.action_dict_history = []

    def flush_messages(self):
        """Flush messages based on the model's context limits.

        This method ensures that the agent's message history does not exceed
        the maximum trajectory length.

        Side Effects:
            - Modifies the orchestrator agent's messages in place so they fit
              within the context limits.
        """
        engine_type = self.engine_params_for_orchestrator.get("engine_type", "")

        # Flush strategy for long-context models: keep all text, but only the latest images.
        if engine_type in ["anthropic", "openai", "gemini", "vllm"]:
            max_images = self.max_trajectory_length
            for agent in [self.orchestrator_agent]:
                if agent is None:
                    continue
                # Keep the latest k images, walking the history from newest to oldest.
                img_count = 0
                stop_idx = 1 if self.engine_params_for_orchestrator.get("keep_first_image", False) else -1
                for i in range(len(agent.messages) - 1, stop_idx, -1):
                    for j in range(len(agent.messages[i]["content"]) - 1, -1, -1):
                        if "image" in agent.messages[i]["content"][j].get("type", ""):
                            img_count += 1
                            if img_count > max_images:
                                del agent.messages[i]["content"][j]

        # Flush strategy for non-long-context models: drop full turns.
        else:
            # Orchestrator messages alternate [user, assistant], so 2 per round;
            # index 0 is the system prompt, so pop the oldest user/assistant pair.
            if len(self.orchestrator_agent.messages) > 2 * self.max_trajectory_length + 1:
                self.orchestrator_agent.messages.pop(1)
                self.orchestrator_agent.messages.pop(1)

    def generate_next_action(self, instruction: str, obs: Dict, is_last_step: bool) -> Tuple[Dict, List]:
        """
        Predict the next action(s) based on the current observation.

        Returns:
            A tuple (executor_info, actions): executor_info collects the plan,
            parsed code, coordinates, and agent outputs for this turn, and
            actions is a single-element list with the executable pyautogui code.
        """
        print("=" * 30, f"Turn {self.turn_count + 1}", "=" * 30)

        print("=" * 10)
        print(instruction)
        print("=" * 10)

        self.os_aci.assign_screenshot(obs)
        self.os_aci.set_task_instruction(instruction)

        generator_message = (
            ""
            if self.turn_count > 0
            else "The initial screen is provided. No action has been taken yet."
        )

        # Load the task into the system prompt.
        if is_last_step:
            # Eager mode: the agent must decide done / fail on this step.
            prompt_with_instructions = PROCEDURAL_MEMORY.construct_eager_mode_procedural_memory(
                agent_class=type(self.os_aci)
            ).replace(
                "TASK_DESCRIPTION", instruction
            ).replace(
                "CURRENT_OS", self.platform
            )
            print(f"Eager Mode Started, Instruction: {prompt_with_instructions}")
            self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
            generator_message += "Note: 'EAGER MODE' is enabled. You must determine whether the task is done or failed in this step!"
        else:
            tutorials = ""
            for idx, t in enumerate(self.os_aci.tutorials, start=1):
                tutorials += f"### Tutorial {idx}:\n {t}\n"

            prompt_with_instructions = self.orchestrator_sys_prompt.replace(
                "TASK_DESCRIPTION", instruction
            ).replace(
                "TUTORIAL_PLACEHOLDER", tutorials
            )

            self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
        ### Reflection Part
        reflection_info = {}
        if self.enable_reflection:
            # Pass the instruction to the memory agent.
            self.memoryer_agent.add_instruction(instruction)
            # Determine the operation mode of the last step.
            last_code_summary = ""
            mode = "gui"
            if (
                hasattr(self.os_aci, "last_code_agent_result")
                and self.os_aci.last_code_agent_result is not None
            ):
                # If the code agent was called last step, use its execution result as the step behavior.
                code_result = self.os_aci.last_code_agent_result
                mode = "code"
                last_code_summary += f"Subtask Instruction: {code_result['task_instruction']}\nSteps Completed: {code_result['steps_executed']}\nCompletion Reason: {code_result['completion_reason']}\nExec Summary: {code_result['summary']}\n"

            if (
                hasattr(self.os_aci, "last_search_agent_result")
                and self.os_aci.last_search_agent_result is not None
            ):
                mode = "search"

            # Retrieve the reflection on the latest action.
            reflection_info = self.memoryer_agent.get_reflection(
                cur_obs=obs,
                # Only use the string after "(next action)" in the orchestrator's output.
                generator_output=parse_action_from_string(self.worker_history[-1]) if self.turn_count != 0 else "",
                coordinates=self.coords_history[-1] if self.turn_count != 0 else [],
                mode=mode,
                code_exec_summary=last_code_summary,
                action_dict=self.action_dict_history[-1] if self.turn_count != 0 else {},
            )
            reflection = reflection_info["reflection"]
            logger.info(f"[Reflection]: {reflection}")
            if reflection:
                generator_message += f"REFLECTION: You MUST use this reflection on the latest action:\n{reflection}\n"
            else:
                generator_message += "You should go on with your plan.\n"
        else:
            generator_message += "You should go on with your plan.\n"

        # Add the code agent result from the previous step if available
        # (from full-task or subtask execution).
        code_agent_output = None
        if (
            hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
        ):
            code_result = self.os_aci.last_code_agent_result
            # Keep a local reference so executor_info can still report the
            # result after the attribute is cleared below.
            code_agent_output = code_result
            generator_message += "\nCODE AGENT RESULT:\n"
            generator_message += (
                f"Task/Subtask Instruction: {code_result['task_instruction']}\n"
            )
            generator_message += f"Steps Completed: {code_result['steps_executed']}\n"
            generator_message += f"Max Steps: {code_result['budget']}\n"
            generator_message += (
                f"Completion Reason: {code_result['completion_reason']}\n"
            )
            generator_message += f"Summary: {code_result['summary']}\n"
            generator_message += "\n"
            # Reset the code agent result after adding it to the context.
            self.os_aci.last_code_agent_result = None

        search_agent_output = None
        if (
            hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
        ):
            # Retrieve the result dictionary, keeping a local reference for
            # executor_info before the attribute is cleared below.
            search_result = self.os_aci.last_search_agent_result
            search_agent_output = search_result

            # Add a clear, distinct header for this section in the prompt.
            generator_message += "\nSEARCH AGENT RESULT:\n"

            # Add contextual metadata from the search task.
            generator_message += f"Search Query: {search_result['query']}\n"
            generator_message += f"Search Completion Reason: {search_result['completion_reason']}\n"
            generator_message += "Search Result: "
            # The most important part is the tutorial found by the agent; it is
            # called out explicitly so the LLM pays close attention to it.
            if search_result["completion_reason"] == "DONE":
                generator_message += "Search completed; the tutorial it found has already been added to your system prompt.\n"
            elif search_result["completion_reason"] == "FAIL":
                generator_message += f"Search failed; the failure reason or hint is as follows: {search_result['final_answer']}\n"

            # CRITICAL: Reset the search agent result after adding it to the context.
            # This prevents it from being added to the prompt again in the next turn.
            self.os_aci.last_search_agent_result = None

        # Finalize the generator message.
        self.orchestrator_agent.add_message(
            generator_message, image_content=obs["screenshot"], role="user", put_text_last=True
        )

        # Generate the plan and next action; the format checkers validate the
        # LLM output before it is accepted.
        format_checkers = [
            SINGLE_ACTION_FORMATTER,
            partial(CODE_VALID_FORMATTER, self.tool_config),
        ]
        plan = call_llm_formatted(
            self.orchestrator_agent,
            format_checkers,
            temperature=self.engine_params_for_orchestrator.get("temperature", 0.1),
            use_thinking=self.use_thinking,
        )
        self.worker_history.append(plan)
        self.orchestrator_agent.add_message(plan, role="assistant")
        logger.info("PLAN:\n %s", plan)
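
        # The plan is expected to contain a "(next action)" section followed by
        # a code snippet invoking a single agent action (the format enforced by
        # SINGLE_ACTION_FORMATTER and consumed by the parsing helpers below).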

        # Extract the next action from the plan.
        # The plan code at this point looks like: agent.click('xxxxx', 1)
        plan_code = parse_code_from_string(plan)
        action_dict, coordinates = None, None
        try:
            assert plan_code, "Plan code should not be empty"
            # exec_code, e.g.: import pyautogui; pyautogui.click(1, 2)
            exec_code, action_dict = create_pyautogui_code(self.os_aci, plan_code, obs)
            coordinates = extract_coords_from_action_dict(action_dict)
        except Exception as e:
            logger.error(
                f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
            )
            # Skip a turn if the code cannot be evaluated.
            exec_code, action_dict = self.os_aci.wait(1.333)

        self.action_dict_history.append(action_dict)

        executor_info = {
            "refined_instruction": self.instruction,
            "plan": plan,
            "plan_code": plan_code,
            "exec_code": exec_code,
            "coordinates": coordinates,
            "reflection": reflection_info,
            "code_agent_output": code_agent_output,
            "search_agent_output": search_agent_output,
        }
        self.turn_count += 1
        self.coords_history.append(coordinates)
        self.flush_messages()
        return executor_info, [exec_code]
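

# Illustrative usage sketch (hypothetical engine parameters; the OSACI
# instance, observation dict, and client password come from the surrounding
# framework, so this is a shape reference rather than a runnable script):
#
#     worker = Worker(
#         engine_params_for_orchestrator={
#             "model": "claude-sonnet-4-20250514",
#             "engine_type": "anthropic",
#             "temperature": 0.1,
#         },
#         engine_params_for_memoryer={"model": "claude-sonnet-4-20250514"},
#         os_aci=os_aci,
#         platform="linux",
#         client_password=client_password,
#     )
#     executor_info, actions = worker.generate_next_action(
#         instruction="Open the file manager", obs=obs, is_last_step=False
#     )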