# sci-gui-agent-benchmark/mm_agents/os_symphony/agents/memoryer_agent.py
from ast import parse
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
from mm_agents.os_symphony.utils.common_utils import (
call_llm_formatted,
enhance_observation,
parse_code_from_string
)
from functools import partial
from mm_agents.os_symphony.utils.formatters import JSON_ANSWER_FORMATTER
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import imagehash
import io
import os
from PIL import Image
import numpy as np
from skimage.metrics import structural_similarity as ssim
logger = logging.getLogger("desktopenv.agent")


class StepBehavior:
"""
    Narrative Step Behavior.

    Describes a single step: the generative agent's (Main Agent's) output, a screenshot
    (if the step is a milestone), and a textual description.
    The textual description records how the agent reasoned and acted, and how the state changed.
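
    Example (illustrative values only; the action_dict fields are placeholders, not a fixed schema):
        StepBehavior(
            is_milestone=True,
            gen_output="<raw Main Agent output>",
            summary="Clicked 'Save'; a confirmation dialog appeared.",
            obs={"screenshot": b"<png bytes>"},
            action_dict={"type": "click", "coordinate": [512, 384]},
        )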
"""
def __init__(self, is_milestone: bool, gen_output: str, summary: str, obs: Dict, action_dict: Dict):
self.is_milestone = is_milestone
self.gen_output = gen_output
self.obs = obs
self.summary = summary
self.action_dict = action_dict
        # Cached values for reducing the time complexity of loop detection
# --- 1. pHash ---
self.phash = None
# --- 2. SSIM ---
self.ssim_list = []

    def _update_phash_ssim(self, history: List["StepBehavior"]):
        # Compute the current observation's pHash and its SSIM against every step in `history`.
        # Update pHash
cur_img = Image.open(io.BytesIO(self.obs["screenshot"]))
cur_img_gray = cur_img.convert('L')
cur_img_np = np.array(cur_img_gray)
self.phash = imagehash.phash(cur_img)
# Update ssim_list
for hs in history:
compare_img = Image.open(io.BytesIO(hs.obs["screenshot"]))
compare_img_gray = compare_img.convert('L')
compare_img_np = np.array(compare_img_gray)
            # data_range must span the pixel-value range of the compared grayscale images.
            self.ssim_list.append(
                ssim(
                    cur_img_np,
                    compare_img_np,
                    data_range=max(cur_img_np.max(), compare_img_np.max()) - min(cur_img_np.min(), compare_img_np.min()),
                )
            )
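

# Illustrative sketch only (not referenced by the agent): how the cached pHash and SSIM values
# could drive a quick near-duplicate screen check. The actual loop detection lives in
# mm_agents.os_symphony.utils.loop_detection.detect_loop; the thresholds below are assumptions,
# not values taken from that module.
def _screens_look_identical(cur: StepBehavior, history: List[StepBehavior], prev_idx: int,
                            phash_threshold: int = 4, ssim_threshold: float = 0.95) -> bool:
    """Heuristically decide whether the current screen repeats the screen of history[prev_idx]."""
    if prev_idx >= len(history) or prev_idx >= len(cur.ssim_list):
        return False
    prev = history[prev_idx]
    if cur.phash is None or prev.phash is None:
        return False
    hamming = cur.phash - prev.phash  # imagehash overloads '-' as the Hamming distance
    return hamming <= phash_threshold and cur.ssim_list[prev_idx] >= ssim_threshold
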
class ReflectionMemoryAgent:
"""
Reflection Memory Agent (RMA).
Responsible for maintaining long-term memory, extracting narratives from trajectories,
providing reflections to the Main Agent, and validating task completion status.
"""
def __init__(self, engine_params: Dict):
"""
Initialize the RMA.
Args:
- engine_params:
{
"engine_type": args.provider,
"model": args.model,
"base_url": args.model_url,
"api_key": args.model_api_key,
"temperature": getattr(args, "model_temperature", None),
}
            - Optional keys "max_images" (maximum number of screenshots used during reflection,
              default 8) and "memoryer_level" (reflection level; 0 disables reflection, default 3)
              are also read from engine_params.
"""
self.engine_params = engine_params
self.max_images = engine_params.get('max_images', 8)
self.memoryer_level = engine_params.get('memoryer_level', 3)
self.reset()
logger.info(f"ReflectionMemoryAgent initialized with:\n {self.engine_params}")
def reset(self):
"""Reset the code agent state."""
logger.debug("Resetting RMA state")
self.instruction = None
self.trajectory: List[StepBehavior] = []
self.knowledge_base: List[str] = []
self.last_code_step_idx = -1
'''
        Control the number of retained screenshots: at most `max_images` are used.
        Update logic: the 0-th screenshot is always retained. While fewer than `max_images`
        screenshots exist, all are kept; after that, milestone screenshots are managed via FIFO
        starting from index 1.
'''
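        # Example (derived from _update_trajectory): with max_images = 4 the indices evolve
        # [0] -> [0, 1] -> [0, 1, 2] -> [0, 1, 2, 3]; when the next milestone step k arrives,
        # index 1 is evicted and index 0 stays pinned, giving [0, 2, 3, k].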
self.active_img_idx = []
self.reflection_agent = LMMAgent(
engine_params=self.engine_params,
system_prompt=PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT,
)
self.behavior_agent = LMMAgent(
engine_params=self.engine_params,
system_prompt=PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT
)
def add_instruction(self, instruction):
"""
[Interface] Main -> RMA
        The Main Agent sets the task instruction for the RMA.
"""
self.instruction = instruction
def _update_trajectory(self, step_behavior):
self.trajectory.append(step_behavior)
if len(self.active_img_idx) >= self.max_images:
if step_behavior.is_milestone:
                self.active_img_idx.append(len(self.trajectory) - 1)  # at capacity: only milestone screenshots are added
del self.active_img_idx[1] # FIFO starts from index 1
else:
            self.active_img_idx.append(len(self.trajectory) - 1)  # below max_images: keep every screenshot
assert len(self.active_img_idx) <= self.max_images, "[RMA] Errors in updating StepBehavior!!"
def _summarize_step_behavior(
self,
generator_output: str,
cur_obs: Dict,
enhanced_obs: bytes | None,
is_milestone: bool,
mode: str = "gui",
code_exec_summary: str = "",
action_dict: Dict = {}
) -> Tuple[StepBehavior, bool]:
"""
        Internal helper used by `get_reflection` ([Interface] Main -> RMA).
        Summarizes the information of the just-completed step that the Main Agent (MA) fed in,
        and packages it as a StepBehavior together with a success judgment for the last operation.
"""
if mode == "search":
is_success = "successful"
# summary is fixed
step_behavior = StepBehavior(
False,
generator_output,
"Search Agent was called last step, and a tutorial has been generated.",
cur_obs,
action_dict
)
elif mode == "code":
self.last_code_step_idx = len(self.trajectory)
is_success = "successful"
# the summary returned by the code agent
step_behavior = StepBehavior(
False,
generator_output,
f"Code Agent was called last step, and the summary of its trajectory is: \n---\n{code_exec_summary}\n---",
cur_obs,
action_dict
)
else: # common gui operation, use LLM to summarize
prev_obs = self.trajectory[-1].obs
text_content = f"""Computer Use Agent's Output: \n{generator_output}"""
self.behavior_agent.reset() # don't need history messages
updated_sys_prompt = (
self.behavior_agent.system_prompt + "\n" + text_content
)
self.behavior_agent.add_system_prompt(updated_sys_prompt)
self.behavior_agent.add_message(
text_content="This is the observation before executing action (attached below).",
image_content=prev_obs['screenshot'],
role="user",
put_text_last=False
)
self.behavior_agent.add_message(
text_content="This is the zoom-in view, which may help you to identify the operational region (attached below).",
image_content=enhanced_obs,
role="user",
put_text_last=False
)
self.behavior_agent.add_message(
text_content="This is the observation after executing action (attached below).",
image_content=cur_obs['screenshot'],
role="user",
put_text_last=False
)
required_fields = ["summary", "evaluation"]
format_checkers = [
partial(JSON_ANSWER_FORMATTER, required_fields)
]
full_response = call_llm_formatted(
self.behavior_agent,
format_checkers,
                temperature=self.engine_params.get("temperature", 0.1),
)
response = parse_code_from_string(full_response)
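            # Expected JSON payload (illustrative values):
            #   {"summary": "Clicked the 'File' menu; a dropdown opened.", "evaluation": "successful"}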
try:
data = json.loads(response)
behavior_summary = data['summary']
is_success = data["evaluation"]
except Exception as e:
print("[RMA] Errors in generating step summary: ", e)
logger.info("Response is not a JSON object or miss required keys!")
behavior_summary = response
is_success = "successful"
step_behavior = StepBehavior(is_milestone, generator_output, behavior_summary, cur_obs, action_dict)
return step_behavior, is_success == "successful"
def get_reflection(
self,
cur_obs: Dict,
generator_output: str,
coordinates: List,
mode: str="gui",
code_exec_summary: str = "",
action_dict: Dict = {}
) -> Dict:
"""
[Interface] RMA -> Main
The Main Agent (MA) calls this method to get RMA's reflection before deciding the next action.
Args:
- cur_obs (Dict): The Main Agent's current observation (o_k).
- generator_output (str): The thoughts, screen analysis and action of Main Agent.
            - coordinates (List): coordinates used in the Main Agent's last operation step.
            - mode (str): one of [gui, code, search]. Indicates which agent the Main Agent called in the last step.
            - code_exec_summary (str): execution summary from the Code Agent.
            - action_dict (Dict): action extracted from the generator output.
Returns:
            - reflection_info (Dict): all information related to the reflection.
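              Illustrative shape (mirrors the dict built at the end of this method; values are examples):
                  {
                      "reflection": "<advice for the Main Agent's next step>",
                      "reflection_thoughts": "<full reflection response>",
                      "existing_knowledge": "<accumulated knowledge entries>",
                      "is_milestone": False,
                      "new_knowledge": "",
                      "step_summary": "<summary of the last step>",
                      "hint": {
                          "gui_operation_error": False,
                          "lack_of_tutorial": False,
                          "code_error": False,
                          "loop_detection": None,
                      },
                  }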
"""
if self.memoryer_level == 0:
return {
"reflection": None,
"reflection_thoughts": None,
"existing_knowledge": None,
"is_milestone": False,
"new_knowledge": None,
"step_summary": None,
"hint": {
"gui_operation_error": False,
"lack_of_tutorial": False,
"code_error": False,
"loop_detection": None,
}
}
reflection = None
reflection_thought = None
if len(self.trajectory) == 0:
step_behavior = StepBehavior(
True,
"The initial screen is provided. No action has been taken yet.",
"The initial screen is provided. No action has been taken yet.",
cur_obs,
action_dict
)
step_behavior._update_phash_ssim(self.trajectory)
self._update_trajectory(step_behavior)
            reflection_info = {
                "reflection": reflection,
                "reflection_thoughts": reflection_thought,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": True,
                "new_knowledge": "",
                "step_summary": "",
                "hint": {
                    "gui_operation_error": False,
                    "lack_of_tutorial": False,
                    "code_error": False,
                    "loop_detection": None,
                },
            }
else:
### Step Summary
prev_obs = self.trajectory[-1].obs
enhanced_obs = None
if coordinates:
enhanced_obs, _, _, _, _ = enhance_observation(
prev_obs["screenshot"],
coordinates,
draw=True
)
# generate step behavior
step_behavior, last_gui_check = self._summarize_step_behavior(
generator_output,
cur_obs,
enhanced_obs,
False,
mode,
code_exec_summary,
action_dict
)
step_behavior._update_phash_ssim(self.trajectory)
### make additional hints
additional_hints = []
if not last_gui_check:
additional_hints.append(f"\t- Warning: The last GUI operation is unsuccessful. Careful review is required to avoid GUI Operation Error.")
            code_error_hint = False
            # Assumed verification window: hint when the Code Agent ran in the current or previous step.
            if self.last_code_step_idx != -1 and len(self.trajectory) - self.last_code_step_idx <= 1:
                code_error_hint = True
                additional_hints.append("\t- Warning: The Computer Use Agent might be in the verification stage of the Code Agent. Careful review is required to avoid Code Error.")
# loop detection
from mm_agents.os_symphony.utils.loop_detection import detect_loop
is_loop, loop_details = detect_loop(full_trajectory=self.trajectory, N=3)
if is_loop and loop_details:
match_sequence_indices = loop_details["match_sequence_indices"]
loop_hint_message = f"\t- Warning: A potential LOOP has been detected between Step {match_sequence_indices[0]} and Step {match_sequence_indices[-1]}. Careful review is required to avoid Repetitive Behavior Error."
additional_hints.append(loop_hint_message)
self.reflection_agent.reset()
updated_sys_prompt = (
PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT + "\n\n" +
f"---\n- **user instruction**: {self.instruction}\n" +
"- **existing knowledge**: \n" + "\n".join(self.knowledge_base) +
"\n- **additional_hints**: " + "\n".join(additional_hints) + "\n---"
)
# update system prompt
self.reflection_agent.add_system_prompt(updated_sys_prompt)
for i, step in enumerate(self.trajectory):
text_content = f"""### (Step {i}) history:\nsummary: '''\n{step.summary}\n'''"""
if i in self.active_img_idx:
                    if i == 0:
                        text_content += "\ninitial screenshot: (attached below)"
                    else:
                        text_content += "\nscreenshot (after executing action): (attached below)"
self.reflection_agent.add_message(
text_content=text_content,
image_content=step.obs['screenshot'] if i in self.active_img_idx else None,
role="user",
put_text_last=False
)
text_content = f"""### (Last Step) CUA's output (has been finished):\n---\n{generator_output}\n---\nStep Summary:\n---\n{step_behavior.summary}\n---\nlatest_screenshot: (attached below)"""
self.reflection_agent.add_message(
text_content=text_content,
image_content=cur_obs['screenshot'],
role="user",
put_text_last=False
)
required_fields = ["is_milestone", "reflection", "knowledge"]
format_checkers = [
partial(JSON_ANSWER_FORMATTER, required_fields)
]
full_response = call_llm_formatted(
self.reflection_agent,
format_checkers
)
reflection_thought = full_response
response = parse_code_from_string(full_response)
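            # Expected JSON payload (illustrative values):
            #   {"is_milestone": "false", "reflection": "<advice for the next step>", "knowledge": "<reusable fact, or an empty string>"}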
try:
data = json.loads(response)
reflection = data['reflection']
is_milestone = data["is_milestone"]
knowledge = data['knowledge']
except Exception as e:
print("[RMA] Errors in dealing with reflection: ", e)
logger.info("Response is not a JSON object or miss required keys!")
reflection = response
is_milestone = False
knowledge = ""
if len(knowledge) > 0:
self.knowledge_base.append(knowledge)
if isinstance(is_milestone, str):
                is_milestone = "true" in is_milestone.lower()
# update trajectory and is_milestone
self._update_trajectory(step_behavior)
if mode == "gui": # only gui opration can be considered as milestone
self.trajectory[-1].is_milestone = is_milestone
reflection_info = {
"reflection": reflection,
"reflection_thoughts": reflection_thought,
"existing_knowledge": "\n".join(self.knowledge_base),
"is_milestone": is_milestone,
"new_knowledge": knowledge,
"step_summary": step_behavior.summary,
"hint": {
"gui_operation_error": not last_gui_check,
"lack_of_tutorial": is_loop,
"code_error": code_error_hint,
"loop_detection": loop_details,
}
}
return reflection_info