from ast import parse
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
from mm_agents.os_symphony.utils.common_utils import (
    call_llm_formatted,
    enhance_observation,
    parse_code_from_string
)
from functools import partial
from mm_agents.os_symphony.utils.formatters import JSON_ANSWER_FORMATTER
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import imagehash
import io
import os
from PIL import Image
import numpy as np
from skimage.metrics import structural_similarity as ssim

logger = logging.getLogger("desktopenv.agent")


class StepBehavior:
    """
    Narrative Step Behavior.

    Description of each step: consists of the generative agent (main agent)'s
    output, the observation (whose screenshot is shown to the reflection model
    when the step is in the active image window), and a textual description.
    The textual description shows how the agent thought and acted, and how the
    state changed.
    """

    def __init__(self, is_milestone: bool, gen_output: str, summary: str, obs: Dict, action_dict: Dict):
        """
        Args:
            - is_milestone: whether this step is flagged as a milestone.
            - gen_output: raw output of the main agent for this step.
            - summary: textual summary of the step.
            - obs: observation dict; ``obs["screenshot"]`` is read as encoded image bytes.
            - action_dict: action extracted from the main agent's output.
        """
        self.is_milestone = is_milestone
        self.gen_output = gen_output
        self.obs = obs
        self.summary = summary
        self.action_dict = action_dict
        # Cached values for optimizing the time complexity of loop detection
        # --- 1. pHash ---
        self.phash = None
        # --- 2. SSIM --- (ssim_list[i] is the SSIM against history[i])
        self.ssim_list = []

    def _update_phash_ssim(self, history: List):
        """
        Compute the perceptual hash of this step's screenshot and its SSIM
        against the screenshot of every step in ``history``.

        Args:
            - history: earlier StepBehavior objects, in trajectory order.
        """
        with Image.open(io.BytesIO(self.obs["screenshot"])) as cur_img:
            cur_img_np = np.array(cur_img.convert('L'))
            self.phash = imagehash.phash(cur_img)

        scores = []
        for hs in history:
            with Image.open(io.BytesIO(hs.obs["screenshot"])) as compare_img:
                compare_img_np = np.array(compare_img.convert('L'))
            # 'L' mode arrays are uint8, so the dynamic range is the constant 255.
            # Deriving it from max() of one image minus min() of the other is
            # meaningless and can wrap around in uint8 arithmetic.
            scores.append(ssim(cur_img_np, compare_img_np, data_range=255))
        # Rebuild instead of appending so a repeated call cannot duplicate entries.
        self.ssim_list = scores


class ReflectionMemoryAgent:
    """
    Reflection Memory Agent (RMA).

    Responsible for maintaining long-term memory, extracting narratives from
    trajectories, providing reflections to the Main Agent, and validating task
    completion status.
    """

    def __init__(self, engine_params: Dict):
        """
        Initialize the RMA.

        Args:
            - engine_params:
                {
                    "engine_type": args.provider,
                    "model": args.model,
                    "base_url": args.model_url,
                    "api_key": args.model_api_key,
                    "temperature": getattr(args, "model_temperature", None),
                }
              Optional extra keys read here:
                - max_images: max number of screenshots used in the reflection process (default 8)
                - memoryer_level: 0 disables reflection entirely (default 3)
                - code_verification_window: number of steps after a Code Agent call during
                  which the "code error" hint is emitted (default 0, i.e. never)
        """
        self.engine_params = engine_params
        self.max_images = engine_params.get('max_images', 8)
        self.memoryer_level = engine_params.get('memoryer_level', 3)
        # NOTE(review): the original condition was `steps_since_code < 0`, which can never
        # hold. The intended window is unknown, so it is configurable and defaults to 0 to
        # keep the existing behavior (hint never emitted).
        self.code_verification_window = engine_params.get('code_verification_window', 0)
        self.reset()
        # Never write credentials to the log.
        loggable_params = {
            key: ("***" if key == "api_key" else value)
            for key, value in engine_params.items()
        }
        logger.info(f"ReflectionMemoryAgent initialized with:\n {loggable_params}")

    def reset(self):
        """Reset the RMA state and rebuild both LLM agents."""
        logger.debug("Resetting RMA state")
        self.instruction = None
        self.trajectory: List[StepBehavior] = []
        self.knowledge_base: List[str] = []
        self.last_code_step_idx = -1

        # Control the count of images: at most `max_images` screenshots are used.
        # Update logic: the 0-th screenshot is always retained. If the total number
        # of screenshots is less than `max_images`, all are kept; otherwise, starting
        # from index 1, milestone screenshots are managed via FIFO.
        self.active_img_idx = []

        self.reflection_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT,
        )
        self.behavior_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT
        )

    def add_instruction(self, instruction):
        """
        [Interface] Main -> RMA
        Main agent sets the instruction for the RMA.
        """
        self.instruction = instruction

    def _update_trajectory(self, step_behavior):
        """
        Append ``step_behavior`` to the trajectory and update the window of
        step indices whose screenshots are attached to the reflection prompt.

        Below the cap every step's screenshot is kept. At the cap only
        milestone steps are admitted, evicting the oldest entry after index 0.
        """
        self.trajectory.append(step_behavior)
        if self.max_images <= 0:
            # No screenshots are to be kept at all; avoids an IndexError on the eviction below.
            return

        new_idx = len(self.trajectory) - 1
        if len(self.active_img_idx) >= self.max_images:
            if step_behavior.is_milestone:
                self.active_img_idx.append(new_idx)  # over max_images, only milestone images
                del self.active_img_idx[1]  # FIFO starts from index 1
        else:
            self.active_img_idx.append(new_idx)  # less than max_images, feed all images

        assert len(self.active_img_idx) <= self.max_images, "[RMA] Errors in updating StepBehavior!!"

    def _summary_temperature(self):
        """Return the temperature for step summarization (configured value, else 0.1)."""
        # "temperture" is the historical misspelling that used to be looked up here;
        # it is still honored so existing configs keep working.
        for key in ("temperature", "temperture"):
            value = self.engine_params.get(key)
            if value is not None:
                return value
        return 0.1

    @staticmethod
    def _default_hint() -> Dict:
        """Return the hint dict used when no hint was computed for the step."""
        return {
            "gui_operation_error": False,
            "lack_of_tutorial": False,
            "code_error": False,
            "loop_detection": None,
        }

    @staticmethod
    def _normalize_knowledge(knowledge: Any) -> str:
        """Coerce the LLM's ``knowledge`` field to a string so it can be joined safely."""
        if knowledge is None:
            return ""
        if isinstance(knowledge, str):
            return knowledge
        if isinstance(knowledge, (list, tuple)):
            return "\n".join(str(item) for item in knowledge)
        return str(knowledge)

    def _summarize_step_behavior(
        self,
        generator_output: str,
        cur_obs: Dict,
        enhanced_obs: bytes | None,
        is_milestone: bool,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Tuple[StepBehavior, bool]:
        """
        [Interface] Main -> RMA
        Build the StepBehavior for the step the Main Agent (MA) just completed.

        Args:
            - generator_output: thoughts, screen analysis and action of the Main Agent.
            - cur_obs: observation after the action; ``cur_obs['screenshot']`` is used.
            - enhanced_obs: optional zoom-in image of the operated region.
            - is_milestone: milestone flag stored on the StepBehavior in "gui" mode
              ("search" and "code" steps are always created as non-milestones).
            - mode: one of "gui", "code", "search"; any other value is treated as "gui".
            - code_exec_summary: summary returned by the Code Agent ("code" mode only).
            - action_dict: action extracted from the generator output.

        Returns:
            (step_behavior, succeeded). ``succeeded`` is always True for "search" and
            "code"; for "gui" it reflects the LLM's "evaluation" field (True when the
            response cannot be parsed).

        Note:
            In "gui" mode the previous observation is read from ``self.trajectory[-1]``,
            so the trajectory must not be empty.
        """
        if action_dict is None:
            # A fresh dict per call: the dict is stored on the StepBehavior, so a shared
            # default would leak between steps.
            action_dict = {}

        if mode == "search":
            # summary is fixed
            step_behavior = StepBehavior(
                False,
                generator_output,
                "Search Agent was called last step, and a tutorial has been generated.",
                cur_obs,
                action_dict
            )
            return step_behavior, True

        if mode == "code":
            self.last_code_step_idx = len(self.trajectory)
            # the summary returned by the code agent
            step_behavior = StepBehavior(
                False,
                generator_output,
                f"Code Agent was called last step, and the summary of its trajectory is: \n---\n{code_exec_summary}\n---",
                cur_obs,
                action_dict
            )
            return step_behavior, True

        # common gui operation, use LLM to summarize
        prev_obs = self.trajectory[-1].obs
        text_content = f"""Computer Use Agent's Output: \n{generator_output}"""

        self.behavior_agent.reset()  # don't need history messages
        # Start from the pristine prompt constant rather than the agent's current
        # system_prompt attribute, mirroring how the reflection prompt is built; if
        # add_system_prompt stores the prompt on the agent, building on the attribute
        # would accumulate every previous step's output.
        updated_sys_prompt = (
            PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT + "\n" + text_content
        )
        self.behavior_agent.add_system_prompt(updated_sys_prompt)

        self.behavior_agent.add_message(
            text_content="This is the observation before executing action (attached below).",
            image_content=prev_obs['screenshot'],
            role="user",
            put_text_last=False
        )
        if enhanced_obs is not None:
            # Only announce a zoom-in view when one is actually attached.
            self.behavior_agent.add_message(
                text_content="This is the zoom-in view, which may help you to identify the operational region (attached below).",
                image_content=enhanced_obs,
                role="user",
                put_text_last=False
            )
        self.behavior_agent.add_message(
            text_content="This is the observation after executing action (attached below).",
            image_content=cur_obs['screenshot'],
            role="user",
            put_text_last=False
        )

        required_fields = ["summary", "evaluation"]
        format_checkers = [
            partial(JSON_ANSWER_FORMATTER, required_fields)
        ]

        full_response = call_llm_formatted(
            self.behavior_agent,
            format_checkers,
            temperature=self._summary_temperature(),
        )

        response = parse_code_from_string(full_response)

        try:
            data = json.loads(response)
            behavior_summary = data['summary']
            evaluation = data["evaluation"]
        except (ValueError, KeyError, TypeError) as e:
            # Best effort: fall back to the raw response and assume success.
            logger.warning("[RMA] Errors in generating step summary: %s", e)
            logger.info("Response is not a JSON object or miss required keys!")
            behavior_summary = response
            evaluation = "successful"

        succeeded = str(evaluation).strip().lower() == "successful"
        step_behavior = StepBehavior(is_milestone, generator_output, behavior_summary, cur_obs, action_dict)
        return step_behavior, succeeded

    def get_reflection(
        self,
        cur_obs: Dict,
        generator_output: str,
        coordinates: List,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Dict:
        """
        [Interface] RMA -> Main
        The Main Agent (MA) calls this method to get RMA's reflection before deciding the next action.

        Args:
            - cur_obs (Dict): The Main Agent's current observation (o_k).
            - generator_output (str): The thoughts, screen analysis and action of Main Agent.
            - coordinates (List): coordinates in the last operation step of Main Agent.
            - mode(str): [gui, code, search]. Indicate which agent that main agent called last step.
            - code_exec_summary: execution summary for code agent.
            - action_dict: extracted action from generator output.

        Returns:
            - reflection_info(Dict): all the info related to reflection, with keys
              "reflection", "reflection_thoughts", "existing_knowledge", "is_milestone",
              "new_knowledge", "step_summary" and "hint". "hint" holds
              "gui_operation_error", "lack_of_tutorial" (mirrors the loop flag),
              "code_error" and "loop_detection". The very first call additionally carries
              a top-level "loop_detection" key, kept for backward compatibility.
        """
        if action_dict is None:
            action_dict = {}

        if self.memoryer_level == 0:
            # Reflection disabled: nothing is recorded and no LLM is called.
            return {
                "reflection": None,
                "reflection_thoughts": None,
                "existing_knowledge": None,
                "is_milestone": False,
                "new_knowledge": None,
                "step_summary": None,
                "hint": self._default_hint(),
            }

        if not self.trajectory:
            # First call: only record the initial screen, no LLM call.
            step_behavior = StepBehavior(
                True,
                "The initial screen is provided. No action has been taken yet.",
                "The initial screen is provided. No action has been taken yet.",
                cur_obs,
                action_dict
            )
            step_behavior._update_phash_ssim(self.trajectory)
            self._update_trajectory(step_behavior)
            return {
                "reflection": None,
                "reflection_thoughts": None,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": True,
                "new_knowledge": "",
                "step_summary": "",
                # Same shape as every other return path, so callers can always read "hint".
                "hint": self._default_hint(),
                # Legacy top-level key, kept for existing callers.
                "loop_detection": None,
            }

        ### Step Summary
        prev_obs = self.trajectory[-1].obs
        enhanced_obs = None
        if coordinates:
            enhanced_obs, _, _, _, _ = enhance_observation(
                prev_obs["screenshot"],
                coordinates,
                draw=True
            )

        # generate step behavior
        step_behavior, last_gui_check = self._summarize_step_behavior(
            generator_output,
            cur_obs,
            enhanced_obs,
            False,
            mode,
            code_exec_summary,
            action_dict
        )
        step_behavior._update_phash_ssim(self.trajectory)

        ### make additional hints
        additional_hints = []
        if not last_gui_check:
            additional_hints.append("\t- Warning: The last GUI operation is unsuccessful. Careful review is required to avoid GUI Operation Error.")

        steps_since_code = len(self.trajectory) - self.last_code_step_idx
        code_error_hint = (
            self.last_code_step_idx != -1
            and steps_since_code < self.code_verification_window
        )
        if code_error_hint:
            additional_hints.append("\t- Warning: The Computer Use Agent might in the verification stage of Code Agent. Careful review is required to avoid Code Error.")

        # loop detection (kept as a function-level import, as in the original)
        from mm_agents.os_symphony.utils.loop_detection import detect_loop
        # NOTE(review): the current step is not yet part of self.trajectory here.
        is_loop, loop_details = detect_loop(full_trajectory=self.trajectory, N=3)
        if is_loop and loop_details:
            match_sequence_indices = loop_details["match_sequence_indices"]
            loop_hint_message = f"\t- Warning: A potential LOOP has been detected between Step {match_sequence_indices[0]} and Step {match_sequence_indices[-1]}. Careful review is required to avoid Repetitive Behavior Error."
            additional_hints.append(loop_hint_message)

        self.reflection_agent.reset()

        updated_sys_prompt = (
            PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT + "\n\n" +
            f"---\n- **user instruction**: {self.instruction}\n" +
            "- **existing knowledge**: \n" + "\n".join(self.knowledge_base) +
            "\n- **additional_hints**: " + "\n".join(additional_hints) + "\n---"
        )

        # update system prompt
        self.reflection_agent.add_system_prompt(updated_sys_prompt)

        # Replay the history; screenshots are attached only for steps in the active window.
        for i, step in enumerate(self.trajectory):
            has_image = i in self.active_img_idx
            text_content = f"""### (Step {i}) history:\nsummary: '''\n{step.summary}\n'''"""
            if has_image:
                if i == 0:
                    text_content += "\ninitial screenshot:"
                else:
                    text_content += "\nscreenshot (after executing action): (attached below)"

            self.reflection_agent.add_message(
                text_content=text_content,
                image_content=step.obs['screenshot'] if has_image else None,
                role="user",
                put_text_last=False
            )

        text_content = f"""### (Last Step) CUA's output (has been finished):\n---\n{generator_output}\n---\nStep Summary:\n---\n{step_behavior.summary}\n---\nlatest_screenshot: (attached below)"""
        self.reflection_agent.add_message(
            text_content=text_content,
            image_content=cur_obs['screenshot'],
            role="user",
            put_text_last=False
        )

        required_fields = ["is_milestone", "reflection", "knowledge"]
        format_checkers = [
            partial(JSON_ANSWER_FORMATTER, required_fields)
        ]

        full_response = call_llm_formatted(
            self.reflection_agent,
            format_checkers
        )

        reflection_thought = full_response
        response = parse_code_from_string(full_response)

        try:
            data = json.loads(response)
            reflection = data['reflection']
            is_milestone = data["is_milestone"]
            knowledge = data['knowledge']
        except (ValueError, KeyError, TypeError) as e:
            # Best effort: surface the raw response as the reflection.
            logger.warning("[RMA] Errors in dealing with reflection: %s", e)
            logger.info("Response is not a JSON object or miss required keys!")
            reflection = response
            is_milestone = False
            knowledge = ""

        # The model may return null / a list for "knowledge"; keep the knowledge base
        # a list of strings so "\n".join(...) cannot fail later.
        knowledge = self._normalize_knowledge(knowledge)
        if knowledge:
            self.knowledge_base.append(knowledge)

        if isinstance(is_milestone, str):
            is_milestone = "true" in is_milestone.lower()
        else:
            is_milestone = bool(is_milestone)

        # The milestone flag must be set BEFORE the trajectory update: the image-window
        # FIFO in _update_trajectory reads it to decide whether to admit the screenshot.
        if mode == "gui":  # only gui operation can be considered as milestone
            step_behavior.is_milestone = is_milestone
        self._update_trajectory(step_behavior)

        return {
            "reflection": reflection,
            "reflection_thoughts": reflection_thought,
            "existing_knowledge": "\n".join(self.knowledge_base),
            "is_milestone": is_milestone,
            "new_knowledge": knowledge,
            "step_summary": step_behavior.summary,
            "hint": {
                "gui_operation_error": not last_gui_check,
                "lack_of_tutorial": is_loop,
                "code_error": code_error_hint,
                "loop_detection": loop_details,
            }
        }