add_os_symphony (#399)

This commit is contained in:
Bowen Yang
2025-12-23 14:30:44 +08:00
committed by GitHub
parent ac31778ee3
commit f593f35b1c
26 changed files with 6674 additions and 0 deletions

View File

View File

@@ -0,0 +1,350 @@
import logging
from typing import Dict, List, Tuple, Optional
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, parse_code_from_string
from mm_agents.os_symphony.core.mllm import LMMAgent
logger = logging.getLogger("desktopenv.coder_agent")
def extract_code_block(action: str) -> Tuple[Optional[str], Optional[str]]:
    """Locate the first fenced code block inside *action*.

    Returns:
        A ``(code_type, code)`` pair. ``code_type`` is ``"python"``,
        ``"bash"``, or ``None`` for an unlabelled fence; ``code`` is the
        stripped fence contents, or ``None`` when no fence exists at all.
    """
    code_type: Optional[str] = None
    code: Optional[str] = None
    # Labelled fences take priority over a bare ``` fence.
    for fence, kind in (("```python", "python"), ("```bash", "bash"), ("```", None)):
        if fence in action:
            code_type = kind
            code = action.split(fence)[1].split("```")[0].strip()
            break
    logger.debug(
        f"Extracted code block: type={code_type}, length={len(code) if code else 0}"
    )
    return code_type, code
def execute_code(code_type: str, code: str, env_controller) -> Dict:
    """Run *code* through the environment controller and return its result dict.

    ``code_type`` selects the runner (``"bash"`` with a 30s timeout, or
    ``"python"``); any other value yields an error dict. Exceptions raised by
    the controller are caught and converted into an error dict as well.
    """
    # Log the full code being executed (untruncated)
    logger.info(f"CODING_AGENT_CODE_EXECUTION - Type: {code_type}\nCode:\n{code}")
    try:
        if code_type == "bash":
            return env_controller.run_bash_script(code, timeout=30)
        if code_type == "python":
            return env_controller.run_python_script(code)
        return {"status": "error", "error": f"Unknown code type: {code_type}"}
    except Exception as e:
        logger.error(f"Error executing {code_type} code: {e}")
        return {"status": "error", "error": str(e)}
def format_result(result: Dict, step_count: int) -> str:
    """Render an execution-result dict as the context string fed back to the LLM.

    Handles both bash responses (``"returncode"`` key, stdout/stderr merged
    into ``"output"``) and python responses (``"return_code"``, stdout in
    ``"output"``, stderr in ``"error"``); both expose the same fields, so a
    single rendering path suffices.
    """
    step_no = step_count + 1
    if not result:
        logger.warning(f"Step {step_no}: No result returned from execution")
        return f"""
Step {step_no} Error:
Error: No result returned from execution
"""
    status = result.get("status", "unknown")
    return_code = result.get("returncode", result.get("return_code", -1))
    output = result.get("output", "")
    error = result.get("error", "")
    logger.debug(f"Step {step_no}: Status={status}, Return Code={return_code}")
    # Assemble the sections, then join — keeps multi-line outputs readable.
    sections = [
        f"Step {step_no} Result:",
        f"Status: {status}",
        f"Return Code: {return_code}",
    ]
    if output:
        sections.append(f"Output:\n{output}")
    if error:
        sections.append(f"Error:\n{error}")
    return "\n".join(sections) + "\n"
class CoderAgent:
    """A dedicated agent for executing code with a budget of steps.

    Wraps an LMMAgent that alternates between proposing code (python/bash
    fenced blocks) and observing execution results from the environment
    controller, until it signals DONE/FAIL/INFEASIBLE or the step budget is
    exhausted. A final natural-language summary of the session is produced
    by a separate summary LLM.
    """
    def __init__(self, engine_params: Dict, client_password: str, platform: str = "linux"):
        """Initialize the CodeAgent.

        Args:
            engine_params: LLM engine configuration; also consulted for
                "budget" (max loop steps, default 20) and "temperature"
                (default 0.1).
            client_password: forwarded into the coder system prompt
                (presumably for scripts that need sudo — TODO confirm).
            platform: target OS flavor baked into the system prompt.

        Raises:
            ValueError: if engine_params is None or empty.
        """
        if not engine_params:
            raise ValueError("engine_params cannot be None or empty")
        self.engine_params = engine_params
        self.budget = engine_params.get("budget", 20)
        self.temperature = engine_params.get("temperature", 0.1)
        self.agent = None
        self.platform = platform
        self.client_password = client_password
        logger.info(f"CodeAgent initialized with budget={self.budget} and platform={self.platform}")
        self.reset()
    def reset(self):
        """Reset the code agent state: fresh LMMAgent with the coder system prompt."""
        logger.debug("Resetting CodeAgent state")
        self.agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.construct_coder_procedural_memory(platform=self.platform, client_password=self.client_password)
        )
    def execute(self, task_instruction: str, screenshot: str, env_controller) -> Dict:
        """Execute code for the given task with a budget of steps.

        Runs a propose/execute/observe loop: the LLM emits thoughts plus a
        fenced code block, the block is executed via *env_controller*, and the
        formatted result is fed back as the next user message.

        Args:
            task_instruction: natural-language task for the coder LLM.
            screenshot: current screen image attached as initial context.
            env_controller: object exposing run_bash_script / run_python_script.

        Returns:
            Dict with task_instruction, completion_reason, summary,
            execution_history, execution_result_history, steps_executed, budget.

        Raises:
            ValueError: if env_controller is None.
            RuntimeError: if the LLM returns an empty response mid-loop.
        """
        if env_controller is None:
            raise ValueError("env_controller is required for code execution")
        print(f"\n🚀 STARTING CODE EXECUTION")
        print("=" * 60)
        print(f"Task: {task_instruction}")
        print(f"Budget: {self.budget} steps")
        print("=" * 60)
        logger.info(f"Starting code execution for task: {task_instruction}")
        logger.info(f"Budget: {self.budget} steps")
        self.reset()
        # Add initial task instruction and screenshot context as user message
        context = (
            f"Task: {task_instruction}\n\nCurrent screenshot is provided for context."
        )
        self.agent.add_message(context, image_content=screenshot, role="user")
        step_count = 0
        execution_history = []
        execution_result_history = []
        while step_count < self.budget:
            logger.info(f"Step {step_count + 1}/{self.budget}")
            # Get assistant response (thoughts and code)
            response = call_llm_safe(self.agent, temperature=self.temperature)
            # Log the latest message from the coding agent (untruncated)
            logger.info(
                f"CODING_AGENT_LATEST_MESSAGE - Step {step_count + 1}:\n{response}"
            )
            # Check if response is None or empty
            if not response or response.strip() == "":
                error_msg = f"Step {step_count + 1}: LLM returned empty response"
                logger.error(error_msg)
                raise RuntimeError(error_msg)
            # Parse the response to extract action
            action = parse_code_from_string(response)
            thoughts = response
            execution_history.append(
                {"step": step_count + 1, "action": action, "thoughts": thoughts}
            )
            # Check for completion signals emitted instead of code
            action_upper = action.upper().strip()
            if action_upper == "DONE":
                print(f"\n✅ TASK COMPLETED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task completion")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task completed successfully")
                completion_reason = "DONE"
                break
            elif action_upper == "FAIL":
                print(f"\n❌ TASK FAILED - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task failure")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task failed by agent request")
                completion_reason = "FAIL"
                break
            elif action_upper == 'INFEASIBLE':
                print(f"\n❌ TASK INFEASIBLE - Step {step_count + 1}")
                print("=" * 60)
                print("Agent signaled task infeasible")
                print("=" * 60)
                logger.info(f"Step {step_count + 1}: Task infeasible by agent request")
                completion_reason = "INFEASIBLE"
                break
            # Extract and execute code; only the text after the last
            # "(Answer)" marker is scanned for a fenced block.
            code_type, code = extract_code_block(response.split("(Answer)")[-1])
            if code:
                result = execute_code(code_type, code, env_controller)
                execution_result_history.append(
                    {"step": step_count + 1, "result": result}
                )
                # Prepare formatted output and error for logging
                output = result.get("output", "")
                error = result.get("error", "")
                message = result.get("message", "")
                status = result.get("status", "")
                # Print execution result to terminal for immediate visibility
                print(f"\n⚡ CODE EXECUTION RESULT - Step {step_count + 1}")
                print("-" * 50)
                print(f"Status: {status}")
                if output:
                    print(f"Output:\n{output}")
                if error:
                    print(f"Error:\n{error}")
                if message and not output and not error:
                    print(f"Message:\n{message}")
                print("-" * 50)
                log_lines = [
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:",
                    f"Status: {status}" if status else None,
                ]
                if output:
                    log_lines.append(
                        "Output:\n" + ("-" * 40) + f"\n{output}\n" + ("-" * 40)
                    )
                if error:
                    log_lines.append(
                        "Error:\n" + ("!" * 40) + f"\n{error}\n" + ("!" * 40)
                    )
                if message and not output and not error:
                    log_lines.append(
                        "Message:\n" + ("-" * 40) + f"\n{message}\n" + ("-" * 40)
                    )
                # Remove None entries and join
                formatted_log = "\n".join([line for line in log_lines if line])
                logger.info(formatted_log)
            else:
                print(f"\n⚠️ NO CODE BLOCK FOUND - Step {step_count + 1}")
                print("-" * 50)
                print("Action did not contain executable code")
                print("-" * 50)
                logger.warning(f"Step {step_count + 1}: No code block found in action")
                result = {"status": "skipped", "message": "No code block found"}
                logger.info(
                    f"CODING_AGENT_EXECUTION_RESULT - Step {step_count + 1}:\n"
                    f"Status: skipped\n"
                    f"Message:\n{'-' * 40}\n{result['message']}\n{'-' * 40}"
                )
            # Add assistant's thoughts and code to message history
            self.agent.add_message(response, role="assistant")
            # Process result and add formatted environment results as user message
            result_context = format_result(result, step_count)
            self.agent.add_message(result_context, role="user")
            step_count += 1
        # Handle budget exhaustion: completion_reason is only bound when the
        # loop broke on DONE/FAIL/INFEASIBLE, hence the locals() probe.
        if "completion_reason" not in locals():
            print(f"\n⏰ BUDGET EXHAUSTED - {step_count} steps completed")
            print("=" * 60)
            print(f"Maximum budget of {self.budget} steps reached")
            print("=" * 60)
            logger.info(f"Budget exhausted after {step_count} steps")
            completion_reason = f"BUDGET_EXHAUSTED_AFTER_{step_count}_STEPS"
        # Generate final summary
        logger.info("Generating execution summary")
        summary = self._generate_summary(execution_history, task_instruction)
        result = {
            "task_instruction": task_instruction,
            "completion_reason": completion_reason,
            "summary": summary,
            "execution_history": execution_history,
            "execution_result_history": execution_result_history,
            "steps_executed": step_count,
            "budget": self.budget
        }
        logger.info(f"Code execution completed: steps={step_count}")
        return result
    def _generate_summary(
        self, execution_history: List[Dict], task_instruction: str
    ) -> str:
        """Generate a <150-word factual summary of the session via a fresh LLM.

        Returns a failure message string (never raises) when the summary LLM
        errors out or returns nothing.
        """
        if not execution_history:
            logger.info("No execution history to summarize")
            return "No actions were executed."
        logger.info(f"Generated summary for {len(execution_history)} steps")
        # Build detailed execution context for summary agent
        execution_context = f"Task: {task_instruction}\n\nExecution Steps:\n"
        for step in execution_history:
            step_num = step["step"]
            thoughts = step.get("thoughts", "")
            action = step.get("action", "")
            execution_context += f"\nStep {step_num}:\n"
            if thoughts:
                execution_context += f"Thoughts: {thoughts}\n"
            execution_context += f"Code: {action}\n"
        # Create summary prompt with same context as coding agent
        summary_prompt = f"""
{execution_context}
Please provide a concise summary of the code execution session. Focus on:
1. The code logic implemented at each step
2. The outputs and results produced by each code execution
3. The progression of the solution approach
Do not make judgments about success or failure. Simply describe what was attempted and what resulted.
Keep the summary under 150 words and use clear, factual language.
"""
        # Generate summary using LLM with dedicated summary system prompt
        try:
            summary_agent = LMMAgent(
                engine_params=self.engine_params,
                system_prompt=PROCEDURAL_MEMORY.CODE_SUMMARY_AGENT_PROMPT,
            )
            summary_agent.add_message(summary_prompt, role="user")
            summary = call_llm_safe(summary_agent, temperature=self.temperature)
            if not summary or summary.strip() == "":
                summary = "Summary generation failed - no response from LLM"
                logger.warning("Summary generation failed - empty response from LLM")
        except Exception as e:
            summary = f"Summary generation failed: {str(e)}"
            logger.error(f"Error generating summary: {e}")
        return summary

View File

@@ -0,0 +1,109 @@
import re
from typing import Any, Dict, List
import pytesseract
from PIL import Image
import io
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.utils.common_utils import call_llm_safe, smart_resize
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import logging
logger = logging.getLogger("desktopenv.agent")
class GrounderAgent:
    """
    Class designed for interacting with the GUI, serving the Grounding Agent
    and VLMSearcher.

    Given a referring expression and a screenshot, queries a grounding model
    for pixel coordinates and rescales them into actual screen coordinates.
    """
    def __init__(self, engine_params: Dict, screen_width: int, screen_height: int):
        # engine_params must carry "model", "grounding_width",
        # "grounding_height" and "grounding_smart_resize".
        self.engine_params_for_grounder = engine_params # grounder_params
        system_prompt, self.user_message = PROCEDURAL_MEMORY.construct_grounder_procedural_memory(model_name=engine_params["model"])
        self.grounding_model = LMMAgent(engine_params, system_prompt=system_prompt)
        # Width and height of the grounding model's coordinate canvas.
        self.width = engine_params['grounding_width']
        self.height = engine_params['grounding_height']
        print(f"[Grounder]: initialized width is {self.width}, height is {self.height}")
        # Width and height of the actual screen.
        self.screen_width = screen_width
        self.screen_height = screen_height
    # Given the state and worker's referring expression, use the grounding model to generate (x,y)
    def generate_coords(self, ref_expr: str, obs: Dict, detail=False, expansion_pixels=400, **kwargs) -> List:
        """Ground *ref_expr* on obs["screenshot"] and return [x, y] in screen space.

        When detail=True, returns [screenshot, offset_x, offset_y] instead of
        the coordinates. NOTE(review): expansion_pixels is unused and the
        global offsets are always 0 in this version — presumably hooks for a
        cropped/iterative grounding flow; confirm before relying on them.

        Raises:
            AssertionError: if fewer than two numbers can be parsed from the
                grounding model's response.
        """
        cur_screenshot = obs["screenshot"]
        # store global offset (always 0 here; see NOTE above)
        global_offset_x = 0
        global_offset_y = 0
        # final coordinates for output
        final_global_x = 0
        final_global_y = 0
        cur_width, cur_height = self.screen_width, self.screen_height
        print(f"[Grounder] start to ground!")
        self.grounding_model.reset()
        # Configure the context
        prompt = self.user_message.replace("REF_EXPR", ref_expr)
        # consistent with the system prompt presented in the paper of GTA-1
        if 'gta' in self.engine_params_for_grounder['model']:
            self.grounding_model.add_system_prompt("You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.")
        self.grounding_model.add_message(
            text_content=prompt, image_content=cur_screenshot, put_text_last=True, role="user"
        )
        # Generate and parse coordinates
        response = call_llm_safe(self.grounding_model, temperature=0.05, **kwargs)
        print(f"[Grounder] prompt: {prompt}\nmodel: {self.engine_params_for_grounder['model']}, \nresponse: {response}")
        # 1. highest priority: (x1="...", y1="...", x="...", y="...")
        numericals = re.findall(r'(?:x1|y1|x|y)=["\']?(\d+)["\']?', response)
        # 2. second highest priority: bare numbers, e.g. <points>653 42</points> or [653, 42]
        if len(numericals) < 2:
            clean_response = re.sub(r'[xXyY]\d', '', response)
            numericals = re.findall(r'\d+', clean_response)
        assert len(numericals) >= 2
        print(f"[Grounder] the parsed coordinates: {numericals}")
        local_x, local_y = self._resize_coordinates([int(numericals[0]), int(numericals[1])], width=cur_width, height=cur_height)
        # current global coordinates = local coordinates + global offset
        final_global_x = local_x + global_offset_x
        final_global_y = local_y + global_offset_y
        if detail:
            return [cur_screenshot, global_offset_x, global_offset_y]
        else:
            return [final_global_x, final_global_y]
    def dynamic_set_width_height(self, width: int, height: int):
        # Override the grounding-model canvas size at runtime.
        self.width = width
        self.height = height
    # Resize from grounding model dim into OSWorld dim (1920 * 1080)
    def _resize_coordinates(self, coordinates: List[int], width:int, height:int) -> List[int]:
        """
        Rescale model-space coordinates into observation space.

        width, height: for current observation
        grounding_width, grounding_height: width and height of the grounding
        model canvas (e.g. 1000x1000 or 1280x800)
        """
        grounding_width = self.engine_params_for_grounder["grounding_width"]
        grounding_height = self.engine_params_for_grounder["grounding_height"]
        grounding_smart_resize = self.engine_params_for_grounder["grounding_smart_resize"]
        if not grounding_smart_resize:
            return [
                round(coordinates[0] * width / grounding_width),
                round(coordinates[1] * height / grounding_height),
            ]
        else:
            # smart_resize returns (height, width) — note the order.
            smart_height, smart_width = smart_resize(height, width)
            return [
                round(coordinates[0] * width / smart_width),
                round(coordinates[1] * height / smart_height)
            ]

View File

@@ -0,0 +1,428 @@
from ast import parse
import logging
import json
from typing import List, Dict, Any, Optional, Tuple
from mm_agents.os_symphony.utils.common_utils import (
call_llm_formatted,
enhance_observation,
parse_code_from_string
)
from functools import partial
from mm_agents.os_symphony.utils.formatters import JSON_ANSWER_FORMATTER
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
import imagehash
import io
import os
from PIL import Image
import numpy as np
from skimage.metrics import structural_similarity as ssim
logger = logging.getLogger("desktopenv.agent")
class StepBehavior:
    """
    Narrative step behavior.

    Describes one step of the trajectory: the generative (main) agent's raw
    output, the observation (screenshot) after the step, a textual summary of
    what the agent thought/did and how the state changed, and the parsed
    action dict. ``is_milestone`` marks steps whose screenshots are retained
    for long-term reflection.
    """
    def __init__(self, is_milestone: bool, gen_output: str, summary: str, obs: Dict, action_dict: Dict):
        self.is_milestone = is_milestone
        self.gen_output = gen_output
        self.obs = obs
        self.summary = summary
        self.action_dict = action_dict
        # Cached perceptual features for optimizing loop-detection time:
        # --- 1. pHash of this step's screenshot ---
        self.phash = None
        # --- 2. SSIM scores of this screenshot vs. each earlier step ---
        self.ssim_list = []
    def _update_phash_ssim(self, history: List):
        """Compute this step's pHash and its SSIM against every screenshot in
        *history* (list of earlier StepBehavior objects).
        """
        cur_img = Image.open(io.BytesIO(self.obs["screenshot"]))
        cur_img_gray = cur_img.convert('L')
        cur_img_np = np.array(cur_img_gray)
        self.phash = imagehash.phash(cur_img)
        for hs in history:
            compare_img = Image.open(io.BytesIO(hs.obs["screenshot"]))
            compare_img_gray = compare_img.convert('L')
            compare_img_np = np.array(compare_img_gray)
            # BUGFIX: data_range previously mixed statistics of the two
            # different images (cur.max() - compare.min()), which skews the
            # SSIM score. Both images are 8-bit grayscale ("L" mode), so the
            # true data range is 255.
            self.ssim_list.append(ssim(cur_img_np, compare_img_np, data_range=255))
class ReflectionMemoryAgent:
    """
    Reflection Memory Agent (RMA).

    Responsible for maintaining long-term memory, extracting narratives from
    trajectories, providing reflections to the Main Agent, and validating
    task completion status.
    """
    def __init__(self, engine_params: Dict):
        """
        Initialize the RMA.

        Args:
            engine_params: LLM engine configuration, e.g.
                {
                    "engine_type": args.provider,
                    "model": args.model,
                    "base_url": args.model_url,
                    "api_key": args.model_api_key,
                    "temperature": getattr(args, "model_temperature", None),
                }
                Optionally also "max_images" (max screenshots used during
                reflection, default 8) and "memoryer_level" (0 disables
                reflection entirely, default 3).
        """
        self.engine_params = engine_params
        self.max_images = engine_params.get('max_images', 8)
        self.memoryer_level = engine_params.get('memoryer_level', 3)
        self.reset()
        logger.info(f"ReflectionMemoryAgent initialized with:\n {self.engine_params}")
    def _llm_temperature(self) -> float:
        # BUGFIX: the key was previously misspelled "temperture", so the
        # configured temperature was silently ignored and 0.1 always used.
        # Also guard against an explicit None value in engine_params.
        temperature = self.engine_params.get("temperature")
        return 0.1 if temperature is None else temperature
    def reset(self):
        """Reset all per-task state (trajectory, knowledge base, sub-agents)."""
        logger.debug("Resetting RMA state")
        self.instruction = None
        self.trajectory: List[StepBehavior] = []
        self.knowledge_base: List[str] = []
        # Index of the last step handled by the Code Agent (-1 = none yet).
        self.last_code_step_idx = -1
        '''
        Control the number of active screenshots: at most max_images are used.
        Update logic: the 0-th screenshot is always retained. If the total
        number of screenshots is less than max_images, all are kept;
        otherwise, starting from index 1, milestone screenshots are managed
        via FIFO.
        '''
        self.active_img_idx = []
        self.reflection_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT,
        )
        self.behavior_agent = LMMAgent(
            engine_params=self.engine_params,
            system_prompt=PROCEDURAL_MEMORY.SUMMARIZE_STEP_SYSTEM_PROMPT
        )
    def add_instruction(self, instruction):
        """
        [Interface] Main -> RMA
        Main agent sets the task instruction on the RMA.
        """
        self.instruction = instruction
    def _update_trajectory(self, step_behavior):
        """Append a step and maintain the bounded active screenshot window."""
        self.trajectory.append(step_behavior)
        if len(self.active_img_idx) >= self.max_images:
            if step_behavior.is_milestone:
                self.active_img_idx.append(len(self.trajectory) - 1)  # over max_images: only milestone images
                del self.active_img_idx[1]  # FIFO starts from index 1 (index 0 is pinned)
        else:
            self.active_img_idx.append(len(self.trajectory) - 1)  # fewer than max_images: keep all images
        assert len(self.active_img_idx) <= self.max_images, "[RMA] Errors in updating StepBehavior!!"
    def _summarize_step_behavior(
        self,
        generator_output: str,
        cur_obs: Dict,
        enhanced_obs: bytes | None,
        is_milestone: bool,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Tuple[StepBehavior, bool]:
        """
        [Interface] Main -> RMA
        The Main Agent (MA) calls this method to "feed" the information of the
        just-completed step to the RMA, which processes and stores it.

        Returns:
            (step_behavior, ok) where ok is False only when the summarizer
            judged the last GUI operation unsuccessful.
        """
        # BUGFIX: avoid a shared mutable default argument.
        if action_dict is None:
            action_dict = {}
        if mode == "search":
            is_success = "successful"
            # summary is fixed
            step_behavior = StepBehavior(
                False,
                generator_output,
                "Search Agent was called last step, and a tutorial has been generated.",
                cur_obs,
                action_dict
            )
        elif mode == "code":
            self.last_code_step_idx = len(self.trajectory)
            is_success = "successful"
            # the summary returned by the code agent
            step_behavior = StepBehavior(
                False,
                generator_output,
                f"Code Agent was called last step, and the summary of its trajectory is: \n---\n{code_exec_summary}\n---",
                cur_obs,
                action_dict
            )
        else:  # common gui operation, use LLM to summarize
            prev_obs = self.trajectory[-1].obs
            text_content = f"""Computer Use Agent's Output: \n{generator_output}"""
            self.behavior_agent.reset()  # don't need history messages
            updated_sys_prompt = (
                self.behavior_agent.system_prompt + "\n" + text_content
            )
            self.behavior_agent.add_system_prompt(updated_sys_prompt)
            self.behavior_agent.add_message(
                text_content="This is the observation before executing action (attached below).",
                image_content=prev_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            self.behavior_agent.add_message(
                text_content="This is the zoom-in view, which may help you to identify the operational region (attached below).",
                image_content=enhanced_obs,
                role="user",
                put_text_last=False
            )
            self.behavior_agent.add_message(
                text_content="This is the observation after executing action (attached below).",
                image_content=cur_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            required_fields = ["summary", "evaluation"]
            format_checkers = [
                partial(JSON_ANSWER_FORMATTER, required_fields)
            ]
            full_response = call_llm_formatted(
                self.behavior_agent,
                format_checkers,
                temperature=self._llm_temperature(),
            )
            response = parse_code_from_string(full_response)
            try:
                data = json.loads(response)
                behavior_summary = data['summary']
                is_success = data["evaluation"]
            except Exception as e:
                # Fall back to using the raw response as the summary.
                print("[RMA] Errors in generating step summary: ", e)
                logger.info("Response is not a JSON object or miss required keys!")
                behavior_summary = response
                is_success = "successful"
            step_behavior = StepBehavior(is_milestone, generator_output, behavior_summary, cur_obs, action_dict)
        return step_behavior, is_success == "successful"
    def get_reflection(
        self,
        cur_obs: Dict,
        generator_output: str,
        coordinates: List,
        mode: str = "gui",
        code_exec_summary: str = "",
        action_dict: Optional[Dict] = None
    ) -> Dict:
        """
        [Interface] RMA -> Main
        The Main Agent (MA) calls this method to get RMA's reflection before
        deciding the next action.

        Args:
            cur_obs (Dict): The Main Agent's current observation (o_k).
            generator_output (str): The thoughts, screen analysis and action of Main Agent.
            coordinates (List): coordinates in the last operation step of Main Agent.
            mode (str): one of [gui, code, search]; which agent the main agent called last step.
            code_exec_summary: execution summary for code agent.
            action_dict: extracted action from generator output.
        Returns:
            reflection_info (Dict): all the info related to reflection; always
            contains a "hint" sub-dict.
        """
        # BUGFIX: avoid a shared mutable default argument.
        if action_dict is None:
            action_dict = {}
        if self.memoryer_level == 0:
            # Reflection disabled: return an empty, hint-complete payload.
            return {
                "reflection": None,
                "reflection_thoughts": None,
                "existing_knowledge": None,
                "is_milestone": False,
                "new_knowledge": None,
                "step_summary": None,
                "hint": {
                    "gui_operation_error": False,
                    "lack_of_tutorial": False,
                    "code_error": False,
                    "loop_detection": None,
                }
            }
        reflection = None
        reflection_thought = None
        if len(self.trajectory) == 0:
            # First call: record the initial screen, no reflection yet.
            step_behavior = StepBehavior(
                True,
                "The initial screen is provided. No action has been taken yet.",
                "The initial screen is provided. No action has been taken yet.",
                cur_obs,
                action_dict
            )
            step_behavior._update_phash_ssim(self.trajectory)
            self._update_trajectory(step_behavior)
            reflection_info = {
                "reflection": reflection,
                "reflection_thoughts": reflection_thought,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": True,
                "new_knowledge": "",
                "step_summary": "",
                # Kept for backward compatibility with earlier consumers.
                "loop_detection": None,
                # BUGFIX/consistency: the other branches always expose a
                # "hint" dict; without it, callers reading
                # reflection_info["hint"] crashed on the very first step.
                "hint": {
                    "gui_operation_error": False,
                    "lack_of_tutorial": False,
                    "code_error": False,
                    "loop_detection": None,
                }
            }
        else:
            ### Step Summary
            prev_obs = self.trajectory[-1].obs
            enhanced_obs = None
            if coordinates:
                enhanced_obs, _, _, _, _ = enhance_observation(
                    prev_obs["screenshot"],
                    coordinates,
                    draw=True
                )
            # generate step behavior
            step_behavior, last_gui_check = self._summarize_step_behavior(
                generator_output,
                cur_obs,
                enhanced_obs,
                False,
                mode,
                code_exec_summary,
                action_dict
            )
            step_behavior._update_phash_ssim(self.trajectory)
            ### make additional hints
            additional_hints = []
            if not last_gui_check:
                additional_hints.append(f"\t- Warning: The last GUI operation is unsuccessful. Careful review is required to avoid GUI Operation Error.")
            code_error_hint = False
            # NOTE(review): len(self.trajectory) - self.last_code_step_idx is
            # always >= 0 once a code step exists, so this condition can never
            # be true; it probably intends a small window (e.g. "< 3") after a
            # Code Agent step — confirm intent before changing.
            if self.last_code_step_idx != -1 and len(self.trajectory) - self.last_code_step_idx < 0:
                code_error_hint = True
                additional_hints.append(f"\t- Warning: The Computer Use Agent might in the verification stage of Code Agent. Careful review is required to avoid Code Error.")
            # loop detection
            from mm_agents.os_symphony.utils.loop_detection import detect_loop
            is_loop, loop_details = detect_loop(full_trajectory=self.trajectory, N=3)
            if is_loop and loop_details:
                match_sequence_indices = loop_details["match_sequence_indices"]
                loop_hint_message = f"\t- Warning: A potential LOOP has been detected between Step {match_sequence_indices[0]} and Step {match_sequence_indices[-1]}. Careful review is required to avoid Repetitive Behavior Error."
                additional_hints.append(loop_hint_message)
            self.reflection_agent.reset()
            updated_sys_prompt = (
                PROCEDURAL_MEMORY.REFLECTION_SYSTEM_PROMPT + "\n\n" +
                f"---\n- **user instruction**: {self.instruction}\n" +
                "- **existing knowledge**: \n" + "\n".join(self.knowledge_base) +
                "\n- **additional_hints**: " + "\n".join(additional_hints) + "\n---"
            )
            # update system prompt
            self.reflection_agent.add_system_prompt(updated_sys_prompt)
            # Replay the trajectory; only active indices get their screenshot.
            for i, step in enumerate(self.trajectory):
                text_content = f"""### (Step {i}) history:\nsummary: '''\n{step.summary}\n'''"""
                if i in self.active_img_idx:
                    if i == 0:
                        text_content += f"\ninitial screenshot:"
                    else:
                        text_content += f"\nscreenshot (after executing action): (attached below)"
                self.reflection_agent.add_message(
                    text_content=text_content,
                    image_content=step.obs['screenshot'] if i in self.active_img_idx else None,
                    role="user",
                    put_text_last=False
                )
            text_content = f"""### (Last Step) CUA's output (has been finished):\n---\n{generator_output}\n---\nStep Summary:\n---\n{step_behavior.summary}\n---\nlatest_screenshot: (attached below)"""
            self.reflection_agent.add_message(
                text_content=text_content,
                image_content=cur_obs['screenshot'],
                role="user",
                put_text_last=False
            )
            required_fields = ["is_milestone", "reflection", "knowledge"]
            format_checkers = [
                partial(JSON_ANSWER_FORMATTER, required_fields)
            ]
            full_response = call_llm_formatted(
                self.reflection_agent,
                format_checkers
            )
            reflection_thought = full_response
            response = parse_code_from_string(full_response)
            try:
                data = json.loads(response)
                reflection = data['reflection']
                is_milestone = data["is_milestone"]
                knowledge = data['knowledge']
            except Exception as e:
                # Fall back to using the raw response as the reflection.
                print("[RMA] Errors in dealing with reflection: ", e)
                logger.info("Response is not a JSON object or miss required keys!")
                reflection = response
                is_milestone = False
                knowledge = ""
            if len(knowledge) > 0:
                self.knowledge_base.append(knowledge)
            if isinstance(is_milestone, str):
                is_milestone = "true" in is_milestone.lower()
            # update trajectory and is_milestone
            self._update_trajectory(step_behavior)
            if mode == "gui":  # only gui operation can be considered as milestone
                self.trajectory[-1].is_milestone = is_milestone
            reflection_info = {
                "reflection": reflection,
                "reflection_thoughts": reflection_thought,
                "existing_knowledge": "\n".join(self.knowledge_base),
                "is_milestone": is_milestone,
                "new_knowledge": knowledge,
                "step_summary": step_behavior.summary,
                "hint": {
                    "gui_operation_error": not last_gui_check,
                    "lack_of_tutorial": is_loop,
                    "code_error": code_error_hint,
                    "loop_detection": loop_details,
                }
            }
        return reflection_info

View File

@@ -0,0 +1,178 @@
import re
from io import BytesIO
from typing import Tuple, List, Dict
from PIL import Image, ImageDraw, ImageFont
import numpy as np
import pytesseract
from pytesseract import Output
import easyocr
class OCRProcessor:
    """
    OCR processor supporting both Tesseract and EasyOCR backends.
    """
    def __init__(self, use_gpu: bool = False, languages: List[str] = ['en']):
        """
        Initialize processor.

        Args:
            use_gpu (bool): whether EasyOCR should use the GPU
            languages (List[str]): language list for EasyOCR, e.g. ['en', 'ch_sim']

        NOTE(review): ``languages=['en']`` is a mutable default argument; it is
        never mutated here, but a tuple would be safer.
        """
        self.use_gpu = use_gpu
        self.languages = languages
        self.reader = None # lazy-load EasyOCR Reader
    def _get_easyocr_reader(self):
        # Lazily construct (and cache) the EasyOCR Reader — model loading is slow.
        if self.reader is None:
            print(f"Loading EasyOCR model (GPU={self.use_gpu})...")
            self.reader = easyocr.Reader(self.languages, gpu=self.use_gpu)
        return self.reader
    def get_ocr_elements(self, bytes_image_data: bytes, mode: str = 'tesseract') -> Tuple[str, List[Dict]]:
        """
        Executes OCR recognition.

        Args:
            bytes_image_data (bytes): raw encoded image bytes
            mode (str): 'tesseract' (faster) or 'easyocr' (more precise)
        Returns:
            Tuple[str, List]: (textual table string, list of element details);
            ("", []) when the image cannot be decoded.
        Raises:
            ValueError: for an unknown mode.
        """
        try:
            image = Image.open(BytesIO(bytes_image_data))
        except Exception as e:
            print(f"Error decoding or opening image: {e}")
            return "", []
        if mode == 'tesseract':
            return self._process_tesseract(image)
        elif mode == 'easyocr':
            return self._process_easyocr(image)
        else:
            raise ValueError(f"Unknown mode: {mode}. Use 'tesseract' or 'easyocr'.")
    def _process_tesseract(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """Tesseract processing: returns (id/text table, element dicts with boxes)."""
        data = pytesseract.image_to_data(image, output_type=Output.DICT)
        ocr_elements = []
        ocr_table = "Text Table (Tesseract):\nWord id\tText\n"
        ocr_id = 0
        num_boxes = len(data['text'])
        for i in range(num_boxes):
            # filter text with low confidence
            if int(data['conf'][i]) > 0 and data['text'][i].strip():
                # strip leading/trailing symbols outside the allowed charset
                clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", data['text'][i])
                if not clean_text: continue
                ocr_table += f"{ocr_id}\t{clean_text}\n"
                ocr_elements.append({
                    "id": ocr_id,
                    "text": clean_text,
                    "mode": "tesseract",
                    "left": data["left"][i],
                    "top": data["top"][i],
                    "width": data["width"][i],
                    "height": data["height"][i],
                    "conf": data["conf"][i]
                })
                ocr_id += 1
        return ocr_table, ocr_elements
    def _process_easyocr(self, image: Image.Image) -> Tuple[str, List[Dict]]:
        """EasyOCR processing: returns (id/text table, element dicts with boxes)."""
        reader = self._get_easyocr_reader()
        image_np = np.array(image)
        # detail=1 means returning (bbox, text, conf)
        results = reader.readtext(image_np, detail=1, paragraph=False, width_ths=0.1)
        ocr_elements = []
        ocr_table = "Text Table (EasyOCR):\nWord id\tText\n"
        ocr_id = 0
        for (bbox, text, conf) in results:
            clean_text = re.sub(r"^[^a-zA-Z0-9\s.,!?;:\-\+]+|[^a-zA-Z0-9\s.,!?;:\-\+]+$", "", text)
            if not clean_text.strip(): continue
            # EasyOCR returns [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]
            # we convert them into left, top, width, height
            (tl, tr, br, bl) = bbox
            tl = [int(v) for v in tl]
            br = [int(v) for v in br]
            left = min(tl[0], bl[0])
            top = min(tl[1], tr[1])
            right = max(tr[0], br[0])
            bottom = max(bl[1], br[1])
            width = right - left
            height = bottom - top
            # ---------------
            ocr_table += f"{ocr_id}\t{clean_text}\n"
            ocr_elements.append({
                "id": ocr_id,
                "text": clean_text,
                "mode": "easyocr",
                "left": left,
                "top": top,
                "width": width,
                "height": height,
                "conf": float(conf)
            })
            ocr_id += 1
        return ocr_table, ocr_elements
    @staticmethod
    def visualize_ocr_results(image_path: str, ocr_elements: List[Dict], output_path: str):
        """
        Draw bounding boxes and IDs on the original image and save it.

        Green boxes mark EasyOCR elements, red boxes Tesseract ones. Errors
        are reported on stdout rather than raised.
        """
        try:
            image = Image.open(image_path).convert("RGB")
            draw = ImageDraw.Draw(image)
            try:
                font = ImageFont.truetype("arial.ttf", 16)
            except IOError:
                font = ImageFont.load_default()
            for element in ocr_elements:
                left, top = element["left"], element["top"]
                width, height = element["width"], element["height"]
                color = "green" if element.get("mode") == "easyocr" else "red"
                draw.rectangle([(left, top), (left + width, top + height)], outline=color, width=2)
                text_str = str(element["id"])
                # textbbox exists on modern Pillow; textsize is the legacy fallback
                if hasattr(draw, "textbbox"):
                    bbox = draw.textbbox((0, 0), text_str, font=font)
                    text_w, text_h = bbox[2]-bbox[0], bbox[3]-bbox[1]
                else:
                    text_w, text_h = draw.textsize(text_str, font=font)
                label_bg = [left, top - text_h - 4, left + text_w + 4, top]
                draw.rectangle(label_bg, fill=color)
                draw.text((left + 2, top - text_h - 4), text_str, fill="white", font=font)
            image.save(output_path)
            print(f"Visualization saved to: {output_path}")
        except FileNotFoundError:
            print(f"Error: Image {image_path} not found.")
        except Exception as e:
            print(f"Visualization error: {e}")

View File

@@ -0,0 +1,575 @@
import re
from typing import Any, Dict, List, Optional, Tuple
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.utils.common_utils import call_llm_safe
from mm_agents.os_symphony.agents.coder_agent import CoderAgent
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
from mm_agents.os_symphony.agents.searcher_agent import SearcherAgent
import logging
from mm_agents.os_symphony.agents.ocr import OCRProcessor
logger = logging.getLogger("desktopenv.agent")
# Agent action decorator
def agent_action(func):
    """Mark *func* as an exposed agent action by tagging it with ``is_agent_action``."""
    setattr(func, "is_agent_action", True)
    return func
# GrounderAgent primitives are parameterized by description, and coordinate generation uses a pretrained grounding model
class OSACI:
    """Agent-computer interface that turns high-level actions into pyautogui code.

    Wires together the grounding, OCR, coding and searching sub-agents and
    exposes @agent_action methods, each returning a (command, action) pair:
    the python/pyautogui command string to run in the environment plus a
    structured action record for logging.
    """

    def __init__(
        self,
        env,
        search_env,
        platform: str,
        client_password: str,
        engine_params_for_ocr: Dict,
        engine_params_for_grounder: Dict,
        engine_params_for_coder: Dict,
        engine_params_for_searcher: Dict,
        screen_width: int = 1920,
        screen_height: int = 1080
    ):
        """Create the ACI and its sub-agents.

        Args:
            env: Desktop environment under control (its .controller is used by
                the code agent).
            search_env: Isolated environment handed to the searcher agent.
            platform: "linux", "macos" or "windows".
            client_password: sudo password of the controlled machine.
            engine_params_for_ocr: LLM config for the text-span agent.
            engine_params_for_grounder: LLM config for the grounding model.
            engine_params_for_coder: LLM config for the code agent.
            engine_params_for_searcher: LLM config for the searcher agent.
            screen_width: Screen width passed to the grounder.
            screen_height: Screen height passed to the grounder.
        """
        self.env = env
        self.platform = platform
        self.client_password = client_password
        self.result_dir = ""
        self.grounder_agent = GrounderAgent(engine_params=engine_params_for_grounder, screen_width=screen_width, screen_height=screen_height)
        # Configure text grounding agent
        self.ocr_processor = OCRProcessor()
        self.text_span_agent = LMMAgent(
            engine_params=engine_params_for_ocr,
            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,
        )
        # Configure code agent
        self.coder_agent = CoderAgent(
            engine_params=engine_params_for_coder,
            platform=self.platform,
            client_password=client_password
        )
        # Configure search agent
        self.searcher_agent = SearcherAgent.create(
            engine_params=engine_params_for_searcher,
            search_env=search_env,
            grounder_agent=self.grounder_agent,
            platform=self.platform,
            client_password=self.client_password
        )
        # Store task instruction for code agent
        self.current_task_instruction = None
        # Results of the most recent sub-agent invocations, read by the worker.
        self.last_code_agent_result = None
        self.last_search_agent_result = None
        self.notes: List[str] = []
        # Tutorial should be a global info, not a local context, so how to add it to the global info
        self.tutorials = []
def assign_screenshot(self, obs):
    """Cache the most recent observation for subsequent grounding calls."""
    setattr(self, "obs", obs)
# Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
def generate_text_coords(
    self, phrase: str, obs: Dict, alignment: str = ""
) -> List[int]:
    """Locate a text phrase on screen and return pixel coordinates for it.

    Runs EasyOCR over the screenshot, asks the text-span LLM which OCR word
    matches *phrase*, then converts that word's bounding box into a point.

    Args:
        phrase: Text to locate on the screenshot.
        obs: Observation dict; only obs["screenshot"] is used.
        alignment: "start" -> left edge of the word, "end" -> right edge
            (nudged right by 0.15 * height so the cursor lands after the last
            glyph), anything else -> horizontal centre of the word.

    Returns:
        [x, y] integer screen coordinates.
    """
    screenshot, global_offset_x, global_offset_y = obs["screenshot"], 0, 0
    ocr_table, ocr_elements = self.ocr_processor.get_ocr_elements(screenshot, "easyocr")
    alignment_prompt = ""
    if alignment == "start":
        alignment_prompt = "**Important**: Output the word id of the FIRST word in the provided phrase.\n"
    elif alignment == "end":
        alignment_prompt = "**Important**: Output the word id of the LAST word in the provided phrase.\n"
    # Load LLM prompt
    self.text_span_agent.reset()
    self.text_span_agent.add_message(
        alignment_prompt + "Phrase: " + phrase + "\n" + ocr_table, role="user"
    )
    self.text_span_agent.add_message(
        "Screenshot:\n", image_content=screenshot, role="user"
    )
    # Obtain the target element: the model answers with a word id; take the
    # last number in the reply and fall back to word 0 when none is found.
    response = call_llm_safe(self.text_span_agent)
    print("TEXT SPAN AGENT RESPONSE:", response)
    numericals = re.findall(r"\d+", response)
    text_id = int(numericals[-1]) if numericals else 0
    # Guard against hallucinated ids outside the OCR table.
    if not 0 <= text_id < len(ocr_elements):
        text_id = 0
    elem = ocr_elements[text_id]
    # Compute the element coordinates.
    # Note: 0.15 * elem["height"] nudges the point right so the cursor lands
    # after the last character of the word.
    if alignment == "start":
        coords = [elem["left"], elem["top"] + (elem["height"] // 2)]
    elif alignment == "end":
        coords = [elem["left"] + elem["width"] + 0.15 * elem["height"], elem["top"] + (elem["height"] // 2)]
    else:
        # Bug fix: previously any other alignment (including the default "")
        # left `coords` unbound and raised NameError. Use the word centre.
        coords = [elem["left"] + elem["width"] // 2, elem["top"] + (elem["height"] // 2)]
    print(f'[OCR] output coordinates: {[coords[0] + global_offset_x, coords[1] + global_offset_y]}')
    return [int(coords[0] + global_offset_x), int(coords[1] + global_offset_y)]
def set_task_instruction(self, task_instruction: str):
    """Record the task instruction that will later be forwarded to the code agent."""
    setattr(self, "current_task_instruction", task_instruction)
@agent_action
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
    hold_keys: List = []
):
    """Click on the element
    Args:
        element_description:str, a detailed descriptions of which element to click on. This description needs to be VERY unambiguous. If the page contains many similar elements, ensure the description uniquely identifies the target element.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
        hold_keys:List, list of keys to hold while clicking
    """
    # Resolve the description to screen coordinates via the grounding model.
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    command = "import pyautogui; "
    for k in hold_keys:
        command += f"pyautogui.keyDown({repr(k)}); "
    # Fix: this statement previously repeated "import pyautogui;".
    command += f"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); "
    for k in hold_keys:
        command += f"pyautogui.keyUp({repr(k)}); "
    # Return pyautogui code to click on the element
    action = {"function": "click", "args": {"x": x, "y": y, "button": button_type, "clicks": num_clicks}}
    return (command, action)
@agent_action
def open(self, app_or_filename: str):
    """Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.
    Args:
        app_or_filename:str, the name of the application or filename to open
    **Important**:
    Provide only the name of the application or file. Do not include the full path (e.g., "/home/user/Desktop/my_report.docx"). The function works by searching for the name, not by accessing a file path directly.
    """
    action = {"function": "open", "args": {"name": app_or_filename}}
    if self.platform == "linux":
        # Fix: the generated script calls time.sleep but previously never
        # imported time, so it crashed with NameError in the environment.
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(1.0); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(1.0)", action)
    elif self.platform == "macos":
        return (f"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)", action)
    elif self.platform == "windows":
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.press('enter'); time.sleep(0.5)", action)
    else:
        # Fix: `assert False` disappears under `python -O`; raise explicitly.
        # (The old message also listed "darwin" while the check uses "macos".)
        raise ValueError(
            f"Unsupported platform: {self.platform}. Supported platforms are: macos, linux, windows."
        )
def _paste(self, is_terminal):
    """Return the pyautogui snippet that pastes the clipboard on this platform.

    Linux terminals need ctrl+shift+v; an unknown platform yields "".
    """
    if self.platform == 'macos':
        return "pyautogui.hotkey('command', 'v');"
    if self.platform == 'linux':
        return (
            "pyautogui.hotkey('ctrl', 'shift', 'v');"
            if is_terminal
            else "pyautogui.hotkey('ctrl', 'v');"
        )
    if self.platform == 'windows':
        return "pyautogui.hotkey('ctrl', 'v');"
    return ""
def _clear_all(self, is_terminal):
    """
    Return a snippet that clears the current input: select-all + delete in
    GUI apps, line-kill shortcuts in terminals.
    """
    if is_terminal:
        # Terminal: Windows clears with Esc, POSIX shells with ctrl+e/ctrl+u.
        if self.platform == 'windows':
            return "pyautogui.press('esc');"
        return "pyautogui.hotkey('ctrl', 'e'); pyautogui.hotkey('ctrl', 'u');"
    # GUI application: select everything, then delete it.
    if self.platform == 'macos':
        return "pyautogui.hotkey('command', 'a'); pyautogui.press('backspace');"
    return "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace');"
def _type(
    self,
    text: str,
    is_terminal: bool
):
    """
    Build the snippet that enters *text*: a clipboard round-trip for
    non-ASCII text on Windows/Linux, plain pyautogui.write otherwise.
    """
    uses_clipboard = any(ord(ch) > 127 for ch in text) and self.platform != "macos"
    if not uses_clipboard:
        return f"pyautogui.write({repr(text)}, interval=0.1);"
    snippet = (
        "original_clipboard = pyperclip.paste();"
        f"pyperclip.copy({repr(text)});"
        "time.sleep(0.1);"
    )
    snippet += self._paste(is_terminal=is_terminal)
    # Restore whatever the user had on the clipboard before we borrowed it.
    snippet += "pyperclip.copy(original_clipboard);"
    return snippet
@agent_action
def type(
    self,
    element_description: Optional[str],
    text: str = "",
    overwrite: bool = False,
    enter: bool = False,
    is_terminal = False
):
    """Type text/unicode into a specific element
    Args:
        element_description: str, a detailed description of which element to enter text in. If provided, the agent will click on this element before typing.
        text:str, the text to type
        overwrite:bool, Default is False, assign it to True if the text should overwrite the whole existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing all the text, otherwise assign it to False.
        is_terminal:bool, (MANDATORY) You MUST set this to True whenever the target you will type into is a terminal.
    """
    # Imports for the generated script; pyperclip backs the clipboard-paste
    # path that _type() uses for non-ASCII text.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
    )
    if self.platform == "linux":
        # pyperclip needs xclip/xsel on Linux; install them first, forwarding
        # any http(s) proxy configured in the VM's environment to apt-get.
        commands += (
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
        )
    x, y = None, None
    if element_description is not None:
        # Ground the description, then double-click + click to focus the field.
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        commands += (
            f"pyautogui.click({x}, {y}, clicks=2);"
            f"time.sleep(1.0);"
            f"pyautogui.click({x}, {y});"
        )
    if overwrite:
        # Clear the existing content before typing the new text.
        commands += self._clear_all(is_terminal=is_terminal)
    commands += self._type(text=text, is_terminal=is_terminal)
    if enter:
        commands += "pyautogui.press('enter');"
    # Only include coordinates in the action record when an element was grounded.
    if element_description is not None:
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "type", "args": {"text": text}}
    return (commands, action)
@agent_action
def drag_and_drop(
    self, starting_description: str, ending_description: str, hold_keys: List = []
):
    """Drag from the starting description to the ending description
    Args:
        starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
        ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
        hold_keys:List list of keys to hold while dragging
    """
    # Ground both endpoints before assembling the script.
    start_x, start_y = self.grounder_agent.generate_coords(starting_description, self.obs)
    end_x, end_y = self.grounder_agent.generate_coords(ending_description, self.obs)
    parts = ["import pyautogui; ", f"pyautogui.moveTo({start_x}, {start_y}); "]
    # TODO: specified duration?
    parts.extend(f"pyautogui.keyDown({repr(key)}); " for key in hold_keys)
    parts.append(f"pyautogui.dragTo({end_x}, {end_y}, duration=3., button='left'); pyautogui.mouseUp(); ")
    parts.extend(f"pyautogui.keyUp({repr(key)}); " for key in hold_keys)
    # Return pyautoguicode to drag and drop the elements
    action = {"function": "drag", "args": {"x1": start_x, "y1": start_y, "x2": end_x, "y2": end_y}}
    return ("".join(parts), action)
@agent_action
def highlight_text_span(
    self,
    starting_phrase: str,
    ending_phrase: str,
    button: str = "left",
    text: Optional[str] = None
):
    """Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.
    Args:
        starting_phrase: str, the sequence of words that marks the beginning of the text span. Provide a unique sequence of 5 to 10 words.
        ending_phrase: str, the sequence of words that marks the end of the text span. Provide a unique sequence of 5 to 10 words.
        button:str, the button to use to highlight the text span. Defaults to "left". Can be "left", "right", or "middle".
        text: str | None, The text to overwrite the highlighted span with. Providing text here ensures the replacement happens immediately after selection, preventing focus loss.
    """
    # Resolve both phrase anchors to pixel coordinates via OCR.
    x1, y1 = self.generate_text_coords(
        starting_phrase, self.obs, alignment="start"
    )
    x2, y2 = self.generate_text_coords(
        ending_phrase, self.obs, alignment="end"
    )
    # Fix: subprocess/pyperclip/os are used below when `text` is given but
    # were not imported by the generated script (NameError at run time).
    command = "import pyautogui; import time; import os; import subprocess; import pyperclip;"
    command += f"pyautogui.moveTo({x1}, {y1}); "
    # Click in advance to simulate selecting the text box.
    command += (
        f"pyautogui.click({x1}, {y1}, clicks=2);"
        f"time.sleep(1.0); pyautogui.click({x1}, {y1}); time.sleep(1.0);"
    )
    command += f"pyautogui.dragTo({x2}, {y2}, duration=5., button='{button}'); time.sleep(0.5); pyautogui.mouseUp(); "
    if text:
        if self.platform == "linux":
            # Fix: use the configured client password and the environment's
            # proxy settings instead of the hard-coded "password" and proxy
            # address (now consistent with the `type` action).
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )
        # Clipboard round-trip: paste the replacement, then restore the clipboard.
        command += (
            "original_clipboard = pyperclip.paste();"
            f"pyperclip.copy({repr(text)});"
        )
        command += self._paste(is_terminal=False)
        command += "pyperclip.copy(original_clipboard);"
    # Return pyautoguicode to drag and drop the elements
    action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
    return (command, action)
@agent_action
def locate_cursor(
    self,
    phrase: str,
    start_or_end: str = "start",
    text: Optional[str] = None
):
    """Click at the beginning or end of a specific text phrase to precisely control cursor positioning. Please prefer using the "click" action in general situations, and use this action only in text-intensive software such as libreoffice_writer, impress, etc.
    Args:
        phrase: str, The text phrase where you want to position the cursor. Provide a unique sequence of 5 to 10 words. Do NOT use single words unless the total text is extremely short.
        start_or_end: str, Whether to click at the "start" (beginning) or "end" (trailing edge) of the identified text phrase. Use "start" to position before the text, "end" to position after it.
        text: str | None, The text to enter immediately after positioning the cursor. Use this parameter instead of a separate 'type' action to ensure precise input.
    """
    # OCR-ground the phrase to a precise pixel position.
    x, y = self.generate_text_coords(
        phrase, self.obs, alignment=start_or_end
    )
    # Double-click then single-click to focus the target before positioning.
    command = (
        "import pyautogui;"
        "import time;"
        "import os;"
        "import subprocess;"
        "import pyperclip;"
        f"pyautogui.click({x}, {y}, button='left', clicks=2);"
        "time.sleep(1.0);"
        f"pyautogui.click({x}, {y}, button='left');"
    )
    if text:
        if self.platform == "linux":
            # Fix: use the configured client password and the environment's
            # proxy settings instead of the hard-coded "password" and proxy
            # address (now consistent with the `type` action).
            command += (
                "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
                "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
                "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
                f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
            )
        command += self._type(text=text, is_terminal=False)
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "click", "args": {"x": x, "y": y, "clicks": 1, "button": "left"}}
    return (command, action)
@agent_action
def call_code_agent(self, task: str):
    """Calls the code agent to execute a well-defined, self-contained goal that can be completed with code.
    Args:
        task: str, A specific, self-contained goal that the code agent can work on until completion.
    **🚨 CRITICAL GUIDELINES:**
    **Decompose the Main Objective into Logical Goals:**
    - You **MUST** break down the overall mission into distinct, logical goals or stages.
    - Your role is to define *what* needs to be done for a specific stage. The code agent's role is to figure out *how* to do it with code.
    - Pass only one logical goal at a time. The `task` parameter is **REQUIRED**.
    **Define a Self-Contained, Continuous Goal:**
    - The `task` you provide should be a single, continuous goal. The code agent is capable of handling a multi-step process internally (e.g., opening a file, processing its data, and then saving it) to achieve this one goal.
    - **Crucially, do not pass a task that combines multiple distinct objectives.** For example, instead of passing "Analyze the sales data, AND email the result," you should first pass the self-contained goal: "Analyze the sales data." After that goal is complete, you can proceed with the next logical goal (e.g., emailing the result) in a subsequent step.
    - **If unsure, err on the side of caution.** If a task feels like it has two separate parts, break it down and pass only the first part.
    - Your instruction must describe the desired end-state, NOT the recipe to get there. Do not specify any solution!
    **Goal Purity is Essential:**
    - **NEVER** rephrase, paraphrase, or modify the subtask instruction you have decided on. Pass the exact, original wording of the subtask to prevent instruction drift and hallucination.
    Use this for tasks that can be fully accomplished through code execution, particularly for:
    - Spreadsheet applications: data processing, filtering, sorting, calculations, formulas, data analysis
    - Document editors: text processing, content editing, formatting, document manipulation
    - Code editors: code editing, file processing, text manipulation, configuration
    - Data analysis tools: statistical analysis, data transformation, reporting
    - File management: bulk operations, file processing, content extraction
    - System utilities: configuration, setup, automation
    """
    logger.info("=" * 50)
    logger.info("ACI: Calling Code Agent")
    logger.info("=" * 50)
    task_to_execute = task
    logger.info(f"Executing SUBTASK: {task_to_execute}")
    print("obs keys: ", self.obs.keys())
    # Hand the coder the current screenshot (may be empty if no obs yet).
    screenshot = self.obs.get("screenshot", "") if self.obs else ""
    logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}")
    logger.info("Executing code agent...")
    result = self.coder_agent.execute(
        task_to_execute, screenshot, self.env.controller
    )
    # Store the result for the worker to access
    self.last_code_agent_result = result
    logger.info("Code agent execution completed")
    logger.info(f"Result - Completion reason: {result['completion_reason']}")
    logger.info(f"Steps executed: {result['steps_executed']}")
    logger.info(f"Summary: {result['summary']}")
    logger.info("=" * 50)
    logger.info("GROUNDING AGENT: Code Agent Call Finished")
    logger.info("=" * 50)
    action = {"function": "call_code_agent", "args": {"query": task, "result": True if result["completion_reason"] == "DONE" else False}}
    # Return code to be executed in the environment: a no-op sleep, since the
    # coder already ran its scripts directly via env.controller.
    return ("import time; time.sleep(2.222)", action)
@agent_action
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction
    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    action = {"function": "scroll", "args": {"x": x, "y": y, "clicks": clicks, "shift": shift}}
    # Hover over the target first, then emit a horizontal or vertical scroll.
    scroll_call = f"pyautogui.hscroll({clicks})" if shift else f"pyautogui.vscroll({clicks})"
    command = f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); {scroll_call}"
    return (command, action)
@agent_action
def hotkey(self, keys: List):
    """Press a hotkey combination (can press a single key as well)
    Args:
        keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # Quote each key for the generated script; repr also escapes embedded
    # quotes, which the previous manual f"'{key}'" wrapping did not.
    quoted = [repr(key) for key in keys]
    # Fix: log the raw key names (consistent with hold_and_press) rather than
    # their quoted forms.
    action = {"function": "key", "args": {"keys": " ".join(keys)}}
    return (f"import pyautogui; pyautogui.hotkey({', '.join(quoted)});", action)
@agent_action
def hold_and_press(self, hold_keys: List, press_keys: List):
    """Hold a list of keys and press a list of keys
    Args:
        hold_keys:List, list of keys to hold
        press_keys:List, list of keys to press in a sequence
    """
    press_keys_str = "[" + ", ".join([f"'{key}'" for key in press_keys]) + "]"
    # Assemble: key-downs, the press sequence, then matching key-ups.
    fragments = ["import pyautogui; "]
    fragments.extend(f"pyautogui.keyDown({repr(held)}); " for held in hold_keys)
    fragments.append(f"pyautogui.press({press_keys_str}); ")
    fragments.extend(f"pyautogui.keyUp({repr(held)}); " for held in hold_keys)
    action = {"function": "key", "args": {"keys": " ".join(hold_keys) + ";" + " ".join(press_keys)}}
    return ("".join(fragments), action)
@agent_action
def wait(self, time: float):
    """Wait for a specified amount of time
    Args:
        time:float, the amount of time to wait in seconds
    """
    # NOTE: the parameter is deliberately named `time` (agent-facing spec);
    # the generated snippet imports the time module in its own namespace.
    return (f"import time; time.sleep({time});", {"function": "wait", "args": {}})
@agent_action
def done(
    self,
):
    """
    End the current task with a success. Use this when you believe the entire task has been fully completed. You must ensure all visual information aligns with the user's true intent.
    """
    # "DONE" is the sentinel the executor loop recognises as success.
    return ("DONE", {"function": "done", "args": {}})
@agent_action
def fail(self):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete."""
    # "FAIL" is the sentinel the executor loop recognises as giving up.
    return ("FAIL", {"function": "fail", "args": {}})
@agent_action
def call_search_agent(
    self,
    query: str,
):
    """
    Calls a specialized 'Searcher Agent' to find a detailed, step-by-step tutorial on the internet for a specific GUI action.
    Args:
        query:str, the search phrase or question for the tutorial. The formulation of this query is critical for success and must follow the guidelines below.
    **Query Formulation Guidelines:**
    Your query must be a well-defined question targeting a **single, specific action** within a **specific application**. To get the best results, adhere to these rules:
    1. **Start with "How to":** Your query must begin with the phrase "How to" to frame it as a request for instructions.
    2. **Include the Application Name:** Always specify the name of the software you are working in (e.g., "GIMP", "Google Chrome", "Libreoffice Writer").
    3. **Focus on a Single Intent:** The query should represent one clear goal. Do not combine multiple steps or tasks into one query.
    4. **Be Specific, Not Abstract:** Ask a concrete question. Avoid repeating the user's high-level or abstract instructions.
    5. **Decompose Complex Tasks:** If the user's overall instruction involves multiple actions (e.g., "download a file and then email it"), and you are stuck on one part, search *only for that specific part*.
    **Examples:**
    * **User's Overall Instruction:** "Please help me download my latest bank statement and then send it to my accountant."
    * **Correct Query (if stuck on downloading):** "How to download a bank statement from the Bank of America website?"
    * **Correct Query (if stuck on attaching a file):** "How to attach a file to an email in Gmail?"
    * **Incorrect Query:** "Download my bank statement and email it to my accountant" *(This query is too broad, contains multiple sub-tasks, and does not start with "How to".)*
    """
    logger.info("=" * 50)
    logger.info(f"ACI: Calling Search Agent(query={query})")
    logger.info("=" * 50)
    # Let the searcher write artifacts under the current task's result dir.
    self.searcher_agent.result_dir = self.result_dir
    result = self.searcher_agent.search(query=query, main_obs=self.obs)
    self.last_search_agent_result = result
    if result["completion_reason"] == "DONE":
        # Successful searches contribute a tutorial to the global pool.
        self.tutorials.append(result["final_answer"])
    action = {"function": "call_search_agent", "args": {"query": query, "result": True if result["completion_reason"] == "DONE" else False}}
    # The search ran out-of-band; return a no-op sleep for the environment.
    return ("import time; time.sleep(2.222)", action)

View File

@@ -0,0 +1,61 @@
import logging
import platform
from typing import Dict, List, Tuple
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.agents.searcher_agent import VLMSearcherAgent
from mm_agents.os_symphony.agents.worker import Worker
logger = logging.getLogger("desktopenv.agent")
class OSSymphony:
    """Top-level agent: delegates each prediction step to a Worker executor."""

    def __init__(
        self,
        engine_params_for_orchestrator: Dict,
        engine_params_for_memoryer: Dict,
        os_aci: OSACI,
        platform: str = platform.system().lower(),
        client_password: str = "",
        max_trajectory_length: int = 8,
        enable_reflection: bool = True,
    ):
        """
        Args:
            engine_params_for_orchestrator: LLM engine config for the worker/orchestrator agent.
            engine_params_for_memoryer: LLM engine config for the memory agent.
            os_aci: Instance of the OSACI class for UI interaction.
            platform: Operating system platform (darwin, linux, windows);
                note the default is evaluated once, at import time.
            client_password: sudo password of the controlled machine.
            max_trajectory_length: Maximum number of image turns to keep.
            enable_reflection: Creates a reflection agent to assist the worker agent.
        """
        self.engine_params_for_orchestrator = engine_params_for_orchestrator
        self.engine_params_for_memoryer = engine_params_for_memoryer
        self.os_aci: OSACI = os_aci
        self.platform = platform
        self.client_password = client_password
        self.max_trajectory_length = max_trajectory_length
        self.enable_reflection = enable_reflection
def reset(self, result_dir) -> None:
    """Reset agent state and initialize components.

    Args:
        result_dir: Directory for this task's artifacts (e.g. search
            results); forwarded to the ACI.
    """
    # Reset the search time per task
    self.os_aci.result_dir = result_dir
    # Fresh Worker per task: discards any prior trajectory/reflection state.
    self.executor = Worker(
        engine_params_for_orchestrator=self.engine_params_for_orchestrator,
        engine_params_for_memoryer=self.engine_params_for_memoryer,
        os_aci=self.os_aci,
        platform=self.platform,
        client_password=self.client_password,
        max_trajectory_length=self.max_trajectory_length,
        enable_reflection=self.enable_reflection,
    )
def predict(self, instruction: str, observation: Dict, is_last_step: bool) -> Tuple[Dict, List[str]]:
    """Run one worker step and return its info dict plus the actions to execute.

    Args:
        instruction: Natural-language task instruction.
        observation: Current environment observation (screenshot etc.).
        is_last_step: True when this is the final allowed step.

    Returns:
        (info, actions): a copy of the worker's metadata dict and the
        action strings to run in the environment.
    """
    executor_info, actions = self.executor.generate_next_action(
        instruction=instruction, obs=observation, is_last_step=is_last_step
    )
    # Copy the worker info into a fresh dict (the previous one-liner rebuilt
    # it through a needlessly nested dict/comprehension merge).
    info = dict(executor_info or {})
    return info, actions

View File

@@ -0,0 +1,478 @@
import logging
import urllib.parse
from typing import Any, Dict, List, Optional
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import (
draw_coordinates,
call_llm_formatted,
parse_code_from_string,
create_pyautogui_code
)
from mm_agents.os_symphony.core.mllm import LMMAgent
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
import os
import time
import json
logger = logging.getLogger("desktopenv.searcher_agent")
# Agent action decorator
def searcher_agent_action(func):
    """Tag *func* as a searcher-agent action via the ``is_searcher_agent_action`` flag."""
    setattr(func, "is_searcher_agent_action", True)
    return func
# --- Abstract Base Class and Factory ---
class SearcherAgent:
    """Base class + factory for agents that search the web for GUI tutorials."""

    def __init__(self, engine_params: Dict, platform: str):
        """
        Args:
            engine_params: LLM engine config; may carry "budget" (max search
                steps, default 20) and "type" (implementation selector).
            platform: Operating system platform string (e.g. "linux").
        """
        self.engine_params = engine_params
        self.result_dir = ""  # set by the caller before search()
        self.tutorial_or_hint = ""
        self.tutorial_notes = []
        self.max_trajectory_length = 8
        self.platform = platform
        # Maximum number of search steps before the eager-mode wrap-up.
        self.budget = engine_params.get("budget", 20)
@staticmethod
def create(engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str="password"):
    """Factory: build the searcher implementation selected by engine_params["type"]."""
    searcher_type = engine_params.get("type", "vlm")
    if searcher_type != "vlm":
        # Only the VLM-based searcher exists so far.
        raise NotImplementedError
    return VLMSearcherAgent(engine_params=engine_params, search_env=search_env, grounder_agent=grounder_agent, platform=platform, client_password=client_password)
def _get_search_time(self) -> int:
    """Next 1-based index for a search_<n> result directory under result_dir."""
    if not self.result_dir:
        return 1
    indices: list[int] = []
    try:
        if not os.path.exists(self.result_dir):
            return 1
        for entry in os.listdir(self.result_dir):
            if not entry.startswith("search_"):
                continue
            if not os.path.isdir(os.path.join(self.result_dir, entry)):
                continue
            try:
                indices.append(int(entry.split('_', 1)[1]))
            except (ValueError, IndexError):
                # Ignore directories like "search_foo" without a numeric index.
                continue
    except Exception:
        # Best effort: any filesystem hiccup just restarts the numbering.
        return 1
    return max(indices) + 1 if indices else 1
def search(self, query: str, obs) -> str:
    """
    Find a tutorial answering *query*; implemented by subclasses.

    Args:
        query: Format like "How to xxxx?", must be a detailed subtask
        obs: Current screenshot
    """
    raise NotImplementedError("Subclasses must implement the 'search' method")
class VLMSearcherAgent(SearcherAgent):
    """
    Start a new, isolated vm, and open chrome in advance
    """

    def __init__(self, engine_params: Dict, search_env, grounder_agent: GrounderAgent, platform: str, client_password: str):
        """
        Args:
            engine_params: LLM engine config; recognises "model", "engine"
                ("google"/"duckduckgo") and "budget".
            search_env: Dedicated OSWorld-style environment used for browsing.
            grounder_agent: Shared grounding model for clicking search results.
            platform: Operating system platform string.
            client_password: sudo password of the search VM.
        """
        SearcherAgent.__init__(self, engine_params=engine_params, platform=platform)
        self.grounder_agent = grounder_agent
        self.client_password = client_password
        self.env = search_env
        # These Claude models support an explicit extended-thinking mode.
        self.use_thinking = engine_params.get("model", "") in [
            "claude-opus-4-20250514",
            "claude-sonnet-4-20250514",
            "claude-3-7-sonnet-20250219",
            "claude-sonnet-4-5-20250929",
        ]
        self.engine = engine_params.get("engine", "google")
        # Reuse OSWorld's initialization script to set up Chrome, then directly perform a Google search using the query—currently, the query can be substituted by a placeholder field.
        self.task_config = {
            "id": "searcher",
            "instruction": "searcher",
            "config": [
                {
                    # Launch Chrome with remote debugging enabled.
                    "type": "launch",
                    "parameters": {
                        "command": [
                            "google-chrome",
                            "--remote-debugging-port=1337"
                        ]
                    }
                },
                {
                    # Forward port 9222 to Chrome's debugging port 1337.
                    "type": "launch",
                    "parameters": {
                        "command": [
                            "socat",
                            "tcp-listen:9222,fork",
                            "tcp:localhost:1337"
                        ]
                    }
                },
                {
                    # Placeholder URL; reset() swaps in the real search URL.
                    "type": "chrome_open_tabs",
                    "parameters": {
                        "urls_to_open": [
                            "GOOGLE_SEARCH_URL"
                        ]
                    }
                },
                {
                    "type": "activate_window",
                    "parameters": {
                        "window_name": "Google Chrome"
                    }
                }
            ],
            "proxy": True
        }
        self.obs = None
def reset(self, query):
    """Prepare a fresh search session for *query*.

    Clears per-search state, rebuilds the searcher LMM with a query-specific
    system prompt, starts the isolated VM and opens the search-results page
    for the query in Chrome.
    """
    # When the search function is invoked, a new agent is created; the environment is instantiated only upon the first call, but it must be reset on every invocation.
    self.tutorial_notes = []
    self.tutorial_or_hint = ""
    self.system_prompt = PROCEDURAL_MEMORY.construct_vlm_searcher_procedural_memory(
        agent_class=type(self)
    ).replace("CURRENT_OS", self.platform).replace("QUERY", query)
    self.searcher_agent = LMMAgent(
        engine_params=self.engine_params,
        system_prompt=self.system_prompt
    )
    self.env.start()
    # Build the results-page URL for the configured engine (google/duckduckgo).
    # (The previous f-string prefixes were no-ops: the literals had no placeholders.)
    base = (
        "https://www.google.com/search?q="
        if self.engine == "google"
        else "https://www.duckduckgo.com/?q="
    )
    search_url = base + urllib.parse.quote_plus(query)
    self.task_config["config"][2]["parameters"]["urls_to_open"][0] = search_url
    self.env.reset(task_config=self.task_config)
    print("[Searcher] sleeping...")
    time.sleep(5)
def flush_messages(self):
    """Flush messages based on the model's context limits.
    This method ensures that the agent's message history does not exceed the maximum trajectory length.
    Side Effects:
        - Modifies the messages of generator, reflection, and bon_judge agents to fit within the context limits.
    """
    engine_type = self.engine_params.get("engine_type", "")
    # Flush strategy for long-context models: keep all text, only keep latest images
    if engine_type in ["anthropic", "openai", "gemini"]:
        max_images = self.max_trajectory_length
        for agent in [self.searcher_agent]:
            if agent is None:
                continue
            # keep latest k images
            # @Yang: keep the first main agent image
            # Walk newest-to-oldest, skipping messages 0-1 (the pinned initial
            # context); delete image parts once the image budget is exhausted.
            img_count = 0
            for i in range(len(agent.messages) - 1, 1, -1):
                for j in range(len(agent.messages[i]["content"]) - 1, -1, -1):
                    if "image" in agent.messages[i]["content"][j].get("type", ""):
                        img_count += 1
                        if img_count > max_images:
                            del agent.messages[i]["content"][j]
    # Flush strategy for non-long-context models: drop full turns
    else:
        # generator msgs are alternating [user, assistant], so 2 per round
        if len(self.searcher_agent.messages) > 2 * self.max_trajectory_length + 1:
            # Drop the oldest user+assistant pair (index 0 stays pinned).
            self.searcher_agent.messages.pop(1)
            self.searcher_agent.messages.pop(1)
def assign_screenshot(self, obs):
    """Cache the latest observation so grounding calls can reference it."""
    setattr(self, "obs", obs)
def search(self, query: str, main_obs):
    """Run a budgeted browser-search loop to find a tutorial for the main agent.

    Args:
        query: The search query describing what the main agent needs to learn.
        main_obs: Observation dict from the main agent; its "screenshot" is shown
            to the searcher so the found tutorial matches the current context.

    Returns:
        Dict with the query, completion reason ("DONE" / "FAIL" /
        "BUDGET_EXHAUSTED"), accumulated tutorial notes, the per-step execution
        history, steps executed, the step budget, and the final answer text.
    """
    # only create vm when search is called
    self.reset(query=query)  # reset
    search_result_dir = os.path.join(self.result_dir, f"search_{self._get_search_time()}")
    os.makedirs(search_result_dir, exist_ok=True)
    obs = self.env._get_obs()  # Get the initial observation
    step_idx = 0
    initial_state_text = (
        "This screenshot shows the current visual context of the main GUI Agent you are assisting. "
        "Use this image to understand the application, the current view, and the overall environment. "
        "Your primary goal is to find a tutorial that is highly relevant and well-aligned with this specific context, "
        "ensuring the instructions you find are applicable to what the main agent is currently seeing."
    )
    self.searcher_agent.add_message(
        text_content=initial_state_text,
        image_content=main_obs["screenshot"],
        role="user"
    )
    execution_history = []
    completion_reason = ""
    final_answer = ""
    while step_idx < self.budget:
        # update system_prompt dynamically with the notes gathered so far
        tutorial_notes_str = ""
        if len(self.tutorial_notes) > 0:
            for i, note in enumerate(self.tutorial_notes, 1):
                tutorial_notes_str += f"Tutorial Note {i}: {note}\n\n"
        if step_idx == self.budget - 1:
            # eager mode: on the final budgeted step, force a done/fail decision
            self.system_prompt = PROCEDURAL_MEMORY.construct_searcher_eager_mode_procedural_memory(
                agent_class=type(self)
            ).replace("CURRENT_OS", self.platform).replace("QUERY", query)
        system_prompt = self.system_prompt.replace("TUTORIAL_PLACEHOLDER", tutorial_notes_str)
        self.searcher_agent.add_system_prompt(system_prompt=system_prompt)
        # start a new turn
        self.assign_screenshot(obs=obs)
        generator_message = ""
        self.searcher_agent.add_message(
            generator_message, image_content=obs["screenshot"], role="user"
        )
        format_checkers = []
        # predict action
        plan = call_llm_formatted(
            self.searcher_agent,
            format_checkers,
            # BUG FIX: the key was misspelled "temperture", so a configured
            # temperature was silently ignored and 0.1 was always used.
            temperature=self.engine_params.get("temperature", 0.1),
            use_thinking=self.use_thinking,
        )
        self.searcher_agent.add_message(plan, role="assistant")
        execution_history.append(plan)
        logger.info("SEARCHER PLAN:\n %s", plan)
        plan_code = parse_code_from_string(plan)
        # BUG FIX: initialize coords each iteration. Previously, if plan
        # evaluation failed on the first step, `coords` was unbound (NameError);
        # on later steps a stale value from the previous iteration was reused.
        coords = None
        try:
            assert plan_code, "Plan code should not be empty"
            # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
            exec_code, coords = create_pyautogui_code(self, plan_code, obs)
        except Exception as e:
            logger.error(
                f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
            )
            exec_code = self.wait(
                1.333
            )  # Skip a turn if the code cannot be evaluated
        self.flush_messages()
        # execute action
        action = exec_code
        logger.info("Step %d: %s", step_idx + 1, action)
        # Save screenshot and trajectory information
        with open(os.path.join(search_result_dir, f"step_{step_idx + 1}.png"),
                  "wb") as _f:
            _f.write(obs['screenshot'])
        if coords is not None and isinstance(coords, list):
            draw_coordinates(
                image_bytes=obs['screenshot'],
                coordinates=coords,
                save_path=os.path.join(search_result_dir, f"step_{step_idx + 1}_draw.png")
            )
        with open(os.path.join(search_result_dir, "traj.jsonl"), "a", encoding="utf-8") as f:
            f.write(json.dumps({
                "query": query,
                "step_num": step_idx + 1,
                "action": action,
                "response": {
                    "plan": plan,
                    "plan_code": plan_code,
                    "coordinates": coords
                },
                "screenshot_file": f"step_{step_idx + 1}.png"
            }, ensure_ascii=False))
            f.write("\n")
        with open(os.path.join(search_result_dir, f"traj_{step_idx+1}.json"), "w", encoding="utf-8") as f:
            json.dump({
                "query": query,
                "step_num": step_idx + 1,
                "action": action,
                "response": {
                    "plan": plan,
                    "plan_code": plan_code,
                    "coordinates": coords
                },
                "screenshot_file": f"step_{step_idx + 1}.png"
            }, f, indent=4, ensure_ascii=False)
        if exec_code in ["DONE", "FAIL"]:
            # terminate loop
            completion_reason = exec_code
            final_answer = self.tutorial_or_hint
            break
        else:
            obs, _, _, _ = self.env.step(action, 5)
        step_idx += 1
    if completion_reason == "":
        completion_reason = "BUDGET_EXHAUSTED"
        final_answer = "Sorry, can't get the useful tutorial about the GUI task you provided."
    return {
        "query": query,
        "completion_reason": completion_reason,
        "tutorial_notes": self.tutorial_notes,
        "execution_history": execution_history,
        "steps_executed": step_idx,
        "budget": self.budget,
        "final_answer": final_answer,
    }
@searcher_agent_action
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
):
    """Click on the element

    Args:
        element_description:str, a detailed descriptions of which element to click on. This description should be at least a full sentence.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    # BUG FIX: the "import pyautogui; " prefix was emitted twice (once in the
    # initializer and again inside the f-string); build the command once.
    command = (
        f"import pyautogui; "
        f"pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); "
    )
    # Return pyautogui code to click on the element, plus the grounded coords
    return (command, [x, y])
@searcher_agent_action
def type(
    self,
    element_description: Optional[str] = None,
    text: str = "",
    overwrite: bool = True,
    enter: bool = False
):
    """Type text/unicode into a specific element
    Args:
        element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.
        text:str, the text to type
        overwrite:bool, Default is True, assign it to False if the text should not overwrite the existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.
    """
    # Text is injected via clipboard paste (ctrl+v) rather than keystrokes so
    # unicode input works; xclip/xsel are installed first because pyperclip
    # needs one of them on Linux. The {{proxy_prefix}} braces are escaped here
    # so they are interpolated by the *remote* f-string, not this one.
    # NOTE(review): this runs `sudo apt-get install` on every call and embeds
    # the client password in a shell string — confirm this is intended.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
        "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
        "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
        "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
        f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
    )
    click_coords = None
    # Optionally ground and focus the target element before typing.
    if element_description is not None:
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        click_coords = [x, y]
        commands += f"pyautogui.click({x}, {y});"
    if overwrite:
        # Select-all + backspace clears any existing content first.
        commands += (
            f"pyautogui.hotkey('ctrl', 'a');"
            "pyautogui.press('backspace');"
        )
    # use paste to input; the original clipboard contents are restored afterwards
    commands += (
        "original_clipboard = pyperclip.paste();"
        f"pyperclip.copy({repr(text)});"
        "pyautogui.hotkey('ctrl', 'v');"
        "pyperclip.copy(original_clipboard);"
    )
    if enter:
        commands += "pyautogui.press('enter');"
    # Return coordinates alongside the command only when an element was grounded.
    if click_coords is not None:
        return (commands, click_coords)
    else:
        return commands
@searcher_agent_action
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction

    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    # Hover over the target first, pause briefly, then scroll; `shift` selects
    # horizontal (hscroll) instead of vertical (vscroll) scrolling.
    scroll_call = f"pyautogui.hscroll({clicks})" if shift else f"pyautogui.vscroll({clicks})"
    command = f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); {scroll_call}"
    return (command, [x, y])
@searcher_agent_action
def hotkey(self, keys: List[str]):
    """Press a hotkey combination (can press a single key as well)

    Args:
        keys: List[str], the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # repr() quotes each key safely for the generated snippet (identical output
    # to the previous f"'{key}'" for ordinary keys, but robust to quotes).
    quoted_keys = [repr(key) for key in keys]
    return f"import pyautogui; pyautogui.hotkey({', '.join(quoted_keys)})"
@searcher_agent_action
def save_to_tutorial_notes(self, text: str):
    """Save high quality and useful information to a long-term knowledge bank for reuse during this search task.

    Args:
        text:str, the text to save to the tutorial notes
    """
    self.tutorial_notes.append(text)
    # Saving a note consumes the turn without acting on the environment.
    return "WAIT"
@searcher_agent_action
def wait(self, time: float):
    """Wait for a specified amount of time

    Args:
        time:float the amount of time to wait in seconds
    """
    # `time` here is the float argument; the emitted snippet imports the stdlib
    # time module in the environment where it is executed.
    return f"import time; time.sleep({time})"
@searcher_agent_action
def done(
    self,
    tutorial: str
):
    """End the current task with a success. Use this when you believe the entire task has been fully completed.

    Args:
        tutorial:str, A detailed, step-by-step tutorial compiled from the search results to be passed to the main agent.
    """
    # Stash the compiled tutorial so the search loop can return it as the answer.
    self.tutorial_or_hint = tutorial
    return "DONE"
@searcher_agent_action
def fail(
    self,
    hint: str
):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete.

    Args:
        hint:str, A hint or reason explaining why the search failed, or what kind of information was missing.
    """
    # Stash the failure hint so the search loop can surface it to the caller.
    self.tutorial_or_hint = hint
    return "FAIL"

View File

@@ -0,0 +1,340 @@
from functools import partial
import logging
from typing import Dict, List, Tuple
from mm_agents.os_symphony.agents.memoryer_agent import ReflectionMemoryAgent
from mm_agents.os_symphony.agents.os_aci import OSACI
from mm_agents.os_symphony.core.module import BaseModule
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
from mm_agents.os_symphony.utils.common_utils import (
call_llm_formatted,
extract_coords_from_action_dict,
parse_action_from_string,
parse_code_from_string,
create_pyautogui_code,
)
from mm_agents.os_symphony.utils.formatters import (
SINGLE_ACTION_FORMATTER,
CODE_VALID_FORMATTER,
)
logger = logging.getLogger("desktopenv.agent")
class Worker(BaseModule):
def __init__(
    self,
    engine_params_for_orchestrator: Dict,
    engine_params_for_memoryer: Dict,
    os_aci: OSACI,
    platform: str,
    client_password: str,
    max_trajectory_length: int = 8,
    enable_reflection: bool = True,
):
    """
    Worker receives the main task and generates actions, without the need of hierarchical planning

    Args:
        engine_params_for_orchestrator: Dict
            Parameters for the orchestrator (worker) agent
        engine_params_for_memoryer: Dict
            Parameters for the reflection/memory agent
        os_aci: OSACI
            The grounding agent to use
        platform: str
            OS platform the agent runs on (darwin, linux, windows)
        client_password: str
            Password used for privileged operations on the client VM
        max_trajectory_length: int
            The amount of image turns to keep
        enable_reflection: bool
            Whether to enable reflection
    """
    super().__init__(platform=platform)
    self.client_password = client_password
    self.temperature = engine_params_for_orchestrator.get("temperature", 0.0)
    self.tool_config = engine_params_for_orchestrator.get("tool_config", "")
    # Extended-thinking mode is enabled only for this fixed set of Claude models.
    thinking_models = {
        "claude-opus-4-20250514",
        "claude-sonnet-4-20250514",
        "claude-3-7-sonnet-20250219",
        "claude-sonnet-4-5-20250929",
    }
    self.use_thinking = engine_params_for_orchestrator.get("model", "") in thinking_models
    self.engine_params_for_orchestrator = engine_params_for_orchestrator
    self.engine_params_for_memoryer = engine_params_for_memoryer
    self.os_aci: OSACI = os_aci
    # When the first screenshot is pinned, reserve one image slot for it.
    if self.engine_params_for_orchestrator.get("keep_first_image", False):
        self.max_trajectory_length = max_trajectory_length - 1
    else:
        self.max_trajectory_length = max_trajectory_length
    self.enable_reflection = enable_reflection
    self.reset()
def reset(self):
    """Re-initialize per-task state: system prompt, sub-agents, and histories.

    Side Effects:
        - Rebuilds the orchestrator and reflection/memory agents.
        - Clears turn counters and the worker/coords/action histories.
    """
    # set_cell_values only occurs in linux; meanwhile there is no fail option in the other benchmarks
    if self.platform in ["windows", "macos"]:
        skipped_actions = ["set_cell_values", "fail"]
    else:
        skipped_actions = []
    # Hide code agent action entirely if no env/controller is available
    env = getattr(self.os_aci, "env", None)
    if not env or not getattr(env, "controller", None):
        skipped_actions.append("call_code_agent")
    self.orchestrator_sys_prompt = (
        PROCEDURAL_MEMORY.construct_simple_worker_procedural_memory(
            agent_class=type(self.os_aci),
            skipped_actions=skipped_actions,
            tool_config=self.tool_config,
            platform=self.platform
        )
        .replace("CURRENT_OS", self.platform)
        .replace("CLIENT_PASSWORD", self.client_password)
    )
    # Worker owns an orchestrator agent plus a reflection/memory agent
    self.orchestrator_agent = self._create_agent(
        engine_params=self.engine_params_for_orchestrator,
        system_prompt=self.orchestrator_sys_prompt
    )
    self.memoryer_agent = ReflectionMemoryAgent(self.engine_params_for_memoryer)
    self.instruction = None
    self.turn_count = 0
    self.worker_history = []
    self.coords_history = []
    # Used by loop detection across turns
    self.action_dict_history = []
def flush_messages(self):
    """Trim the orchestrator's message history to fit the model's context limits.

    Long-context engines (anthropic/openai/gemini/vllm) keep all text but retain
    only the most recent ``max_trajectory_length`` images; any other engine drops
    the oldest full [user, assistant] turn instead.

    Side Effects:
        - Mutates ``self.orchestrator_agent.messages`` in place.
    """
    engine_type = self.engine_params_for_orchestrator.get("engine_type", "")
    if engine_type in ["anthropic", "openai", "gemini", "vllm"]:
        keep = self.max_trajectory_length
        for agent in [self.orchestrator_agent]:
            if agent is None:
                continue
            seen = 0
            # With keep_first_image, never visit indices 0/1 so the first image
            # survives; otherwise scan all the way down to index 0.
            stop_idx = 1 if self.engine_params_for_orchestrator.get("keep_first_image", False) else -1
            for msg_idx in range(len(agent.messages) - 1, stop_idx, -1):
                content = agent.messages[msg_idx]["content"]
                for part_idx in range(len(content) - 1, -1, -1):
                    if "image" not in content[part_idx].get("type", ""):
                        continue
                    seen += 1
                    if seen > keep:
                        del content[part_idx]
    else:
        # Messages alternate [user, assistant] (2 per round); drop the oldest
        # turn while preserving the element at index 0.
        history = self.orchestrator_agent.messages
        if len(history) > 2 * self.max_trajectory_length + 1:
            del history[1:3]
def generate_next_action(self, instruction: str, obs: Dict, is_last_step: bool) -> Tuple[Dict, List]:
    """
    Predict the next action(s) based on the current observation.

    Args:
        instruction: The task instruction for this turn.
        obs: Current observation; must contain a "screenshot" entry.
        is_last_step: When True, "eager mode" forces a done/fail decision.

    Returns:
        Tuple of (executor_info dict, [exec_code]) where exec_code is the
        generated pyautogui code string to execute in the environment.
    """
    print("=" * 30, f"Turn {self.turn_count + 1}", "=" * 30)
    print("=" * 10)
    print(instruction)
    print("=" * 10)
    self.os_aci.assign_screenshot(obs)
    self.os_aci.set_task_instruction(instruction)
    generator_message = (
        ""
        if self.turn_count > 0
        else "The initial screen is provided. No action has been taken yet."
    )
    # Load the task into the system prompt
    if is_last_step:
        # Eager mode: must decide done / fail
        prompt_with_instructions = PROCEDURAL_MEMORY.construct_eager_mode_procedural_memory(agent_class=type(self.os_aci)).replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "CURRENT_OS", self.platform
        )
        print(f'Eager Mode Started, Instruction: {prompt_with_instructions}')
        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
        generator_message += "Note: 'EAGER MODE' is enabled. You must determine whether the task is done or fail in this step!!!"
    else:
        tutorials = ""
        for idx, t in enumerate(self.os_aci.tutorials, start=1):
            tutorials += f"### Tutorial {idx}:\n {t}\n"
        prompt_with_instructions = self.orchestrator_sys_prompt.replace(
            "TASK_DESCRIPTION", instruction
        ).replace(
            "TUTORIAL_PLACEHOLDER", tutorials
        )
        self.orchestrator_agent.add_system_prompt(prompt_with_instructions)
    ### Reflection Part
    reflection_info = {}
    if self.enable_reflection:
        # set instruction to memory agent
        self.memoryer_agent.add_instruction(instruction)
        reflection = None
        # Differentiate the operation mode of last step
        last_code_summary = ""
        mode = "gui"
        if (
            hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
        ):
            # If code agent is called last step, we use its execution result as step behavior.
            code_result = self.os_aci.last_code_agent_result
            mode = "code"
            last_code_summary += f"Subtask Instruction: {code_result['task_instruction']}\nSteps Completed: {code_result['steps_executed']}\nCompletion Reason: {code_result['completion_reason']}\nExec Summary: {code_result['summary']}\n"
        if (
            hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
        ):
            mode = "search"
        # Retrieve the reflection on the previous action
        reflection_info = self.memoryer_agent.get_reflection(
            cur_obs=obs,
            # only use the string after "(next action)" in orchestrator's output
            generator_output=parse_action_from_string(self.worker_history[-1]) if self.turn_count != 0 else "",
            coordinates=self.coords_history[-1] if self.turn_count != 0 else [],
            mode=mode,
            code_exec_summary=last_code_summary,
            action_dict=self.action_dict_history[-1] if self.turn_count != 0 else {}
        )
        reflection = reflection_info['reflection']
        logger.info(f'[Reflection]: {reflection}')
        if reflection:
            generator_message += f"REFLECTION: You MUST use this reflection on the latest action:\n{reflection}\n"
        else:
            generator_message += "You should go on with your plan.\n"
    else:
        generator_message += "You should go on with your plan.\n"
    # Add code agent result from previous step if available (from full task or subtask execution)
    if (
        hasattr(self.os_aci, "last_code_agent_result")
        and self.os_aci.last_code_agent_result is not None
    ):
        code_result = self.os_aci.last_code_agent_result
        generator_message += f"\nCODE AGENT RESULT:\n"
        generator_message += (
            f"Task/Subtask Instruction: {code_result['task_instruction']}\n"
        )
        generator_message += f"Steps Completed: {code_result['steps_executed']}\n"
        generator_message += f"Max Steps: {code_result['budget']}\n"
        generator_message += (
            f"Completion Reason: {code_result['completion_reason']}\n"
        )
        generator_message += f"Summary: {code_result['summary']}\n"
        generator_message += "\n"
        # Reset the code agent result after adding it to context
        self.os_aci.last_code_agent_result = None
    if (
        hasattr(self.os_aci, "last_search_agent_result")
        and self.os_aci.last_search_agent_result is not None
    ):
        # Retrieve the result dictionary
        search_result = self.os_aci.last_search_agent_result
        # Add a clear, distinct header for this section in the prompt
        generator_message += f"\nSEARCH AGENT RESULT:\n"
        # Add contextual metadata from the search task
        generator_message += f"Search Query: {search_result['query']}\n"
        generator_message += f"Search Completion Reason: {search_result['completion_reason']}\n"
        generator_message += "Search Result: "
        # Add the most important part: the tutorial found by the agent.
        # This is given a prominent sub-header so the LLM knows to pay close attention.
        if search_result["completion_reason"] == "DONE":
            generator_message += f'Search is completed, the tutorial it found has been already added to your system prompt.\n'
        elif search_result["completion_reason"] == "FAIL":
            generator_message += f"Search is fail, the failure reason or the hint is as follow: {search_result['final_answer']}\n"
        # CRITICAL: Reset the search agent result after adding it to the context.
        # This prevents it from being added to the prompt again in the next turn.
        self.os_aci.last_search_agent_result = None
    # Finalize the generator message
    self.orchestrator_agent.add_message(
        generator_message, image_content=obs["screenshot"], role="user", put_text_last=True
    )
    # Generate the plan and next action
    format_checkers = [
        SINGLE_ACTION_FORMATTER,
        partial(CODE_VALID_FORMATTER, self.tool_config),
    ]
    plan = call_llm_formatted(
        self.orchestrator_agent,
        format_checkers,
        # BUG FIX: the key was misspelled "temperture", so the configured
        # temperature was silently ignored and the 0.1 default always used
        # (note __init__ reads the correctly spelled "temperature" key).
        temperature=self.engine_params_for_orchestrator.get("temperature", 0.1),
        use_thinking=self.use_thinking,
    )
    self.worker_history.append(plan)
    self.orchestrator_agent.add_message(plan, role="assistant")
    logger.info("PLAN:\n %s", plan)
    # Extract the next action from the plan.
    # At this point plan_code looks like e.g. agent.click('xxxxx', 1)
    plan_code = parse_code_from_string(plan)
    action_dict, coordinates = None, None
    try:
        assert plan_code, "Plan code should not be empty"
        # exec_code e.g. import pyautogui; pyautogui.click(1, 2);
        exec_code, action_dict = create_pyautogui_code(self.os_aci, plan_code, obs)
        coordinates = extract_coords_from_action_dict(action_dict)
    except Exception as e:
        logger.error(
            f"Could not evaluate the following plan code:\n{plan_code}\nError: {e}"
        )
        exec_code, action_dict = self.os_aci.wait(
            1.333
        )  # Skip a turn if the code cannot be evaluated
    self.action_dict_history.append(action_dict)
    executor_info = {
        # NOTE(review): self.instruction is set to None in reset() and never
        # reassigned in this method — confirm whether a caller sets it.
        "refined_instruction": self.instruction,
        "plan": plan,
        "plan_code": plan_code,
        "exec_code": exec_code,
        "coordinates": coordinates,
        "reflection": reflection_info,
        "code_agent_output": (
            self.os_aci.last_code_agent_result
            if hasattr(self.os_aci, "last_code_agent_result")
            and self.os_aci.last_code_agent_result is not None
            else None
        ),
        "search_agent_output": (
            self.os_aci.last_search_agent_result
            if hasattr(self.os_aci, "last_search_agent_result")
            and self.os_aci.last_search_agent_result is not None
            else None
        )
    }
    self.turn_count += 1
    self.coords_history.append(coordinates)
    self.flush_messages()
    return executor_info, [exec_code]