Wxy/opencua (#260)

* OpenCUA Agent code base

* update url

* debug, modify url input

* debug opencua

* show result

* debug agent history overlap

* modify opencua agent; add comment lines
Xinyuan Wang committed 2025-07-16 17:53:12 +08:00 (via GitHub)
parent 5e5058c1f2
commit 0f2655249c
4 changed files with 497 additions and 197 deletions


@@ -1,38 +1,45 @@
"""
OpenCUA Agent Implementation

This module implements an OpenCUA agent for desktop automation tasks, building upon
existing frameworks and integrating multiple coordinate mapping systems.

Framework and Implementation Sources:
- Main framework structure follows: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/agent.py
- Agent implementation adapted from: https://github.com/xlang-ai/OSWorld/blob/main/mm_agents/aguvis_agent.py
- Qwen2.5-VL coordinate mapping from: https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
"""
import re
import os
import ast
import time
import json
import math
import copy
import httpx
import base64
import backoff
import logging  # needed by reset(), which falls back to logging.getLogger when no logger is passed in
from io import BytesIO
from loguru import logger
from PIL import Image
from typing import Dict, List, Tuple, Optional
AGNET_SYS_PROMPT_L1 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}""".strip()
AGNET_SYS_PROMPT_L1 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L2 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}".strip()
AGNET_SYS_PROMPT_L3 = "You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.\n\nFor each step, provide your response in this format:\n\nObservation:\n - Describe the current computer state based on the full screenshot in detail. \n - Application Context:\n - The active application\n - The active window or page\n - Overall layout and visible interface\n - Key Elements:\n - Menu items and toolbars \n - Buttons and controls\n - Text fields and content\n - Dialog boxes or popups\n - Error messages or notifications\n - Loading states\n - Other key elements\n - Describe any content, elements, options, information or clues that are possibly relevant to achieving the task goal, including their name, content, or shape (if possible).\n\nThought:\n - Step by Step Progress Assessment:\n - Analyze completed task parts and their contribution to the overall goal\n - Reflect on potential errors, unexpected results, or obstacles\n - If previous action was incorrect, predict a logical recovery step\n - Next Action Analysis:\n - List possible next actions based on current state\n - Evaluate options considering current state and previous actions\n - Propose most logical next action\n - Anticipate consequences of the proposed action\n - For Text Input Actions:\n - Note current cursor position\n - Consolidate repetitive actions (specify count for multiple keypresses)\n - Describe expected final text outcome\n - Use first-person perspective in reasoning\n\nAction:\n Provide clear, concise, and actionable instructions:\n - If the action involves interacting with a specific target:\n - Describe target explicitly without using coordinates\n - Specify element names when possible (use original language if non-English)\n - Describe features (shape, color, position) if name unavailable\n - For window control buttons, identify correctly (minimize \"\", maximize \"\", close \"X\")\n - if the action involves keyboard actions like 'press', 'write', 'hotkey':\n - Consolidate repetitive keypresses with count\n - Specify expected text outcome for typing actions\n\nFinally, output the action as PyAutoGUI code or the following functions:\n- {\"name\": \"computer.triple_click\", \"description\": \"Triple click on the screen\", \"parameters\": {\"type\": \"object\", \"properties\": {\"x\": {\"type\": \"number\", \"description\": \"The x coordinate of the triple click\"}, \"y\": {\"type\": \"number\", \"description\": \"The y coordinate of the triple click\"}}, \"required\": [\"x\", \"y\"]}}\n- {\"name\": \"computer.terminate\", \"description\": \"Terminate the current task and report its completion status\", \"parameters\": {\"type\": \"object\", \"properties\": {\"status\": {\"type\": \"string\", \"enum\": [\"success\", \"fail\"], \"description\": \"The status of the task\"}}, \"required\": [\"status\"]}}\n".strip()
AGNET_SYS_PROMPT_L0 = """You are a GUI agent. You are given a task and a screenshot of the screen. You need to perform a series of pyautogui actions to complete the task.
For each step, output the action as PyAutoGUI code or the following functions:
- {"name": "computer.triple_click", "description": "Triple click on the screen", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "The x coordinate of the triple click"}, "y": {"type": "number", "description": "The y coordinate of the triple click"}}, "required": ["x", "y"]}}
- {"name": "computer.terminate", "description": "Terminate the current task and report its completion status", "parameters": {"type": "object", "properties": {"status": {"type": "string", "enum": ["success", "failure"], "description": "The status of the task"}}, "required": ["status"]}}
""".strip()
STEP_TEMPLATE = "# Step {step_num}:\n"
INSTRUTION_TEMPLATE = "# Task Instruction:\n{instruction}\n\nPlease generate the next move according to the screenshot, task instruction and previous steps (if provided).\n"
STEP_TEMPLATE = "# Step {step_num}:\n"
ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
THOUGHT_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n"
OBSERVATION_HISTORY_TEMPLATE = "## Observation:\n{observation}\n\n## Thought:\n{thought}\n\n## Action:\n{action}\n"
DETAIL_HISTORY_TEMPLATE = "## Thought:\n{thought}\n\n## Action:\n{action}\n\n## Code:\n{code}\n"
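# Illustrative rendering (values are made up, not from the repo): with history_type="thought_history",
# step 2 of the history would be formatted by STEP_TEMPLATE + THOUGHT_HISTORY_TEMPLATE as:
#
#   # Step 2:
#   ## Thought:
#   The Files app is open, so the next step is to create the new folder.
#
#   ## Action:
#   Right-click an empty area of the file list.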
# Function to encode the image
def encode_image(image_content):
"""Encode the image to base64"""
return base64.b64encode(image_content).decode('utf-8')
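# Illustrative usage (hypothetical path and message shape, not part of the original file): the agent
# encodes raw screenshot bytes with encode_image and embeds them as a base64 image_url item, roughly:
#
#   with open("screenshot.png", "rb") as f:
#       b64 = encode_image(f.read())
#   image_item = {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64}"}}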
def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type) -> Tuple[str, List[str], dict]:
@@ -40,57 +47,61 @@ def parse_response_to_cot_and_action(input_string, screen_size, coordinate_type)
try:
sections = {}
if "computer.terminate" in input_string.lower():
code_blocks = re.findall(r'```(?:code)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
if code_blocks:
last_code = code_blocks[-1].strip().lower()
if "fail" in last_code:
return "FAIL", ["FAIL"], {}
elif "success" in last_code:
return "DONE", ["DONE"], {}
return "DONE", ["DONE"], {}
obs_match = re.search(r'^##\s*Observation\s*:?[\n\r]+(.*?)(?=^##\s*Thought:|^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if obs_match:
sections['observation'] = obs_match.group(1).strip()
# logger.warning(f"Extracted Observation: {sections.get('observation', 'None')}")
thought_match = re.search(r'^##\s*Thought\s*:?[\n\r]+(.*?)(?=^##\s*Action:|^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if thought_match:
sections['thought'] = thought_match.group(1).strip()
# logger.warning(f"Extracted Thought: {sections.get('thought', 'None')}")
action_match = re.search(r'^##\s*Action\s*:?[\n\r]+(.*?)(?=^##|\Z)', input_string, re.DOTALL | re.MULTILINE)
if action_match:
action = action_match.group(1).strip()
sections['action'] = action.strip()
# logger.warning(f"Extracted Action: {sections.get('action', 'None')}")
if "computer.terminate" in input_string.lower():
# Look for code blocks that might contain terminate command
code_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', input_string, re.DOTALL | re.IGNORECASE)
if code_blocks:
last_code = code_blocks[-1].strip().lower()
if "fail" in last_code:
sections['code'] = "FAIL"
return "FAIL", ["FAIL"], sections
elif "success" in last_code:
sections['code'] = "DONE"
return "DONE", ["DONE"], sections
# Default to DONE if terminate is mentioned but no specific status
sections['code'] = "DONE"
return "DONE", ["DONE"], sections
code_blocks = re.findall(r'```(?:python)\s*(.*?)\s*```', input_string, re.DOTALL)
if code_blocks:
code = code_blocks[-1].strip()
sections['original_code'] = transform_agnet_action_to_code_block(code)
corrected_code = correct_pyautogui_arguments(code)
sections['code'] = project_coordinate_to_absolute_scale(corrected_code, screen_width=screen_size[0], screen_height=screen_size[1], coordinate_type=coordinate_type)
# logger.warning(f"Extracted Code: {sections.get('code', 'None')}")
else:
# No code blocks found
sections['code'] = "WAIT"
return "WAIT", ["WAIT"], sections
if 'code' not in sections:
logger.error("Missing required action or code section")
return None, None, {}
if 'action' not in sections:
sections['action'] = ""
return sections['action'], [sections['code']], sections
except Exception as e:
logger.exception(f"Error parsing response: {str(e)}\nInput string: {input_string}")
return None, None, {}
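# Illustrative example (assumed model output, not from the repo). A response such as:
#
#   ## Thought:
#   The browser is focused, so I can type the URL now.
#
#   ## Action:
#   Click the address bar at the top of the window.
#
#   ```python
#   pyautogui.click(x=0.5, y=0.06)
#   ```
#
# would be parsed into the action text, a single-element list holding the pyautogui command with the
# relative (0.5, 0.06) projected to roughly (960, 65) pixels on a 1920x1080 screen, and a `sections`
# dict keeping the raw thought/action/code pieces for the step history.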
def correct_pyautogui_arguments(code: str) -> str:
"""Correct the pyautogui arguments"""
function_corrections = {
'write': {
'incorrect_args': ['text', 'content'],
@@ -154,6 +165,7 @@ def correct_pyautogui_arguments(code: str) -> str:
return corrected_code
def split_args(args_str: str) -> List[str]:
"""Split the arguments string into a list of arguments"""
args = []
current_arg = ''
within_string = False
@@ -185,13 +197,15 @@ def smart_resize(
max_aspect_ratio_allowed: Optional[float] = None,
size_can_be_smaller_than_factor: bool = False,
):
"""Rescales the image so that the following conditions are met:
"""
The function is modified from https://github.com/QwenLM/Qwen2.5-VL/blob/main/qwen-vl-utils/src/qwen_vl_utils/vision_process.py
1. Both dimensions (height and width) are divisible by 'factor'.
Qwen2.5-VL based model need this function to resize screenshots.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
Rescales the image so that the following conditions are met:
1. Both dimensions (height and width) are divisible by 'factor'.
2. The total number of pixels is within the range ['min_pixels', 'max_pixels'].
3. The aspect ratio of the image is maintained as closely as possible.
"""
if not size_can_be_smaller_than_factor and (height < factor or width < factor):
@@ -218,39 +232,29 @@ def smart_resize(
return h_bar, w_bar
def _coordinate_projection(x, y, screen_width, screen_height, coordinate_type):
if coordinate_type == "relative":
"""Project the coordinates to the absolute scale"""
if coordinate_type == "relative":
return int(round(x * screen_width)), int(round(y * screen_height))
elif coordinate_type == "absolute":
return x, y
elif coordinate_type == "qwen25":
if 0 <= x <= 1 and 0 <= y <= 1:
# If already normalized, treat like "relative"
return int(round(x * screen_width)), int(round(y * screen_height))
elif coordinate_type == "absolute":
return x, y
elif coordinate_type == "qwen25":
if 0 <= x <= 1 and 0 <= y <= 1:
# If already normalized, treat like "relative"
return int(round(x * screen_width)), int(round(y * screen_height))
height, width = smart_resize(
height=screen_height,
width=screen_width,
factor=28,
min_pixels=3136,
max_pixels=12845056
)
return int(x / width * screen_width), int(y / height * screen_height)
elif coordinate_type == "relative1000":
if screen_width == 0 or screen_height == 0:
raise ValueError("Screen width and height must be greater than zero for relative1000 coordinates.")
x_abs = int(round(x * screen_width / 1000))
y_abs = int(round(y * screen_height / 1000))
return x_abs, y_abs
else:
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
height, width = smart_resize(
height=screen_height,
width=screen_width,
factor=28,
min_pixels=3136,
max_pixels=12845056 # We use this max_pixels setting in our training data
)
return int(x / width * screen_width), int(y / height * screen_height)
else:
raise ValueError(f"Unsupported coordinate type: {coordinate_type}")
def project_coordinate_to_absolute_scale(pyautogui_code_relative_coordinates, screen_width, screen_height, coordinate_type="relative"):
"""
Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size.
"""
import re
import ast
"""Convert the relative coordinates in the pyautogui code to absolute coordinates based on the logical screen size."""
if coordinate_type not in ["relative", "relative1000", "absolute", "qwen25"]:
raise ValueError(f"Invalid coordinate type: {coordinate_type}. Expected one of ['relative', 'relative1000', 'absolute', 'qwen25'].")
@@ -426,8 +430,7 @@ def update_code_with_new_coordinates(code, updated_positions):
Returns:
str: The updated Python code.
"""
# TODO: the matching logics in 'update_code_with_new_coordinates'
# and 'extract_positions_and_instructions' are not exactly the same
lines = code.splitlines()
updated_code_lines = []
position_index = 0 # Tracks which position update to use
@@ -463,36 +466,51 @@ def update_code_with_new_coordinates(code, updated_positions):
return "\n".join(updated_code_lines)
def transform_agnet_action_to_code_block(action):
"""Transform the agent action to a code block: not used in agent, for logging only"""
if "computer.terminate" in action or "browser.select_option" in action or "browser.clear" in action:
return f"```code\n{action}\n```"
else:
return f"```python\n{action}\n```"
class OpenCUAAgent:
"""
OpenCUA Agent for desktop automation tasks.
This class implements an OpenCUA-model-based agent that can observe
desktop environments through screenshots and execute mouse/keyboard actions
via PyAutoGUI to complete automation tasks.
Attributes:
model (str): Name of the language model being used
history_type (str): Type of history recording mechanism
actions (list): History of executed actions
observations (list): History of environment observations
cots (list): Chain of thought reasoning records
"""
def __init__(
self,
model: str, # OpenCUA model name
history_type: str, # History step type: action_history, thought_history, observation_history
max_image_history_length: int = 3, # The max number of images in the history
platform: str = "ubuntu", # The platform of the computer
max_tokens: int = 1500, # The max number of tokens in the response
top_p: float = 0.9, # The top p value in the response
temperature: float = 0, # The temperature value in the response
action_space: str = "pyautogui", # The action space: pyautogui
observation_type: str = "screenshot", # The observation type: screenshot
cot_level: str = "l2", # The CoT level: l1, l2, l3
screen_size: Tuple[int, int] = (1920, 1080), # The screen size
coordinate_type: str = "relative", # The coordinate type: relative, absolute, qwen25
**kwargs
):
assert coordinate_type in ["relative", "absolute", "qwen25"]
assert action_space in ["pyautogui"], "Invalid action space"
assert observation_type in ["screenshot"], "Invalid observation type"
assert history_type in ["action_history", "thought_history", "observation_history"]
assert model is not None, "Model cannot be None"
self.model = model
self.platform = platform
self.max_tokens = max_tokens
self.top_p = top_p
self.temperature = temperature
@@ -500,19 +518,9 @@ class OpenCUAAgent:
self.observation_type = observation_type
self.history_type = history_type
self.coordinate_type = coordinate_type
assert coordinate_type in ["relative", "relative1000", "absolute", "qwen25"]
assert action_space in ["pyautogui"], "Invalid action space"
assert observation_type in ["screenshot"], "Invalid observation type"
assert history_type in ["action_history", "thought_history", "observation_history"]
self.actions = []
self.observations = []
self.cots = []
self.cot_level = cot_level
self.screen_size = screen_size
self.max_image_history_length = max_image_history_length
if history_type == "action_history":
self.HISTORY_TEMPLATE = ACTION_HISTORY_TEMPLATE
@@ -522,15 +530,27 @@ class OpenCUAAgent:
self.HISTORY_TEMPLATE = OBSERVATION_HISTORY_TEMPLATE
else:
raise ValueError(f"Invalid history type: {history_type}")
if cot_level == "l3":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L3
elif cot_level == "l2":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L2
elif cot_level == "l1":
self.SYSTEM_PROMPT = AGNET_SYS_PROMPT_L1
else:
raise ValueError(f"Invalid COT level: {cot_level}")
self.actions = []
self.observations = []
self.cots = []
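# Example construction (illustrative; the model name is a placeholder, not a recommendation):
#
#   agent = OpenCUAAgent(
#       model="opencua-7b",
#       history_type="thought_history",
#       max_image_history_length=3,
#       cot_level="l2",
#       coordinate_type="qwen25",
#       screen_size=(1920, 1080),
#   )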
def reset(self, _logger=None):
global logger
logger = _logger if _logger is not None else logging.getLogger("desktopenv.agent")
self.observations = []
self.thoughts = []
self.cots = []
self.actions = []
self.image_summaries = []
def _scale_scroll_for_windows(self, code: str, factor: int = 50) -> str:
""" pyautogui.scroll has a different scale on Ubuntu and Windows, multiple 'factor' when scrolling on Windows system"""
@@ -541,7 +561,7 @@ class OpenCUAAgent:
code = pattern_pos.sub(lambda m: f"{m.group(1)}{int(m.group(2))*factor})", code)
return code
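# Illustrative effect (assuming the positive-scroll pattern above): on Windows, a generated
# "pyautogui.scroll(3)" is rewritten to "pyautogui.scroll(150)" with the default factor of 50,
# since pyautogui scroll amounts map to much smaller increments on Windows than on Ubuntu.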
def predict(self, instruction: str, obs: Dict, **kwargs) -> Tuple[str, List[str], Dict]:
"""
Predict the next action(s) based on the current observation.
"""
@@ -557,31 +577,10 @@ class OpenCUAAgent:
print("Logical screen size", self.screen_size)
messages = []
if self.cot_level == "l3":
messages.append({
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L3
"content": self.SYSTEM_PROMPT
})
elif self.cot_level == "l2":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L2
})
elif self.cot_level == "l1":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L1
})
elif self.cot_level == "l0":
messages.append({
"role": "system",
"content": AGNET_SYS_PROMPT_L0
})
else:
raise ValueError(f"Invalid COT level: {self.cot_level}")
instruction_prompt = INSTRUTION_TEMPLATE.format(instruction=instruction)
history_step_texts = []
for i in range(len(self.actions)):
@@ -596,19 +595,11 @@ class OpenCUAAgent:
]
})
history_content = STEP_TEMPLATE.format(step_num=i+1) + self.HISTORY_TEMPLATE.format(
observation=self.cots[i].get('observation'),
thought=self.cots[i].get('thought'),
action=self.cots[i]['action']
)
messages.append({
"role": "assistant",
@@ -636,26 +627,11 @@ class OpenCUAAgent:
},
{
"type": "text",
"text": instruction_prompt
"text": INSTRUTION_TEMPLATE.format(instruction=instruction)
}
]
})
response = self.call_llm({
"model": self.model,
"messages": messages,
@@ -667,7 +643,7 @@ class OpenCUAAgent:
logger.info(f"Model Output: \n\n{response}")
if not response:
logger.error("No response found in the response.")
return "ERROR", [], {}
low_level_instruction, pyautogui_actions, other_cot = parse_response_to_cot_and_action(response, self.screen_size, self.coordinate_type)
if not pyautogui_actions:
@@ -683,13 +659,34 @@ class OpenCUAAgent:
logger.info(f"Parsed pyautogui Action: \n{pyautogui_actions}")
self.actions.append(low_level_instruction)
if 'action' not in other_cot or not other_cot['action'] or 'thought' not in other_cot or not other_cot['thought']:
logger.error("Error! no action/thought in cot")
logger.error(f"response: {response}")
logger.error(f"cot: {other_cot}")
self.cots.append(other_cot)
# Print message structure if needed
logger.info(f"\nInstruction: {instruction}")
messages_to_print = []
current_image = 1
for msg in messages:
msg_copy = copy.deepcopy(msg)
if isinstance(msg_copy['content'], list):
for content in msg_copy['content']:
if content['type'] == 'image_url':
content['image_url']['url'] = f'Image {current_image}'
current_image += 1
messages_to_print.append(msg_copy)
messages_to_print.append({
"new_step_cot": other_cot,
"response": response
})
logger.info(json.dumps(messages_to_print, indent=2))
return response, pyautogui_actions, {}
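# Sketch of a driving loop (illustrative; `env`, its step() API, and the obs layout are assumptions,
# not part of this file — the obs dict is assumed to carry raw screenshot bytes under "screenshot"):
#
#   obs = {"screenshot": screenshot_bytes}
#   response, actions, _ = agent.predict("Open a terminal window", obs)
#   for action in actions:   # e.g. ["pyautogui.click(960, 540)"], or ["DONE"] / ["FAIL"] / ["WAIT"]
#       if action in ("DONE", "FAIL", "WAIT"):
#           break
#       env.step(action)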
@backoff.on_exception(
backoff.constant,
# add more model exceptions here as needed,
@@ -703,6 +700,7 @@ class OpenCUAAgent:
max_tries=10
)
def call_llm(self, payload, model):
"""Call the LLM API"""
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ['OPENCUA_API_KEY']}"