uipath v2 (#413)

* submission v2

* small updates
alexandruilie7
2026-01-09 02:47:20 +02:00
committed by GitHub
parent 5ef8bdfa35
commit 5463d3bb89
11 changed files with 643 additions and 425 deletions

View File

@@ -1,5 +1,13 @@
# UiPath Screen Agent
### 23 Dec 2025
- Updated the planner model to [Claude 4.5 Opus](https://www.anthropic.com/news/claude-opus-4-5)
- Updated the grounder model to an internally finetuned version of [Qwen3-VL](https://github.com/QwenLM/Qwen3-VL) and allowed it to predict "refusal" (similar to OSWorld-G) for elements that do not exist
- Added memory for storing relevant information across steps (an example memory update is shown after this list)
- Improved utilization of the UI element detector for fine-grained details (such as cell corners)
- Refactoring and various small fixes
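For illustration, a memory update emitted by the planner is a list of JSON operations; the example below follows the memory prompt introduced in this commit:

```json
[
  {
    "type": "store_info",
    "info_type": "queue_elements",
    "key": "scripts_to_be_executed",
    "value": "[script.py, script2.py, script3.py]",
    "description": "List of scripts that need to be executed as part of the task"
  }
]
```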
### 18 Sep 2025
We propose a simple yet effective implementation of a Computer Use Agent, which achieves a performance of **53.6%** on the **OSWorld** benchmark with 50 steps, demonstrating competitive results with a relatively lightweight setup and UI-only actions.
Our system builds upon recent approaches in agentic computer use and follows the literature in adopting a two-stage architecture that separates high-level reasoning from low-level execution. Specifically, the system is composed of:
@@ -32,7 +40,7 @@ The interaction history is structured as a conversation: the user reports the ta
By combining the current state with this structured history, the Action Planner generates context-aware, informed predictions at every step: it can reconstruct the sequence of actions that led to the current point, notice any failures, and plan the subsequent steps.
We support a concise set of actions for interacting with the environment, focusing specifically on UI-related activities:
- Click (left, right, double click)
- Click (left, right, double, triple click)
- Type
- Scroll
- Drag
@@ -68,4 +76,3 @@ This process gives the model multiple opportunities to predict within a relevant
## Conclusion
Our method offers a clean and simple yet competitive pipeline for Computer Use tasks. It is cost-effective, minimizing token usage during planning, avoiding parallel planning and reliance on numerous past images, and incorporating only **direct UI actions** with refined grounding to improve accuracy. With this approach, we achieve **53.6%** accuracy on OSWorld with a 50-step horizon.
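For orientation, here is a minimal sketch of the two-stage loop described above (illustrative only; the function and parameter names are placeholders that mirror, but simplify, the planner and grounder classes changed in this commit):

```python
# Illustrative sketch of the planner -> grounder -> action loop (not the actual implementation).
async def run_step(planner, grounder, state, execution_state, ui_action_types):
    # Stage 1: the Action Planner reasons over the task, history, memory and current screenshot.
    planner_output = await planner.predict(state, execution_state)
    plan_action = planner_output.plan_action

    grounding = None
    if plan_action.action_type in ui_action_types:  # click, scroll, drag, ...
        # Stage 2: the grounder resolves the textual element description to screen coordinates.
        grounding = await grounder.predict(
            state.image_base64, plan_action.description, action=plan_action.action_type
        )

    # The (plan, coordinates) pair is then wrapped into an executable computer-use action.
    return plan_action, grounding
```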

View File

@@ -1,7 +1,9 @@
import datetime
import json
from collections import OrderedDict
import time
from collections import OrderedDict
from copy import deepcopy
import mm_agents.uipath.llm_client as llm_client
from mm_agents.uipath.types_utils import (
PlanAction,
@@ -11,43 +13,54 @@ from mm_agents.uipath.types_utils import (
)
from mm_agents.uipath.action_planner_prompt_builder import (
ComputerUseAgentInterface,
PlanerCoTSections,
user_command_template,
PlanerCoTSectionsType,
user_command_template_chat,
user_task_info_template,
PlannerOutput,
)
from mm_agents.uipath.utils import ValidationException, parse_message_json
from mm_agents.uipath.utils import ValidationException, parse_message_json, ExecutionInfo
from mm_agents.uipath.memory import ShortTermMemoryManager
class PlannerOutput(object):
def __init__(self, plan_action: PlanAction, additional_sections: dict[str, str]):
self.plan_action = plan_action
self.thought = additional_sections["thought"]
self.review = additional_sections["review"]
self.additional_sections = {key: value for key, value in additional_sections.items() if key not in ["review", "thought"]}
class ActionPlanner(object):
def __init__(self):
self.number_history_steps_with_images = 2
self.computer_use_agent_interface = ComputerUseAgentInterface()
self.short_term_memory_manager = ShortTermMemoryManager()
def build_message_output_format_info(self) -> str:
output_dict = OrderedDict({})
for _, value in PlanerCoTSections.items():
cot_sections: dict[str, dict] = self.computer_use_agent_interface.get_planner_cot_sections()
for _, value in cot_sections.items():
display = value["display"]
description = value["description"]
output_dict[display] = description
output_dict["action"] = (
"<The action to perform in JSON format as specified in the system message>"
)
output_dict["action"] = "<The action to perform in JSON format as specified in the system message>"
return json.dumps(output_dict, indent=4, ensure_ascii=False)
def get_step_content(
self, step: dict, following_step: dict | None
) -> tuple[str, str]:
def get_step_content(self, step: dict, following_step: dict | None) -> tuple[str, str]:
content_dict = OrderedDict({})
observation_dict = OrderedDict({})
observation_dict["Performed actions"] = step["actions"]
observation_dict["Performed actions"] = deepcopy(step["actions"])
if (
"extracted_data" in step["additional_parameters"]
): # if the step was an extraction step add the dummy extraction action
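# Helper to strip executor metadata (ids, raw results, detections) from the copied actions so the planner history stays compact.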
def remove_unused_fields(action: list[dict], keys: list[str]):
for act in action:
for key in keys:
if key in act:
del act[key]
remove_unused_fields(observation_dict["Performed actions"], ["id", "result", "execution_error_message", "detected_items", "description"])
if "extracted_data" in step["additional_parameters"]: # if the step was an extraction step add the dummy extraction action
extraction_action = {
"type": PlanActionType.ExtractData,
"description": step["description"],
@@ -56,24 +69,22 @@ class ActionPlanner(object):
observation_dict["Performed actions"] = [extraction_action]
if following_step:
observation_dict["Observation"] = following_step[
"additional_parameters"
].get("review", None)
observation_dict["Observation"] = following_step["additional_parameters"].get("review", None)
for key, value in PlanerCoTSections.items():
if key != "review":
cot_sections = self.computer_use_agent_interface.get_planner_cot_sections()
for key, value in cot_sections.items():
if key not in [PlanerCoTSectionsType.Review, PlanerCoTSectionsType.Memory]:
param_value = step["additional_parameters"].get(key, None)
display_name = value["display"]
content_dict[display_name] = param_value
content_dict["actions"] = json.loads(
step["additional_parameters"]["plan_action"]
)
content_dict["action"] = json.loads(step["additional_parameters"]["plan_action"])
content_dict = json.dumps(content_dict, indent=4, ensure_ascii=False)
observation_dict = json.dumps(observation_dict, indent=4, ensure_ascii=False)
return content_dict, observation_dict
def build_messages_chat(self, state: State, execution_info: dict) -> list[dict]:
def build_messages_chat(self, state: State, execution_state: ExecutionState) -> list[dict]:
execution_info = execution_state.execution_info
messages = []
system_message = {
"role": "system",
@@ -82,42 +93,45 @@ class ActionPlanner(object):
messages.append(system_message)
start_index = max(0, len(state.previous_steps) - self.number_history_steps_with_images)
end_index = len(state.previous_steps)
images_dict = {index: state.previous_steps[index]["image"] for index in range(start_index, end_index)}
# Don't set it for the first iteration as the history is empty anyway
user_messages = state.task
if end_index == 0:
user_task_with_ref_imgs = ""
user_messages = [{"type": "text", "text": state.task}]
else:
user_task_with_ref_imgs = state.task
user_messages = [{"type": "text", "text": "Recall the task again:"}, {"type": "text", "text": state.task}]
user_task_info_message = {
"role": "user",
"content": [
{
"type": "text",
"text": user_task_info_template.format(
task=state.task,
task=user_task_with_ref_imgs,
current_date=datetime.datetime.now().strftime("%Y-%m-%d"),
),
}
],
}
messages.append(user_task_info_message)
start_index = max(
0, len(state.previous_steps) - self.number_history_steps_with_images
)
end_index = len(state.previous_steps)
for index in range(0, end_index):
step = state.previous_steps[index]
if index >= start_index:
assert step["image"] is not None and len(step["image"]) > 0, (
"Step image is empty"
)
image = images_dict.get(index, None)
assert image is not None and len(image) > 0, "Step image is empty"
user_image_message = {
"role": "user",
"content": [
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{step['image']}"
},
},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{image}"}},
],
}
messages.append(user_image_message)
@@ -148,79 +162,98 @@ class ActionPlanner(object):
}
messages.append(user_message_reply)
memory = json.loads(state.previous_steps[-1]["additional_parameters"].get("memory", "{}")) if len(state.previous_steps) > 0 else {}
memory_str = json.dumps(memory, indent=4, ensure_ascii=False) if len(memory) > 0 else "No memory."
last_user_message = {
"role": "user",
"content": [
"content": user_messages
+ [
{
"type": "text",
"text": "Current screenshot:",
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{state.image_base64}"
},
},
{"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{state.image_base64}"}},
{
"type": "text",
"text": user_command_template.format(
task=state.task,
execution_info_message=self.build_execution_info_message(
execution_info
),
"text": user_command_template_chat.format(
execution_info_message=self.build_execution_info_message(execution_info),
json_output_format=self.build_message_output_format_info(),
memory=memory_str,
),
},
],
}
messages.append(last_user_message)
for raw_response in execution_info.responses:
if raw_response.grounding_error is not None:
ai_message = {
"role": "assistant",
"content": [
{
"type": "text",
"text": raw_response.raw_planning_prediction,
}
],
}
messages.append(ai_message)
user_message = {
"role": "user",
"content": [
{
"type": "text",
"text": f"Grounder model error detected. Could not identify the element with description: '{raw_response.grounding_error.element_description}', error {raw_response.grounding_error.message}. Possible reasons:the description is not precise enough for the grounder or the element is not visible on the screenshot. If providing a new description does not work, try to complete the action through another path than using that specific button (either by changing the element to be clicked or providing another action such as a hotkey if any exist).",
}
],
}
messages.append(user_message)
return messages
def extract_response(
self, response_content: str
) -> tuple[PlanAction, dict[str, str]]:
cot_sections_lst = list(PlanerCoTSections.keys())
def extract_response(self, response_content: str) -> tuple[PlanAction, dict[str, str]]:
additional_sections = OrderedDict({})
response_json = parse_message_json(response_content)
cot_sections = self.computer_use_agent_interface.get_planner_cot_sections()
cot_sections_lst = list(cot_sections.keys())
for section in cot_sections_lst:
section_display = PlanerCoTSections[section]["display"]
section_display = cot_sections[section]["display"]
if section_display not in response_json:
raise ValidationException(
f"Invalid response format, '{section}' key not found: {response_content}"
)
additional_sections[section] = response_json.get(
PlanerCoTSections[section]["display"]
)
raise ValidationException(f"Invalid response format, '{section_display}' key not found: {response_content}")
additional_sections[section] = response_json.get(section_display)
if "action" not in response_json:
raise ValidationException(
f"Invalid response format, 'action' key not found: {response_content}"
)
raise ValidationException(f"Invalid response format, 'action' key not found: {response_content}")
action_dict = response_json["action"]
plan_action = PlanAction.from_dict(self.correct_action_type(action_dict))
plan_action = PlanAction.from_dict(ActionPlanner.correct_action_type(action_dict))
if plan_action is None:
raise ValidationException(f"Invalid action format: {response_content}")
if plan_action.action_type == PlanActionType.Drag:
self.computer_use_agent_interface.validate_action(plan_action)
return plan_action, additional_sections
def build_execution_info_message(self, execution_info: dict) -> str:
def build_execution_info_message(self, execution_info: ExecutionInfo) -> str:
execution_info_message = ""
if "planner_action_review" in execution_info:
action_description = execution_info["planner_action_review"][
"action_description"
]
error_message = execution_info["planner_action_review"]["error_message"]
execution_info_message = f"You predicted this action: '{action_description}' but it is not valid because: {error_message}. If the target element is not visible on the screenshot, scroll first to make the target element visible. If the target element is not correct, change the action description with more precise element description using nearby context."
if execution_info.planner_action_review is not None:
action_description = execution_info.planner_action_review["action_description"]
error_message = execution_info.planner_action_review["error_message"]
execution_info_message = f"You predicted this action: '{action_description}' but it is not valid because: {error_message}. If the target element is not visible/fully visible on the screenshot, scroll first to make the target element visible. If the target element is not correct, change the action description with more precise element description using nearby context."
elif execution_info.responses and len(execution_info.responses) > 0 and execution_info.responses[-1].grounding_error is not None:
grounding_error = execution_info.responses[-1].grounding_error
error_message = str(grounding_error)
execution_info_message = f"The predicted is not valid because of this {error_message}. If the target element is not visible/fully visible on the screenshot, scroll first to make the target element visible. If the target element is not correct, change the action description with more precise element description using nearby context."
return execution_info_message
def correct_action_type(self, response_json: dict) -> dict:
@staticmethod
def correct_action_type(response_json: dict) -> dict:
action_type = response_json.get("type", "").lower()
if action_type in ("press", "key_press", "press_key"):
response_json["type"] = "key_press"
@@ -234,11 +267,13 @@ class ActionPlanner(object):
response_json["type"] = "wait"
return response_json
def predict(self, state: State, execution_state: ExecutionState) -> PlannerOutput:
messages = self.build_messages_chat(state, execution_state.execution_info)
async def predict(self, state: State, execution_state: ExecutionState) -> PlannerOutput:
messages = self.build_messages_chat(state, execution_state)
llm_messages = [message for message in messages]
repeat_count = 2
plan, response_content = None, None
repeat_count = 3
response_content = ""
plan_action = None
additional_sections = {}
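# Retry the planner call up to repeat_count times, feeding any validation error back to the model as an extra message.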
while repeat_count > 0:
try:
payload = {
@@ -250,13 +285,14 @@ class ActionPlanner(object):
response_content = llm_client.send_messages(payload)
if response_content is None or len(response_content.strip()) == 0:
raise ValidationException("Planner response is None or empty")
plan_action, additional_sections = self.extract_response(
str(response_content)
)
plan = PlannerOutput(plan_action, additional_sections)
plan_action, additional_sections = self.extract_response(str(response_content))
llm_memory_response = additional_sections.get("memory", None)
memory_operations = self.short_term_memory_manager.extract_memory_operations(llm_memory_response)
execution_state.execution_info.current_response.raw_planning_prediction = response_content
break
except ValidationException as e:
time.sleep(5)
repeat_count -= 1
ai_message = {
"role": "assistant",
@@ -280,9 +316,15 @@ class ActionPlanner(object):
llm_messages = messages + [ai_message, error_message]
if repeat_count == 0:
raise ValueError(
f"Invalid planner response format: {response_content}, {str(e)}"
)
if plan is None:
raise ValueError(f"Invalid planner response format: {response_content}")
if plan_action is None:
raise ValueError("Planner response is not valid")
return plan
planner_output = PlannerOutput(
plan_action=plan_action,
additional_sections=additional_sections,
)
updated_memory = await self.short_term_memory_manager.get_updated_memory(
state, memory_operations, execution_state=execution_state
)
planner_output.additional_sections["memory"] = json.dumps(updated_memory, indent=4, ensure_ascii=False)
return planner_output

View File

@@ -1,8 +1,11 @@
from collections import OrderedDict
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional
from enum import Enum
from mm_agents.uipath.types_utils import PlanAction, key_maps
from mm_agents.uipath.utils import ValidationException
from mm_agents.uipath.memory import memory_system_template
system_template = """You are a computer use agent that performs computer-related tasks.
You will be given a task, a current screenshot, and a list of previous actions. You need to predict the next action.
@@ -25,91 +28,144 @@ Your action response must be a valid JSON with the following format:
{{
"type": str # one of the valid action types
"description": # action description
"parameters": # optional, action parameters dictionary
"parameters": # optional, action parameters dictionary
}}
## Action examples: examples of valid actions:
{examples}
## Important Notes:
- Close any cookie, ad, login, registration or similar pop-ups if not needed.
- Before typing, ensure the input box is focused by clicking on it.
## Action Sequence Example:
Here is an example of the correct sequence for typing text into an input field.
Step 1: Scroll to make the 'Username' input field fully visible.
{{
"type": "scroll",
"description": "Scroll page to make the 'Username' input field fully visible."
"parameters": {{"element_description": "the main page", "direction": "down", "distance": 3}}
}}
Step 2: Click the input field to focus it.
{{
"type": "click",
"description": "Click the 'Username' input field."
}}
Step 3: Type the desired text.
{{
"type": "type",
"description": "Type 'testuser' into the focused 'Username' input field.",
"parameters": {{
"text": "testuser"
}}
}}
## Important Rules:
CRITICAL: Always click to focus an input field before using the type action if it is not focused already from a previous step. The model must predict a click on the element, and then in the next step, predict the type action.
Close any cookies, ads, login or registration pop-ups if they are not needed for the task.
Before the finish action, ensure all necessary data entries or selections are committed by performing appropriate actions (e.g., pressing 'Enter'/'Tab', Ctrl+S for saving documents or clicking 'Save', changing focus, or blurring the input field).
- **Strict Adherence**: Only perform actions the user has explicitly requested; avoid unnecessary steps. E.g. For colors, ensure that if user requested to use "green" you use the color named green, not light green or other shades.
- CRITICAL: Make sure the modified files or settings are saved and if no file name is specified in the user task, use the default settings that appear.
- Dismiss "Authentication required" prompts by clicking "Cancel".
- Leave windows/applications open at task completion.
- **Completion Criteria**: Only finish when all user requirements are met in full and all running commands have finished.
- **Impossibility Handling**: Return failure if completion is blocked by environmental constraints.
- You must never log out of or shut down the computer, otherwise you won't be able to interact with the environment; if an action requires this, mark it as a failure
"""
user_command_template = """Recall Task Again: {task}
Check if the task is finished. If not provide the next action to perform.
Remember:
- Perform the task on provided application(s) or website(s). You are not allowed to use the browser "address bar".
- Close any cookies, ads, login or registration etc pop-ups if not needed.
- Only one action at a time (never "click and type", "click and drag", "type and press", "press shift and click", etc..). Think of how to combine them in two consecutive actions obtaining the intended result or use an available action that can obtain it.
- For any opening input combobox, dropdown menu options, you must select an option or press Enter key to select default one.
- Click on input box to ensure is focused before typing. Otherwise, the input box will not accept the text.
- Once focusing on an input box, if it has a default pre-typed value (not placeholder which is usually grayed-out), remove the existing value first by clicking on "X" icon or using "Ctrl A" + "Backspace" or "Backspace" if the value is already selected.
- For search input, if no search button or suggestions popup after typing, press 'Enter' to trigger search.
- Retry the drag action on slider control if needed to refine the slider values closer to expected values.
- Scroll / Pageup / Pagedown to explore or extract more content/data if needed (prefer 'key_press' action with key 'Pageup', 'Pagedown' for faster scrolling). Particularly when extraction data from table with hidden rows or columns.
- Scroll action must have a 'direction' parameter. Finish action must have a 'status' parameter.
- If you modify some settings remember to save/apply them. If button is not visible try to scroll for it.
user_message_template = """Here are the current information:
The current date is (YYYY-MM-DD): {current_date}
Task: {task}
Most importantly, never type or click on element not visible on screenshot. Use scroll or pageup/pagedown to make the element visible first.
{execution_info_message}
Answer in json format:
{json_output_format}
Previous actions:
{history}
"""
PlanerCoTSections = OrderedDict(
{
"review": {
"display": "previous_action_result",
"description": "Briefly describe the previous action result and UI change on the screenshot to see if is correctly performed.",
},
"thought": {
"display": "thought",
"description": "Reason briefly about the next action to perform if the task is not finished.",
},
"action_description": {
"display": "action_description",
"description": "Describe the action to perform in a single sentence. The description must be precise and not rely on specific information in the current screen.",
},
}
)
### for chat conversation
user_task_info_template = """## Task Information:
The current date is (YYYY-MM-DD): {current_date}
Task: {task}
"""
user_command_template_chat = """Current Memory: {memory}
Check if the task is finished. If not provide the next action to perform.
Remember:
- Perform the task on provided application(s) or website(s). You are not allowed to use the browser "address bar".
- Close any cookie, ad, login, registration or similar pop-ups if not needed.
- Only one action at a time (never "click and type", "click and drag", "type and press" etc..).
- For any open input combobox or dropdown menu, you must select an option or press the Enter key to select the default one.
- The caret is not always visible in an input box even when the input box is focused
- CRITICAL: Scroll to make the target element fully visible on the screenshot before clicking or typing on it. Never click or type on an element not fully visible on the screenshot.
- CRITICAL: Before typing ensure the element is focused by first clicking it. Otherwise, the input box will not accept the text.
- Once an input box is focused, if it has a default pre-typed value (not a placeholder, which is usually grayed out), remove the existing value first by clicking the "X" icon or using "Ctrl A" + "Backspace", or just "Backspace" if the value is already selected.
- For search input, if no search button or suggestions popup after typing, press 'Enter' to trigger search.
- Retry the drag action on slider control if needed to refine the slider values closer to expected values.
- Scroll / Pageup / Pagedown to explore or extract more content/data if needed (prefer 'key_press' action with key 'Pageup', 'Pagedown' for faster scrolling). Particularly when extracting data from tables with hidden rows or columns.
- Scroll action must have a 'direction' parameter. Finish action must have a 'status' parameter.
MOST IMPORTANTLY, never type or click on an element not visible on the screenshot. Use scroll or pageup/pagedown to make the element visible first.
{execution_info_message}
Answer in json format:
{json_output_format}
"""
user_command_template = """Recall Task Again: {task}\n""" + user_command_template_chat
class PlanerCoTSectionsType(str, Enum):
Review = "review"
Thought = "thought"
ActionDescription = "action_description"
Memory = "memory"
PlanerCoTSections = OrderedDict(
{
PlanerCoTSectionsType.Review: {
"display": "previous_action_result",
"description": "Briefly describe the previous action result and UI change on the screenshot to see if is correctly performed.",
},
PlanerCoTSectionsType.Thought: {"display": "thought", "description": "Reason briefly about the next action to perform if the task is not finished."},
PlanerCoTSectionsType.ActionDescription: {
"display": "action_description",
"description": "Describe the action to perform in a single sentence. The description must be precise and not rely on specific information in the current screen.",
},
PlanerCoTSectionsType.Memory: {
"display": "update_memory",
"description": "<Proceed with a memory update considering the previous actions. Emit a list of memory operations. If no memory update is needed, emit an empty list>",
},
}
)
@dataclass
class ActionDefinition:
"""Simple action definition with description, parameters, and examples"""
type: str
description: str
parameters: Optional[Dict[str, str]] = None
examples: List[Dict[str, Any]] = field(default_factory=list)
class PlannerOutput(object):
def __init__(self, plan_action: PlanAction, additional_sections: dict[str, str]):
self.plan_action = plan_action
self.thought = additional_sections["thought"]
self.review = additional_sections["review"]
self.additional_sections = {
key: value
for key, value in additional_sections.items()
if key not in ["review", "thought"]
}
class ComputerUseAgentInterface:
"""Simple computer use agent with modular action definitions"""
def __init__(self):
self.ui_actions = {}
self.special_actions = {}
self._setup_default_actions()
def get_planner_cot_sections(self) -> OrderedDict:
cot_sections = PlanerCoTSections.copy()
return cot_sections
def _setup_default_actions(self):
"""Define all available actions"""
# Click action - no parameters
self.add_action(
ActionDefinition(
type="click",
@@ -120,124 +176,121 @@ class ComputerUseAgentInterface:
"type": "click",
"description": "Click the 'X' icon in the input box",
},
{
"type": "click",
"description": "Click the first name input box to focus on it.",
},
{"type": "click", "description": "Click the first name input box to focus on it."},
],
)
)
# Right click action - no parameters
self.add_action(
ActionDefinition(
type="right_click",
description="Right click on a UI element",
examples=[
{
"type": "right_click",
"description": "Right click on the first row from the patient table to open the context menu.",
}
],
examples=[{"type": "right_click", "description": "Right click on the first row from the patient table to open the context menu."}],
)
)
# Double click action - no parameters
self.add_action(
ActionDefinition(
type="double_click",
description="Double click on a UI element",
examples=[
{
"type": "double_click",
"description": "Double click word app icon to open the application.",
},
{"type": "double_click", "description": "Double click word app icon to open the application."},
],
)
)
# Triple click action - no parameters
self.add_action(
ActionDefinition(
type="triple_click",
description="Triple click on a UI element",
examples=[
{"type": "triple_click", "description": "Triple click the second paragraph to select it."},
],
)
)
# Type action - with text parameter
self.add_action(
ActionDefinition(
type="type",
description="Type text into a focused input field. Ensure the input box is focused before typing. To focus the input box, you may need to click on it first.",
parameters={"text": "str - the text to be typed"},
examples=[
{
"type": "type",
"description": "Type 'John' in the first name input box.",
"parameters": {"text": "John"},
},
{
"type": "type",
"description": "Type 'Doe' in the last name input box.",
"parameters": {"text": "Doe"},
},
{
"type": "type",
"description": "Type 'Hello, world!' in the text area.",
"parameters": {"text": "Hello, world!"},
},
{"type": "type", "description": "Type 'John' in the first name input box.", "parameters": {"text": "John"}},
{"type": "type", "description": "Type 'Doe' in the last name input box.", "parameters": {"text": "Doe"}},
{"type": "type", "description": "Type 'Hello, world!' in the text area.", "parameters": {"text": "Hello, world!"}},
],
)
)
# Scroll action - with direction parameter
self.add_action(
ActionDefinition(
type="scroll",
description="Scroll an UI element in a specified direction",
parameters={
"element_description": "str - description of the element to be scrolled such that the executor can locate it",
"direction": "str - 'up', 'down', 'left', or 'right'",
"distance": "int - the number of scroll steps (wheel “clicks”) to send.",
"distance": "int - number of 'clicks' to scroll, e.g. on windows, 1 click = 120 units of scroll internally",
},
examples=[
{
"type": "scroll",
"description": "Scroll down to see more content.",
"parameters": {"direction": "down"},
"description": "Scroll down the user table to see more content.",
"parameters": {"element_description": "Users table", "direction": "down", "distance": "6"},
},
{
"type": "scroll",
"description": "Scroll up to the top of the page.",
"parameters": {"direction": "up"},
"parameters": {"element_description": "the main page", "direction": "up"},
},
],
)
)
# Drag action
self.add_action(
ActionDefinition(
type="drag",
description="Drag an element or the mouse (with left click on) from one location to another. You must specify both start_description and end_description.",
parameters={
"start_description": "description of the location to start dragging",
"end_description": "description of the location to drag to",
},
description="Drag an element or the mouse (with left click on) from one location to another.",
parameters={"start_description": "description of the location to start dragging", "end_description": "description of the location to drag to"},
examples=[
{
"type": "drag",
"description": "Drag the response.txt file to the responses folder",
"start_description": "Click the response.txt file",
"end_description": "Click the responses folder",
"parameters": {
"start_description": "the response.txt file",
"end_description": "the responses folder",
},
},
{
"type": "drag",
"description": "Drag the profile picture image into the upload box",
"parameters": {
"start_description": "the profile picture image",
"end_description": "the upload box",
},
},
],
)
)
# Mouse move action
self.add_action(
ActionDefinition(
type="mouse_move",
description="Move the mouse to a specific element",
examples=[
{
"type": "mouse_move",
"description": "Move the mouse to the 'Submit' button.",
},
{
"type": "mouse_move",
"description": "Hover over the 'Settings' icon.",
},
{"type": "mouse_move", "description": "Move the mouse to the 'Submit' button."},
{"type": "mouse_move", "description": "Hover over the 'Settings' icon."},
],
)
)
# Key press action - with key parameter
self.add_action(
ActionDefinition(
type="key_press",
@@ -246,50 +299,55 @@ class ComputerUseAgentInterface:
"key": f'str # the key or key combination (separated by space) to be pressed. Example of key combination "Ctrl A", "Shift Tab", "Ctrl C" etc. "<Key> + Click" is not a valid combination, use two separate actions. Beside normal keys like letters, numerics, punctuations etc.. here are special key list: {key_maps.keys()}.'
},
examples=[
{
"type": "key_press",
"description": "Press 'Ctrl A' to select all text.",
"parameters": {"key": "Ctrl A"},
},
{
"type": "key_press",
"description": "Press Pagedown key.",
"parameters": {"key": "Pagedown"},
},
{"type": "key_press", "description": "Press 'Ctrl A' to select all text.", "parameters": {"key": "Ctrl A"}},
{"type": "key_press", "description": "Press Pagedown key.", "parameters": {"key": "Pagedown"}},
],
)
)
# Extract data action - with variable parameter
self.add_special_action(
ActionDefinition(
type="extract_data",
description="Use to extract some data from the screen for the task. This data will be stored in memory and used in the next actions or returned in the final result.",
parameters={
"description": "str - short description of the data to be extracted",
"data": "str|json - the data to be extracted",
},
parameters={"description": "str - short description of the data to be extracted", "data": "str|json - the data to be extracted"},
examples=[
{
"type": "extract_data",
"description": "Extract the product name and price from the screen.",
"parameters": {
"description": "Available product name and price",
"data": "Product Name: iPhone 14, Price: $999",
},
"parameters": {"description": "Available product name and price", "data": "Product Name: iPhone 14, Price: $999"},
},
],
)
)
# Wait action
self.add_special_action(
ActionDefinition(
type="wait",
description="Use it to wait for the completion of an event.",
examples=[
{"type": "wait", "description": "Wait for the running command to finish."},
],
)
)
# Finish action - with status parameter
self.add_special_action(
ActionDefinition(
type="finish",
description=" Use it to finish the task with success or failure status. When you think the task was finished return success, while when you think can not be done, return failure, don't easily say failure, try your best to do the task.",
description=(
"Use it to finish the task with success or failure. "
"Before finishing, ensure all necessary data entries or selections required by the task are committed by performing appropriate actions (e.g., pressing 'Enter'/ 'Tab', pressing CTRL + S to save the document or clicking 'Save', changing focus, or blurring the input field). After typing a value that should be set/submitted, perform a COMMIT action (Enter, Tab, click Save/Apply or blur) before using the finish action.",
"Do not use the finish action while any essential process or command (e.g., downloading data, running a script, loading results) is still in progress; wait for it (emmit wait action) to fully complete before finishing. ",
"Failure status is used when the task is impossible to complete or you are unable to complete it (e.g. stuck in a loop, etc)."
),
parameters={"status": "str - 'success' or 'failure'"},
examples=[
{"type": "finish", "description": "Task completed successfully.", "parameters": {"status": "success"}},
{
"type": "finish",
"description": "Task completed successfully.",
"description": "After typing 'John Doe' and pressing TAB to save the value, finish the task successfully.",
"parameters": {"status": "success"},
},
],
@@ -297,15 +355,19 @@ class ComputerUseAgentInterface:
)
def add_action(self, action: ActionDefinition):
"""Add a new action to the agent"""
self.ui_actions[action.type] = action
def add_special_action(self, action: ActionDefinition):
"""Add a special action that is not part of the main UI actions"""
self.special_actions[action.type] = action
def get_action_definition(self, action_type: str) -> Optional[ActionDefinition]:
"""Get action definition by type"""
return self.ui_actions.get(action_type) or self.special_actions.get(action_type)
def validate_action(self, action: PlanAction):
"""Validate if the action is valid and has all required parameters"""
action_definition = self.get_action_definition(action.action_type)
if action_definition is None:
raise ValidationException(f"Invalid action type: {action.action_type}")
@@ -313,26 +375,25 @@ class ComputerUseAgentInterface:
if action_definition.parameters:
for parameter in action_definition.parameters:
if parameter not in action.parameters:
raise ValidationException(
f"Missing parameter '{parameter}' in action: {action}"
)
raise ValidationException(f"Missing parameter '{parameter}' in action: {action}")
def get_system_prompt(self) -> str:
"""Generate the complete prompt for the agent"""
indentation = " "
def get_action_definition(action: ActionDefinition) -> str:
"""Format action definitions for the prompt"""
action_prompt = f"- {action.type}: {action.description}"
if action.parameters is not None and len(action.parameters) > 0:
params = (",\n" + 2 * indentation).join(
f"{k}: {v}" for k, v in action.parameters.items()
)
parameter_def = (
f"{indentation}parameters:\n{indentation}{indentation}{params}"
)
params = (",\n" + 2 * indentation).join(f"{k}: {v}" for k, v in action.parameters.items())
parameter_def = f"{indentation}parameters:\n{indentation}{indentation}{params}"
action_prompt += "\n" + parameter_def
return action_prompt
def get_examples(actions: List[ActionDefinition]) -> list[str]:
"""Format action examples for the prompt"""
output_examples = []
for action in actions:
for example in action.examples:
@@ -343,48 +404,23 @@ class ComputerUseAgentInterface:
example_parts = [type_str, description_str]
if "parameters" in example:
params = (",\n" + 2 * indentation).join(
f'"{k}": "{v}"' for k, v in example["parameters"].items()
)
parameters_str = (
'"parameters"'
+ ": {\n"
+ 2 * indentation
+ params
+ "\n"
+ indentation
+ "}"
)
params = (",\n" + 2 * indentation).join(f'"{k}": "{v}"' for k, v in example["parameters"].items())
parameters_str = '"parameters"' + ": {\n" + 2 * indentation + params + "\n" + indentation + "}"
example_parts.append(parameters_str)
example_json = (
"{\n"
+ indentation
+ (",\n" + indentation).join(example_parts)
+ "\n}"
)
example_json = "{\n" + indentation + (",\n" + indentation).join(example_parts) + "\n}"
output_examples.append(example_json)
return output_examples
available_actions = "\n\n".join(
get_action_definition(action) for action in self.ui_actions.values()
)
special_actions = "\n\n".join(
get_action_definition(action) for action in self.special_actions.values()
)
examples = "\n\n".join(
get_examples(
list(self.ui_actions.values()) + list(self.special_actions.values())
)
)
available_actions = "\n\n".join(get_action_definition(action) for action in self.ui_actions.values())
special_actions = "\n\n".join(get_action_definition(action) for action in self.special_actions.values())
examples = "\n\n".join(get_examples(list(self.ui_actions.values()) + list(self.special_actions.values())))
return system_template.format(
available_actions=available_actions,
special_actions=special_actions,
examples=examples,
)
out = system_template.format(available_actions=available_actions, special_actions=special_actions, examples=examples)
out += "\n\n" + memory_system_template.format()
return out
if __name__ == "__main__":
agent = ComputerUseAgentInterface()
print(agent.get_system_prompt())
print(agent.get_system_prompt())

View File

@@ -19,113 +19,19 @@ class UiPathComputerUseV1(object):
self.planner = ActionPlanner()
self.executor = GrounderClient()
async def predict_request(
self, request_body: dict, model_name: str
) -> tuple[dict, dict]:
async def predict_request(self, request_body: dict, model_name: str) -> tuple[dict, dict]:
previous_steps = request_body['previousSteps'] if request_body['previousSteps'] else []
state = State(
task=request_body["userTask"],
image_base64=request_body["image"],
previous_steps=request_body.get("previousSteps", []),
previous_steps=[step for step in previous_steps],
)
execution_state = ExecutionState(model_name=model_name, execution_info={})
output = await self.predict(state, execution_state)
execution_state = ExecutionState(model_name=model_name)
output = await self.predict(state, execution_state, max_retries=2)
return output
def process_grounding(
self,
plan_action: PlanAction,
grounding_result: utils.GroundingOutput,
x: int,
y: int,
):
match plan_action.action_type:
case PlanActionType.Scroll:
# guess the scroll direction if missing in the plan output
if "direction" not in plan_action.parameters:
if "scroll up" in plan_action.description.lower():
scroll_direction = "up"
else:
scroll_direction = "down"
else:
scroll_direction = plan_action.parameters["direction"]
action = ComputerUseAction(
name=SupportedActions.Scroll,
description=plan_action.description,
parameters={"position": [x, y], "direction": scroll_direction},
)
if "distance" in plan_action.parameters:
match scroll_direction:
case "up":
action.parameters["offset"] = [
0,
plan_action.parameters["distance"],
]
case "down":
action.parameters["offset"] = [
0,
-plan_action.parameters["distance"],
]
case "left":
action.parameters["offset"] = [
plan_action.parameters["distance"],
0,
]
case "right":
action.parameters["offset"] = [
-plan_action.parameters["distance"],
0,
]
case PlanActionType.Drag:
assert grounding_result.end_position is not None, (
"End position must be provided for drag action"
)
x_end, y_end = grounding_result.end_position
action = ComputerUseAction(
name=SupportedActions.Drag,
description=plan_action.description,
parameters={
"path": [
{"x": x, "y": y},
{"x": x_end, "y": y_end},
]
},
)
case _:
action_name = plan_action.action_type
parameters = {"position": [x, y]}
if plan_action.action_type == PlanActionType.DoubleClick:
action_name = SupportedActions.Click
parameters["click_type"] = "double"
elif plan_action.action_type == PlanActionType.RightClick:
action_name = SupportedActions.Click
parameters["button"] = "right"
elif plan_action.action_type == PlanActionType.MouseMove:
action_name = SupportedActions.MouseMove # different names
assert action_name in [
SupportedActions.Click,
SupportedActions.MouseMove,
]
action = ComputerUseAction(
name=action_name,
description=plan_action.description,
parameters=parameters,
)
return action
async def predict(
self, state: State, execution_state: ExecutionState
) -> tuple[dict, dict]:
planer_output: PlannerOutput = self.planner.predict(state, execution_state)
plan_action = planer_output.plan_action
action: ComputerUseAction | None = None
step: ComputerUseStep | None = None
def wrap_to_computer_use_action(self, plan_action: PlanAction, grounding_result: utils.GroundingOutput | None) -> ComputerUseAction:
match plan_action.action_type:
case PlanActionType.KeyPress:
keys = plan_action.parameters["key"].split(" ")
@@ -142,6 +48,125 @@ class UiPathComputerUseV1(object):
description=plan_action.description,
parameters={},
)
case PlanActionType.Click | PlanActionType.DoubleClick | PlanActionType.TripleClick | PlanActionType.MouseMove | PlanActionType.RightClick:
action_name = plan_action.action_type
x, y = grounding_result.position
parameters = {"position": [int(x), int(y)]}
if plan_action.action_type == PlanActionType.DoubleClick:
action_name = SupportedActions.Click
parameters["click_type"] = "double"
elif plan_action.action_type == PlanActionType.TripleClick:
action_name = SupportedActions.Click
parameters["click_type"] = "triple"
elif plan_action.action_type == PlanActionType.RightClick:
action_name = SupportedActions.Click
parameters["button"] = "right"
elif plan_action.action_type == PlanActionType.MouseMove:
action_name = SupportedActions.MouseMove # different names
assert action_name in [SupportedActions.Click, SupportedActions.MouseMove]
action = ComputerUseAction(
name=action_name,
description=plan_action.description,
parameters=parameters,
)
case PlanActionType.Drag:
assert grounding_result.end_position is not None, "End position must be provided for drag action"
x, y = grounding_result.position
x_end, y_end = grounding_result.end_position
x, y = int(x), int(y)
x_end, y_end = int(x_end), int(y_end)
action = ComputerUseAction(
name=SupportedActions.Drag,
description=plan_action.description,
parameters={"path": [{"x": x, "y": y}, {"x": x_end, "y": y_end}]},
)
case PlanActionType.Scroll:
x, y = grounding_result.position
x, y = int(x), int(y)
# guess the scroll direction if missing in the plan output
if "direction" not in plan_action.parameters:
if "scroll up" in plan_action.description.lower():
scroll_direction = "up"
else:
scroll_direction = "down"
else:
scroll_direction = plan_action.parameters["direction"]
action = ComputerUseAction(
name=SupportedActions.Scroll, description=plan_action.description, parameters={"position": [x, y], "direction": scroll_direction}
)
if "distance" in plan_action.parameters:
match scroll_direction:
case "up":
action.parameters["offset"] = [0, plan_action.parameters["distance"]]
case "down":
action.parameters["offset"] = [0, -plan_action.parameters["distance"]]
case "left":
action.parameters["offset"] = [plan_action.parameters["distance"], 0]
case "right":
action.parameters["offset"] = [-plan_action.parameters["distance"], 0]
case PlanActionType.Type:
action = ComputerUseAction(
name=SupportedActions.TypeInto,
description=plan_action.description,
parameters={"value": plan_action.parameters["text"]},
)
return action
async def predict(
self, state: State, execution_state: ExecutionState, max_retries: int = 0, planer_output: PlannerOutput | None = None
) -> tuple[dict, dict]:
execute_planning = True
is_planning_fixed = planer_output is not None
execution_count = 0
execution_state.execution_info.responses = []
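# Re-plan when the grounder cannot locate the described element, up to max_retries attempts (unless the plan is fixed).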
while execute_planning:
try:
execution_count += 1
if execution_state.execution_info.current_response is not None:
execution_state.execution_info.responses.append(execution_state.execution_info.current_response)
execution_state.execution_info.current_response = utils.RawAgentResponse()
if not is_planning_fixed:
planer_output = await self.planner.predict(state, execution_state)
plan_action = planer_output.plan_action
step = await self.process_plan_and_ground(planer_output, state, execution_state, retry_number=max_retries)
execute_planning = False
except utils.GroundingOutputValidationException as e:
execution_state.execution_info.current_response.grounding_error = e
if is_planning_fixed or execution_count > max_retries:
raise ValueError(f"Grounding error with fixed plan: {e.message}, element description: {e.element_description}")
# save additional data for history
assert step is not None
assert step.additional_parameters is not None
step.additional_parameters["thought"] = planer_output.thought
step.additional_parameters["review"] = planer_output.review
step.additional_parameters.update(planer_output.additional_sections)
step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())
history_image = state.image_base64
previous_steps_parameters = {
"max_chat_history_messages": 1000,
"max_chat_history_images": 1,
"image": history_image,
}
agent_response = {"step": step.to_response_dict(), "previous_steps_parameters": previous_steps_parameters}
return agent_response
async def process_plan_and_ground(
self, planer_output: PlannerOutput, state: State, execution_state: ExecutionState, retry_number: int = 0
) -> ComputerUseStep:
plan_action = planer_output.plan_action
action: ComputerUseAction | None = None
step: ComputerUseStep | None = None
match plan_action.action_type:
case PlanActionType.ExtractData:
# return a step with no action, just to store the extracted data
step = ComputerUseStep(
@@ -164,35 +189,29 @@ class UiPathComputerUseV1(object):
| PlanActionType.Scroll
| PlanActionType.Drag
| PlanActionType.DoubleClick
| PlanActionType.TripleClick
| PlanActionType.RightClick
):
if plan_action.action_type != PlanActionType.Drag:
element_description = plan_action.parameters.get("element_description", None)
grounding_result = await self.executor.predict(
state.image_base64,
plan_action.description,
action=plan_action.action_type,
element_description=element_description
)
else:
grounding_result = await self.executor.predict(
state.image_base64,
plan_action.parameters["start_description"],
action=plan_action.action_type,
)
grounding_result_end = await self.executor.predict(
state.image_base64,
plan_action.parameters["end_description"],
action=plan_action.action_type,
)
grounding_result.end_position = grounding_result_end.position
x, y = grounding_result.position
action = self.process_grounding(plan_action, grounding_result, x, y)
case PlanActionType.Type:
action = ComputerUseAction(
name=SupportedActions.TypeInto,
description=plan_action.description,
parameters={"value": plan_action.parameters["text"]},
)
start_description = plan_action.parameters.get("start_description", None)
end_description = plan_action.parameters.get("end_description", None)
drag_entire_description = plan_action.description
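# Include the full drag description alongside each endpoint so the grounder sees the overall intent, not just an isolated point.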
drag_start_description = f"Drag Start point:{start_description}. [Full Drag Description:{drag_entire_description}]"
drag_end_description = f"Drag End point:{end_description}. [Full Drag Description:{drag_entire_description}]"
grounding_result = await self.executor.predict(state.image_base64, drag_start_description, action=plan_action.action_type)
grounding_result_end = await self.executor.predict(state.image_base64, drag_end_description, action=plan_action.action_type)
grounding_result.end_position = grounding_result_end.get_point_location()
action = self.wrap_to_computer_use_action(plan_action, grounding_result)
case _:
action = self.wrap_to_computer_use_action(plan_action, grounding_result=None)
if step is None:
assert action is not None
step = ComputerUseStep(
@@ -202,22 +221,4 @@ class UiPathComputerUseV1(object):
thought=planer_output.thought,
)
# save additional data for history
assert step.additional_parameters is not None
step.additional_parameters["thought"] = planer_output.thought
step.additional_parameters["review"] = planer_output.review
step.additional_parameters.update(planer_output.additional_sections)
step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())
history_image = state.image_base64
previous_steps_parameters = {
"max_chat_history_messages": 1000,
"max_chat_history_images": self.planner.number_history_steps_with_images,
"image": history_image,
}
agent_response = {
"step": step.to_response_dict(),
"previous_steps_parameters": previous_steps_parameters,
}
return agent_response
return step

View File

@@ -4,21 +4,20 @@ import os
class GrounderClient(object):
def __init__(self):
# Proxy for hosting UI-TARS + UiElementPredictor
# Could be replaced with a VLLM server and grounder (UI-TARS) specific processing
# Or any other grounder
# Proxy for hosting finetuned Qwen3VL + UiElementPredictor
# Could be replaced with a vLLM server and grounder-specific processing
self.url = ""
async def predict(
self, image_base64: str, action_description: str, action: str | None = None
self, image_base64: str, action_description: str, action: str, element_description: str | None = None,
) -> utils.GroundingOutput:
request = utils.GroundingRequest(
description=action_description,
image_base64=image_base64,
action_type=action,
element_description=element_description
)
api_key = os.getenv("SERVICE_KEY")
async with httpx.AsyncClient() as client:
response = await client.post(
self.url,
@@ -26,6 +25,7 @@ class GrounderClient(object):
"image_base64": request.image_base64,
"action_description": request.description,
"action": request.action_type,
"element_description": request.element_description,
},
headers={
"X-API-KEY": api_key
@@ -37,6 +37,8 @@ class GrounderClient(object):
raise ValueError(f"Prediction failed: {response.text}")
data = response.json()
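# A (-1, -1) position is the grounder's refusal signal: the described element was not found on the screen.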
if tuple(data["position"]) == (-1, -1):
raise utils.GroundingOutputValidationException(f"Element {request.description} not found in image", request.description)
return utils.GroundingOutput(
description=data["description"],
position=tuple(data["position"]),

View File

@@ -5,7 +5,6 @@ def send_messages(payload):
# URL to your proxy for calling LLMs
proxy_url = ""
api_key = os.getenv("SERVICE_KEY")
# Can be directly replaced with code for calling Azure endpoint as in:
#.env config example :
# AZURE_OPENAI_API_BASE=YOUR_API_BASE
@@ -40,5 +39,5 @@ def send_messages(payload):
for attempt in range(retries):
response = requests.post(proxy_url, headers=headers, json=payload)
if response.status_code == 200:
return response.json()["choices"][0]["message"]["content"]
return response.text
return None

mm_agents/uipath/memory.py (new file, 105 lines)
View File

@@ -0,0 +1,105 @@
import json
from enum import Enum
from mm_agents.uipath.utils import ValidationException, parse_message_json, ExecutionInfo
from mm_agents.uipath.types_utils import ExecutionState, State
memory_system_template = """You also have a SHORT TERM MEMORY that stores only data about the task. It is NOT a log of mechanical UI interactions. Use it to:
- Keep track of items that need to be processed as part of the task
- Store only information that might be useful later in the task
- DO NOT store information which can be easily inferred from the task description
Never record: scrolling, mouse movement / hover, focusing an input (unless it results in a committed value change), transient pop-ups you just closed, partial / intermediate typed characters, pure navigation clicks that do not yield a new verifiable state.
Memory supports only the following operations emitted as a LIST of JSON objects (empty list if no update):
- store_info # add or update information related to the task in memory
{{
"key": str, # the info key, must be unique
"info_type": Literal["data_update", "queue_elements"],
# data_update: different data related to the task
# queue_elements: list of items to be processed in the task
"value": str|json,
"description": str # Short human-readable description of the update (what changed and why it matters)
}}
- delete_info {{"key": str, "description": str}} - delete information from memory by key
Example: [{{"type": "store_info", "info_type": "queue_elements", "key": "scripts_to_be_executed", "value": "[script.py, script2.py, script3.py]", "description": "List of scripts that need to be executed as part of the task"}}]
"""
class EnumMemoryOperationType(str, Enum):
StoreInfo = "store_info"
DeleteInfo = "delete_info"
NoOp = "no_op"
class MemoryOperation(object):
def __init__(
self,
operation_type: str,
key: str | None = None,
value: str | dict | None = None,
description: str | None = None,
info_type: str | None = None,
):
self.operation_type = operation_type
self.key = key
self.value = value
self.description = description
self.info_type = info_type
@staticmethod
def from_dict(data: dict) -> "MemoryOperation":
operation_type = data.get("type", "").lower()
if data.get("info_type", None) is not None or data.get("value", None) is not None:
operation_type = EnumMemoryOperationType.StoreInfo
if operation_type not in (EnumMemoryOperationType.StoreInfo, EnumMemoryOperationType.DeleteInfo, EnumMemoryOperationType.NoOp):
raise ValidationException(f"Invalid memory operation type: {operation_type}")
if operation_type == EnumMemoryOperationType.StoreInfo:
if "key" not in data or "value" not in data:
raise ValidationException("StoreInfo operation requires 'key' and 'value'")
key = data.get("key", None)
value = data.get("value", None)
description = data.get("description", None)
info_type = data.get("info_type", None)
return MemoryOperation(operation_type, key, value, description, info_type)
class ShortTermMemoryManager:
async def get_updated_memory(
self, state: State, memory_operations: list[MemoryOperation], execution_state: ExecutionState
) -> dict[str, dict[str, str]]:
current_memory = json.loads(state.previous_steps[-1]["additional_parameters"].get("memory", "{}")) if len(state.previous_steps) > 0 else {}
for i, memory_operation in enumerate(memory_operations):
if memory_operation.operation_type == EnumMemoryOperationType.StoreInfo:
if "data" not in current_memory:
current_memory["data"] = {}
data_memory = current_memory["data"]
if memory_operation.key is None or memory_operation.value is None:
raise ValidationException("StoreInfo operation requires 'key' and 'value'")
if memory_operation.key not in data_memory:
data_memory[memory_operation.key] = {}
data_memory[memory_operation.key]["value"] = memory_operation.value
data_memory[memory_operation.key]["description"] = memory_operation.description
data_memory[memory_operation.key]["info_type"] = memory_operation.info_type
elif memory_operation.operation_type == EnumMemoryOperationType.DeleteInfo:
data_memory = current_memory.get("data", {})
data_memory.pop(memory_operation.key, None)
elif memory_operation.operation_type == EnumMemoryOperationType.NoOp:
pass
return current_memory
def extract_memory_operations(self, memory_response: str | None) -> list[MemoryOperation]:
if isinstance(memory_response, str):
try:
memory_response = json.loads(memory_response)
except Exception as e:
raise ValidationException(f"Invalid memory format, cannot parse JSON: {memory_response}. Error: {e}")
memory_operations = [MemoryOperation.from_dict(item) for item in memory_response]
return memory_operations
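A minimal usage sketch of the new memory manager (illustrative only, not part of the file above; it reuses the State and ExecutionState constructors shown elsewhere in this commit):

import asyncio
from mm_agents.uipath.memory import ShortTermMemoryManager
from mm_agents.uipath.types_utils import ExecutionState, State

async def demo_memory_update():
    manager = ShortTermMemoryManager()
    # The planner's "update_memory" section arrives as a JSON list of operations.
    llm_memory_response = '[{"type": "store_info", "info_type": "queue_elements", "key": "scripts_to_be_executed", "value": "[script.py, script2.py]", "description": "Scripts still to run"}]'
    operations = manager.extract_memory_operations(llm_memory_response)
    state = State(task="Run all scripts in the folder", image_base64="", previous_steps=[])
    execution_state = ExecutionState(model_name="planner-model")
    updated = await manager.get_updated_memory(state, operations, execution_state=execution_state)
    print(updated)  # {'data': {'scripts_to_be_executed': {...}}}

# asyncio.run(demo_memory_update())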

View File

@@ -1,5 +1,6 @@
from typing import Optional, Union, List
from enum import Enum
from mm_agents.uipath.utils import ExecutionInfo
key_maps = {
"Backspace": "Back",
@@ -21,6 +22,7 @@ key_maps = {
class PlanActionType(str, Enum):
Click = "click"
DoubleClick = "double_click"
TripleClick = "triple_click"
RightClick = "right_click"
Type = "type"
Scroll = "scroll"
@@ -189,6 +191,6 @@ class State(object):
class ExecutionState(object):
def __init__(self, model_name: str, execution_info: dict):
def __init__(self, model_name: str):
self.model_name = model_name
self.execution_info = execution_info
self.execution_info = ExecutionInfo()

View File

@@ -1,14 +1,32 @@
import json
import re
from typing import Optional
from json_minify import json_minify
from json_repair import repair_json
from dataclasses import dataclass, field
class ValidationException(Exception):
def __init__(self, message: str):
self.message = message
class GroundingOutputValidationException(ValidationException):
def __init__(self, message: str, element_description: str, raw_response: str | None = None):
super().__init__(message)
self.message = message
self.element_description = element_description
self.raw_response = raw_response
@dataclass
class RawAgentResponse:
raw_planning_prediction: str | None = None
grounding_error: Optional[GroundingOutputValidationException] = None
@dataclass
class ExecutionInfo:
planner_action_review: Optional[dict] = None
responses: list[RawAgentResponse] = field(default_factory=list) # can contain both planning and grounding raw responses
current_response: Optional[RawAgentResponse] = None
def parse_message_json(message: str) -> dict:
message = message.strip()
@@ -46,12 +64,20 @@ class GroundingOutput:
self.description = description
self.position = position
self.end_position = end_position
def get_point_location(self) -> tuple[int, int]:
if self.position is None:
x1, y1, x2, y2 = self.bbox
x, y = (x1 + x2) // 2, (y1 + y2) // 2
else:
x, y = self.position
return x, y
class GroundingRequest:
def __init__(
self, description: str, image_base64: str, action_type: str | None = None
self, description: str, image_base64: str, action_type: str | None = None, element_description: str | None = None
):
self.description = description
self.image_base64 = image_base64
self.action_type = action_type
self.element_description = element_description

View File

@@ -73,7 +73,7 @@ def map_uipath_agent_actions_to_osworld(actions):
if params["click_type"] == "double":
return {"action_type": "DOUBLE_CLICK", "x": x, "y": y}
elif params["click_type"] == "triple":
return {"action_type": "TRIPLE_CLICK", "x": x, "y": y}
return {"action_type": "CLICK", "x": x, "y": y, "num_clicks": 3}
else:
raise ValueError(f"Unknown click type: {params['click_type']}")
else:
@@ -165,23 +165,17 @@ class UipathBaseAgent:
{
"actions": rsp["step"]["actions"],
"description": rsp["step"]["description"],
"additional_parameters": {
"review": rsp["step"]["additional_parameters"]["review"],
"thought": rsp["step"]["additional_parameters"]["thought"],
"action_description": rsp["step"]["additional_parameters"][
"action_description"
],
"plan_action": rsp["step"]["additional_parameters"]["plan_action"],
},
"additional_parameters": rsp['step']['additional_parameters'],
"image": img_base64,
}
)
def predict(self, instruction: str, obs: Dict, args, step_idx) -> List:
if step_idx == args.max_steps - 1:
if step_idx >= args.max_steps - 1:
message = (
instruction
+ "The sudo password is password, if needed. This is the last step, you must return the finish actions with either success or failure, depending on the result. No further steps are allowed."
instruction + """You have reached the final step of the process.
At this point, no further actions can be taken - it may therefore be impossible to complete the task successfully.
Conclude by returning a finish action with success or failure, depending on what can be determined from the current state."""
)
else:
message = instruction + "The sudo password is password, if needed."
@@ -235,4 +229,4 @@ class UipathBaseAgent:
self.thoughts = []
self.actions = []
self.observations = []
self.uipath_hist = []
self.uipath_hist = []

View File

@@ -258,7 +258,11 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
except Exception as rec_e:
logger.error(f"Failed to end recording: {rec_e}")
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"}))
tb = traceback.format_exc()
f.write(json.dumps({
"Error": f"{domain}/{example_id} - {e}",
"Traceback": tb
}))
f.write("\n")
except Exception as e:
logger.error(f"Task-level error in {current_process().name}: {e}")
@@ -557,4 +561,4 @@ if __name__ == "__main__":
os.kill(p.pid, signal.SIGKILL)
logger.info(f"Process {p.name} force killed")
except Exception as e:
logger.error(f"Error force killing process: {e}")
logger.error(f"Error force killing process: {e}")