uipath v2 (#413)

* submission v2
* small updates
2026-01-09 02:47:20 +02:00
parent 5ef8bdfa35
commit 5463d3bb89
11 changed files with 643 additions and 425 deletions
--- a/mm_agents/uipath/agent.py
+++ b/mm_agents/uipath/agent.py
@@ -19,113 +19,19 @@ class UiPathComputerUseV1(object):
        self.planner = ActionPlanner()
        self.executor = GrounderClient()

-    async def predict_request(
-        self, request_body: dict, model_name: str
-    ) -> tuple[dict, dict]:
+    async def predict_request(self, request_body: dict, model_name: str) -> tuple[dict, dict]:
+        previous_steps = request_body['previousSteps'] if request_body['previousSteps'] else []
        state = State(
            task=request_body["userTask"],
            image_base64=request_body["image"],
-            previous_steps=request_body.get("previousSteps", []),
+            previous_steps=[step for step in previous_steps],
        )

-        execution_state = ExecutionState(model_name=model_name, execution_info={})
-        output = await self.predict(state, execution_state)
+        execution_state = ExecutionState(model_name=model_name)
+        output = await self.predict(state, execution_state, max_retries=2)
        return output

-    def process_grounding(
-        self,
-        plan_action: PlanAction,
-        grounding_result: utils.GroundingOutput,
-        x: int,
-        y: int,
-    ):
-        match plan_action.action_type:
-            case PlanActionType.Scroll:
-                # guess the scroll direction if missing in the plan output
-                if "direction" not in plan_action.parameters:
-                    if "scroll up" in plan_action.description.lower():
-                        scroll_direction = "up"
-                    else:
-                        scroll_direction = "down"
-                else:
-                    scroll_direction = plan_action.parameters["direction"]
-
-                action = ComputerUseAction(
-                    name=SupportedActions.Scroll,
-                    description=plan_action.description,
-                    parameters={"position": [x, y], "direction": scroll_direction},
-                )
-
-                if "distance" in plan_action.parameters:
-                    match scroll_direction:
-                        case "up":
-                            action.parameters["offset"] = [
-                                0,
-                                plan_action.parameters["distance"],
-                            ]
-                        case "down":
-                            action.parameters["offset"] = [
-                                0,
-                                -plan_action.parameters["distance"],
-                            ]
-                        case "left":
-                            action.parameters["offset"] = [
-                                plan_action.parameters["distance"],
-                                0,
-                            ]
-                        case "right":
-                            action.parameters["offset"] = [
-                                -plan_action.parameters["distance"],
-                                0,
-                            ]
-            case PlanActionType.Drag:
-                assert grounding_result.end_position is not None, (
-                    "End position must be provided for drag action"
-                )
-                x_end, y_end = grounding_result.end_position
-                action = ComputerUseAction(
-                    name=SupportedActions.Drag,
-                    description=plan_action.description,
-                    parameters={
-                        "path": [
-                            {"x": x, "y": y},
-                            {"x": x_end, "y": y_end},
-                        ]
-                    },
-                )
-            case _:
-                action_name = plan_action.action_type
-                parameters = {"position": [x, y]}
-
-                if plan_action.action_type == PlanActionType.DoubleClick:
-                    action_name = SupportedActions.Click
-                    parameters["click_type"] = "double"
-                elif plan_action.action_type == PlanActionType.RightClick:
-                    action_name = SupportedActions.Click
-                    parameters["button"] = "right"
-                elif plan_action.action_type == PlanActionType.MouseMove:
-                    action_name = SupportedActions.MouseMove  # different names
-
-                assert action_name in [
-                    SupportedActions.Click,
-                    SupportedActions.MouseMove,
-                ]
-                action = ComputerUseAction(
-                    name=action_name,
-                    description=plan_action.description,
-                    parameters=parameters,
-                )
-        return action
-
-    async def predict(
-        self, state: State, execution_state: ExecutionState
-    ) -> tuple[dict, dict]:
-        planer_output: PlannerOutput = self.planner.predict(state, execution_state)
-        plan_action = planer_output.plan_action
-
-        action: ComputerUseAction | None = None
-        step: ComputerUseStep | None = None
-
+    def wrap_to_computer_use_action(self, plan_action: PlanAction, grounding_result: utils.GroundingOutput | None) -> ComputerUseAction:
        match plan_action.action_type:
            case PlanActionType.KeyPress:
                keys = plan_action.parameters["key"].split(" ")
@@ -142,6 +48,125 @@ class UiPathComputerUseV1(object):
                    description=plan_action.description,
                    parameters={},
                )
+            case PlanActionType.Click | PlanActionType.DoubleClick |  PlanActionType.TripleClick | PlanActionType.MouseMove | PlanActionType.RightClick:
+                action_name = plan_action.action_type
+                x, y = grounding_result.position
+                parameters = {"position": [int(x), int(y)]}
+
+                if plan_action.action_type == PlanActionType.DoubleClick:
+                    action_name = SupportedActions.Click
+                    parameters["click_type"] = "double"
+                elif plan_action.action_type == PlanActionType.TripleClick:
+                    action_name = SupportedActions.Click
+                    parameters["click_type"] = "triple"
+                elif plan_action.action_type == PlanActionType.RightClick:
+                    action_name = SupportedActions.Click
+                    parameters["button"] = "right"
+                elif plan_action.action_type == PlanActionType.MouseMove:
+                    action_name = SupportedActions.MouseMove  # different names
+
+                assert action_name in [SupportedActions.Click, SupportedActions.MouseMove]
+                action = ComputerUseAction(
+                    name=action_name,
+                    description=plan_action.description,
+                    parameters=parameters,
+                )
+            case PlanActionType.Drag:
+                assert grounding_result.end_position is not None, "End position must be provided for drag action"
+                x, y = grounding_result.position
+                x_end, y_end = grounding_result.end_position
+                x, y = int(x), int(y)
+                x_end, y_end = int(x_end), int(y_end)
+                action = ComputerUseAction(
+                    name=SupportedActions.Drag,
+                    description=plan_action.description,
+                    parameters={"path": [{"x": x, "y": y}, {"x": x_end, "y": y_end}]},
+                )
+            case PlanActionType.Scroll:
+                x, y = grounding_result.position
+                x, y = int(x), int(y)
+                # guess the scroll direction if missing in the plan output
+                if "direction" not in plan_action.parameters:
+                    if "scroll up" in plan_action.description.lower():
+                        scroll_direction = "up"
+                    else:
+                        scroll_direction = "down"
+                else:
+                    scroll_direction = plan_action.parameters["direction"]
+
+                action = ComputerUseAction(
+                    name=SupportedActions.Scroll, description=plan_action.description, parameters={"position": [x, y], "direction": scroll_direction}
+                )
+
+                if "distance" in plan_action.parameters:
+                    match scroll_direction:
+                        case "up":
+                            action.parameters["offset"] = [0, plan_action.parameters["distance"]]
+                        case "down":
+                            action.parameters["offset"] = [0, -plan_action.parameters["distance"]]
+                        case "left":
+                            action.parameters["offset"] = [plan_action.parameters["distance"], 0]
+                        case "right":
+                            action.parameters["offset"] = [-plan_action.parameters["distance"], 0]
+            case PlanActionType.Type:
+                action = ComputerUseAction(
+                    name=SupportedActions.TypeInto,
+                    description=plan_action.description,
+                    parameters={"value": plan_action.parameters["text"]},
+                )
+
+        return action
+
+    async def predict(
+        self, state: State, execution_state: ExecutionState, max_retries: int = 0, planer_output: PlannerOutput | None = None
+    ) -> tuple[dict, dict]:
+        execute_planning = True
+        is_planning_fixed = planer_output is not None
+        execution_count = 0
+        execution_state.execution_info.responses = []
+        while execute_planning:
+            try:
+                execution_count += 1
+                if execution_state.execution_info.current_response is not None:
+                    execution_state.execution_info.responses.append(execution_state.execution_info.current_response)
+                execution_state.execution_info.current_response = utils.RawAgentResponse()
+                if not is_planning_fixed:
+                    planer_output = await self.planner.predict(state, execution_state)
+                plan_action = planer_output.plan_action
+
+                step = await self.process_plan_and_ground(planer_output, state, execution_state, retry_number=max_retries)
+                execute_planning = False
+            except utils.GroundingOutputValidationException as e:
+                execution_state.execution_info.current_response.grounding_error = e
+                if is_planning_fixed or execution_count > max_retries:
+                    raise ValueError(f"Grounding error with fixed plan: {e.message}, element description: {e.element_description}")
+
+        # save additional data for history
+        assert step is not None
+        assert step.additional_parameters is not None
+        step.additional_parameters["thought"] = planer_output.thought
+        step.additional_parameters["review"] = planer_output.review
+        step.additional_parameters.update(planer_output.additional_sections)
+        step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())
+
+        history_image = state.image_base64
+        previous_steps_parameters = {
+            "max_chat_history_messages": 1000,
+            "max_chat_history_images": 1,
+            "image": history_image,
+        }
+        agent_response = {"step": step.to_response_dict(), "previous_steps_parameters": previous_steps_parameters}
+
+        return agent_response
+
+    async def process_plan_and_ground(
+        self, planer_output: PlannerOutput, state: State, execution_state: ExecutionState, retry_number: int = 0
+    ) -> ComputerUseStep:
+        plan_action = planer_output.plan_action
+        action: ComputerUseAction | None = None
+        step: ComputerUseStep | None = None
+
+        match plan_action.action_type:
            case PlanActionType.ExtractData:
                # return a step with no action, just to store the extracted data
                step = ComputerUseStep(
@@ -164,35 +189,29 @@ class UiPathComputerUseV1(object):
                | PlanActionType.Scroll
                | PlanActionType.Drag
                | PlanActionType.DoubleClick
+                | PlanActionType.TripleClick
                | PlanActionType.RightClick
            ):
                if plan_action.action_type != PlanActionType.Drag:
+                    element_description = plan_action.parameters.get("element_description", None)
                    grounding_result = await self.executor.predict(
                        state.image_base64,
                        plan_action.description,
                        action=plan_action.action_type,
+                        element_description=element_description
                    )
                else:
-                    grounding_result = await self.executor.predict(
-                        state.image_base64,
-                        plan_action.parameters["start_description"],
-                        action=plan_action.action_type,
-                    )
-                    grounding_result_end = await self.executor.predict(
-                        state.image_base64,
-                        plan_action.parameters["end_description"],
-                        action=plan_action.action_type,
-                    )
-                    grounding_result.end_position = grounding_result_end.position
-                x, y = grounding_result.position
-                action = self.process_grounding(plan_action, grounding_result, x, y)
-            case PlanActionType.Type:
-                action = ComputerUseAction(
-                    name=SupportedActions.TypeInto,
-                    description=plan_action.description,
-                    parameters={"value": plan_action.parameters["text"]},
-                )
-
+                    start_description = plan_action.parameters.get("start_description", None)
+                    end_description = plan_action.parameters.get("end_description", None)
+                    drag_entire_description = plan_action.description
+                    drag_start_description = f"Drag Start point:{start_description}. [Full Drag Description:{drag_entire_description}]"
+                    drag_end_description = f"Drag End point:{end_description}. [Full Drag Description:{drag_entire_description}]"
+                    grounding_result = await self.executor.predict(state.image_base64, drag_start_description, action=plan_action.action_type)
+                    grounding_result_end = await self.executor.predict(state.image_base64, drag_end_description, action=plan_action.action_type)
+                    grounding_result.end_position = grounding_result_end.get_point_location()
+                action = self.wrap_to_computer_use_action(plan_action, grounding_result)
+            case _:
+                action = self.wrap_to_computer_use_action(plan_action, grounding_result=None)
        if step is None:
            assert action is not None
            step = ComputerUseStep(
@@ -202,22 +221,4 @@ class UiPathComputerUseV1(object):
                thought=planer_output.thought,
            )

-        # save additional data for history
-        assert step.additional_parameters is not None
-        step.additional_parameters["thought"] = planer_output.thought
-        step.additional_parameters["review"] = planer_output.review
-        step.additional_parameters.update(planer_output.additional_sections)
-        step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())
-
-        history_image = state.image_base64
-        previous_steps_parameters = {
-            "max_chat_history_messages": 1000,
-            "max_chat_history_images": self.planner.number_history_steps_with_images,
-            "image": history_image,
-        }
-        agent_response = {
-            "step": step.to_response_dict(),
-            "previous_steps_parameters": previous_steps_parameters,
-        }
-
-        return agent_response
+        return step