import json

from mm_agents.uipath.types_utils import (
    ComputerUseAction,
    ComputerUseStep,
    SupportedActions,
    PlanActionType,
    PlanAction,
    key_maps,
    ExecutionState,
    State,
)

import mm_agents.uipath.utils as utils
from mm_agents.uipath.action_planner import ActionPlanner, PlannerOutput
from mm_agents.uipath.grounder_client import GrounderClient

class UiPathComputerUseV1(object):
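    """Two-stage computer-use agent: an ActionPlanner proposes the next high-level action
    for the user task, and a GrounderClient resolves element descriptions on the current
    screenshot into screen coordinates before the action is sent to the environment."""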

    def __init__(self):
        self.planner = ActionPlanner()
        self.executor = GrounderClient()

    async def predict_request(self, request_body: dict, model_name: str) -> dict:
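        """Build the agent State from the incoming request payload and run a single
        plan-and-ground cycle, returning the serialized step for the caller."""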
        previous_steps = request_body["previousSteps"] or []
        state = State(
            task=request_body["userTask"],
            image_base64=request_body["image"],
            previous_steps=list(previous_steps),
        )

        execution_state = ExecutionState(model_name=model_name)
        output = await self.predict(state, execution_state, max_retries=2)
        return output

    def wrap_to_computer_use_action(self, plan_action: PlanAction, grounding_result: utils.GroundingOutput | None) -> ComputerUseAction:
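        """Translate a planner-level PlanAction (plus the grounding result, for actions that
        need screen coordinates) into the ComputerUseAction schema sent to the environment."""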
        match plan_action.action_type:
            case PlanActionType.KeyPress:
                keys = plan_action.parameters["key"].split(" ")
                keys = [key.strip() for key in keys]
                keys = [key_maps.get(key, key) for key in keys]
                action = ComputerUseAction(
                    name=SupportedActions.KeyPress,
                    description=plan_action.description,
                    parameters={"keys": keys},
                )
            case PlanActionType.Wait:
                action = ComputerUseAction(
                    name=SupportedActions.Wait,
                    description=plan_action.description,
                    parameters={},
                )
            case PlanActionType.Click | PlanActionType.DoubleClick | PlanActionType.TripleClick | PlanActionType.MouseMove | PlanActionType.RightClick:
                action_name = plan_action.action_type
                x, y = grounding_result.position
                parameters = {"position": [int(x), int(y)]}

                if plan_action.action_type == PlanActionType.DoubleClick:
                    action_name = SupportedActions.Click
                    parameters["click_type"] = "double"
                elif plan_action.action_type == PlanActionType.TripleClick:
                    action_name = SupportedActions.Click
                    parameters["click_type"] = "triple"
                elif plan_action.action_type == PlanActionType.RightClick:
                    action_name = SupportedActions.Click
                    parameters["button"] = "right"
                elif plan_action.action_type == PlanActionType.MouseMove:
                    action_name = SupportedActions.MouseMove  # MouseMove keeps its own action name; the click variants above collapse to Click

                assert action_name in [SupportedActions.Click, SupportedActions.MouseMove]
                action = ComputerUseAction(
                    name=action_name,
                    description=plan_action.description,
                    parameters=parameters,
                )
            case PlanActionType.Drag:
                assert grounding_result.end_position is not None, "End position must be provided for drag action"
                x, y = grounding_result.position
                x_end, y_end = grounding_result.end_position
                x, y = int(x), int(y)
                x_end, y_end = int(x_end), int(y_end)
                action = ComputerUseAction(
                    name=SupportedActions.Drag,
                    description=plan_action.description,
                    parameters={"path": [{"x": x, "y": y}, {"x": x_end, "y": y_end}]},
                )
            case PlanActionType.Scroll:
                x, y = grounding_result.position
                x, y = int(x), int(y)
                # guess the scroll direction if it is missing from the plan output
                if "direction" not in plan_action.parameters:
                    if "scroll up" in plan_action.description.lower():
                        scroll_direction = "up"
                    else:
                        scroll_direction = "down"
                else:
                    scroll_direction = plan_action.parameters["direction"]

                action = ComputerUseAction(
                    name=SupportedActions.Scroll,
                    description=plan_action.description,
                    parameters={"position": [x, y], "direction": scroll_direction},
                )

                if "distance" in plan_action.parameters:
                    match scroll_direction:
                        case "up":
                            action.parameters["offset"] = [0, plan_action.parameters["distance"]]
                        case "down":
                            action.parameters["offset"] = [0, -plan_action.parameters["distance"]]
                        case "left":
                            action.parameters["offset"] = [plan_action.parameters["distance"], 0]
                        case "right":
                            action.parameters["offset"] = [-plan_action.parameters["distance"], 0]
            case PlanActionType.Type:
                action = ComputerUseAction(
                    name=SupportedActions.TypeInto,
                    description=plan_action.description,
                    parameters={"value": plan_action.parameters["text"]},
                )

        return action

    async def predict(
        self, state: State, execution_state: ExecutionState, max_retries: int = 0, planer_output: PlannerOutput | None = None
    ) -> dict:
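        """Plan the next action and ground it on the current screenshot.

        Planning is retried up to max_retries times when grounding validation fails; if
        planer_output is supplied the plan is fixed and a grounding failure is fatal.
        Returns the serialized step together with the chat-history parameters.
        """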
        execute_planning = True
        is_planning_fixed = planer_output is not None
        execution_count = 0
        execution_state.execution_info.responses = []
        while execute_planning:
            try:
                execution_count += 1
                if execution_state.execution_info.current_response is not None:
                    execution_state.execution_info.responses.append(execution_state.execution_info.current_response)
                execution_state.execution_info.current_response = utils.RawAgentResponse()
                if not is_planning_fixed:
                    planer_output = await self.planner.predict(state, execution_state)
                plan_action = planer_output.plan_action

                step = await self.process_plan_and_ground(planer_output, state, execution_state, retry_number=max_retries)
                execute_planning = False
            except utils.GroundingOutputValidationException as e:
                execution_state.execution_info.current_response.grounding_error = e
                # cannot retry: either the plan is fixed or the retry budget is exhausted
                if is_planning_fixed or execution_count > max_retries:
                    raise ValueError(f"Grounding error: {e.message}, element description: {e.element_description}")

        # save additional data for history
        assert step is not None
        assert step.additional_parameters is not None
        step.additional_parameters["thought"] = planer_output.thought
        step.additional_parameters["review"] = planer_output.review
        step.additional_parameters.update(planer_output.additional_sections)
        step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())

        history_image = state.image_base64
        previous_steps_parameters = {
            "max_chat_history_messages": 1000,
            "max_chat_history_images": 1,
            "image": history_image,
        }
        agent_response = {"step": step.to_response_dict(), "previous_steps_parameters": previous_steps_parameters}

        return agent_response

    async def process_plan_and_ground(
        self, planer_output: PlannerOutput, state: State, execution_state: ExecutionState, retry_number: int = 0
    ) -> ComputerUseStep:
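        """Ground the planned action on the screenshot when it needs coordinates and wrap it
        into a ComputerUseStep; ExtractData produces a step with no actions."""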
        plan_action = planer_output.plan_action
        action: ComputerUseAction | None = None
        step: ComputerUseStep | None = None

        match plan_action.action_type:
            case PlanActionType.ExtractData:
                # return a step with no action, just to store the extracted data
                step = ComputerUseStep(
                    description=plan_action.description,
                    actions=[],
                    additional_parameters={
                        "extracted_data": plan_action.parameters,
                    },
                    thought=planer_output.thought,
                )
            case PlanActionType.Finish:
                action = ComputerUseAction(
                    name=SupportedActions.Finish,
                    description=plan_action.description,
                    parameters=plan_action.parameters,
                )
            case (
                PlanActionType.Click
                | PlanActionType.MouseMove
                | PlanActionType.Scroll
                | PlanActionType.Drag
                | PlanActionType.DoubleClick
                | PlanActionType.TripleClick
                | PlanActionType.RightClick
            ):
                if plan_action.action_type != PlanActionType.Drag:
                    element_description = plan_action.parameters.get("element_description", None)
                    grounding_result = await self.executor.predict(
                        state.image_base64,
                        plan_action.description,
                        action=plan_action.action_type,
                        element_description=element_description,
                    )
                else:
                    # ground the drag start and end points separately, reusing the full description for context
                    start_description = plan_action.parameters.get("start_description", None)
                    end_description = plan_action.parameters.get("end_description", None)
                    drag_entire_description = plan_action.description
                    drag_start_description = f"Drag Start point:{start_description}. [Full Drag Description:{drag_entire_description}]"
                    drag_end_description = f"Drag End point:{end_description}. [Full Drag Description:{drag_entire_description}]"
                    grounding_result = await self.executor.predict(state.image_base64, drag_start_description, action=plan_action.action_type)
                    grounding_result_end = await self.executor.predict(state.image_base64, drag_end_description, action=plan_action.action_type)
                    grounding_result.end_position = grounding_result_end.get_point_location()
                action = self.wrap_to_computer_use_action(plan_action, grounding_result)
            case _:
                # actions that need no grounding (KeyPress, Wait, Type)
                action = self.wrap_to_computer_use_action(plan_action, grounding_result=None)

        if step is None:
            assert action is not None
            step = ComputerUseStep(
                description=plan_action.description,
                actions=[action],
                additional_parameters={},
                thought=planer_output.thought,
            )

        return step
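

# Minimal usage sketch (illustrative only, not part of the agent itself). It assumes the
# planner/grounder backends used by ActionPlanner and GrounderClient are reachable, and
# that the request payload carries the base64 screenshot and user task that
# predict_request expects; the screenshot value and model name below are placeholders.
if __name__ == "__main__":
    import asyncio

    agent = UiPathComputerUseV1()
    request_body = {
        "userTask": "Open the Settings app",
        "image": "<base64-encoded screenshot>",
        "previousSteps": [],
    }
    response = asyncio.run(agent.predict_request(request_body, model_name="<model-name>"))
    print(json.dumps(response, indent=2, default=str))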