Files
sci-gui-agent-benchmark/mm_agents/uipath_agent.py
alexandruilie7 5463d3bb89 uipath v2 (#413)
* submission v2

* small updates
2026-01-09 08:47:20 +08:00

232 lines
8.1 KiB
Python

import base64
import json
from typing import Dict, List
import re
import asyncio
import logging
from mm_agents.uipath.agent import UiPathComputerUseV1
def parse_actions_from_string(input_string):
    """Parse a response string into a list of actions.

    The following formats are tried in order; the first one that applies
    decides the result:

    1. A bare control token ``WAIT``, ``DONE`` or ``FAIL`` (surrounding
       whitespace is ignored), returned as a one-element list.
    2. One or more fenced code blocks tagged ``json``, each holding one
       JSON value.
    3. One or more untagged fenced code blocks, each holding one JSON value.
    4. The whole string as a single JSON value.

    Args:
        input_string: Raw response text.

    Returns:
        A list with one entry per control token / fenced block, or a
        one-element list wrapping the value decoded from the whole string.

    Raises:
        ValueError: If the selected content is not valid JSON. Earlier
            versions returned an error *string* for malformed fenced
            blocks, which callers could mistake for a sequence of actions.
    """
    stripped = input_string.strip()
    if stripped in ("WAIT", "DONE", "FAIL"):
        return [stripped]

    # Tagged blocks take precedence; untagged blocks are only considered
    # when no tagged block is present.
    fence_patterns = (r"```json\s+(.*?)\s+```", r"```\s+(.*?)\s+```")
    for pattern in fence_patterns:
        matches = re.findall(pattern, input_string, re.DOTALL)
        if not matches:
            continue
        try:
            return [json.loads(match) for match in matches]
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON: {e}") from e

    # No fenced block at all: the whole response must be JSON.
    try:
        return [json.loads(input_string)]
    except json.JSONDecodeError as e:
        raise ValueError("Invalid response format: " + input_string) from e
# Lower-cased incoming key name -> key name emitted to the environment.
# NOTE(review): the target names look like pyautogui key names — confirm.
_KEY_ALIASES = {
    "space": " ",
    "back": "backspace",
    "super": "win",
    "arrowdown": "down",
    "arrowup": "up",
    "arrowright": "right",
    "arrowleft": "left",
    # Misspelling that earlier versions matched instead of "arrowleft";
    # kept so any input relying on it still maps the same way.
    "arrowrleft": "left",
}


def map_key(key):
    """Normalize a key name.

    The name is lower-cased and, when it is a known alias (for example
    ``"Space"``, ``"Back"``, ``"Super"`` or ``"ArrowLeft"``), replaced by
    its mapped name. Any other name is returned lower-cased and otherwise
    unchanged.

    Args:
        key: Key name as a string, in any letter case.

    Returns:
        The normalized key name.
    """
    key = key.lower()
    return _KEY_ALIASES.get(key, key)
def map_uipath_agent_actions_to_osworld(actions):
results = []
def handle_click(params):
x, y = tuple(params["position"])
if "button" in params:
if params["button"] == "right":
return {"action_type": "RIGHT_CLICK", "x": x, "y": y}
elif params["button"] == "left":
return {"action_type": "LEFT_CLICK", "x": x, "y": y}
else:
raise ValueError(f"Unknown click button: {params['button']}")
elif "click_type" in params:
if params["click_type"] == "double":
return {"action_type": "DOUBLE_CLICK", "x": x, "y": y}
elif params["click_type"] == "triple":
return {"action_type": "CLICK", "x": x, "y": y, "num_clicks": 3}
else:
raise ValueError(f"Unknown click type: {params['click_type']}")
else:
return {"action_type": "CLICK", "x": x, "y": y}
def handle_keypress(params):
keys = [map_key(k) for k in params["keys"]]
if len(keys) == 1:
return {"action_type": "PRESS", "key": keys[0]}
return {"action_type": "HOTKEY", "keys": keys}
def handle_key_event(params, event_type):
key = map_key(params["keys"][0])
return {"action_type": event_type, "key": key}
for action in actions:
method = action["method_type"].lower()
params = action["parameters"]
match method:
case "click":
result = handle_click(params)
case "type_into":
result = {"action_type": "TYPING", "text": params["value"]}
case "wait_load_completed":
result = "WAIT"
case "keypress":
result = handle_keypress(params)
case "keydown":
result = handle_key_event(params, "KEY_DOWN")
case "keypup":
result = handle_key_event(params, "KEY_UP")
case "finish":
status_map = {"failure": "FAIL", "success": "DONE"}
result = status_map.get(params.get("status"), "DONE")
case "scroll":
x, y = tuple(params["position"])
if "offset" in params:
dx, dy = tuple(params["offset"])
else:
dy = 5 if params["direction"] == "up" else -5
dx = 5 if params["direction"] == "left" else -5
result = [
{"action_type": "MOVE_TO", "x": x, "y": y},
{"action_type": "SCROLL", "dx": dx, "dy": dy},
]
case "mouse_move":
x, y = tuple(params["position"])
result = {"action_type": "MOVE_TO", "x": x, "y": y}
case "drag":
path = params["path"]
x1, y1 = path[0]["x"], path[0]["y"]
x2, y2 = path[1]["x"], path[1]["y"]
result = [
{"action_type": "MOVE_TO", "x": x1, "y": y1},
{"action_type": "DRAG_TO", "x": x2, "y": y2},
]
case _:
raise ValueError(f"Unknown method type: {method}")
results.append(result)
return json.dumps(results)
class UipathBaseAgent:
    """Agent that asks the UiPath computer-use model for the next step.

    Each call to :meth:`predict` sends the current screenshot, the task text
    and the accumulated step history to the model, converts the returned
    UiPath actions with ``map_uipath_agent_actions_to_osworld`` and records
    the step so it is included in the next request.
    """

    def __init__(
        self,
        platform="ubuntu",
        model="gpt-5-mini-2025-08-07",
        action_space="computer_13",
        observation_type="screenshot",
        client_password="password",
    ):
        """Store the configuration and start with empty histories.

        Args:
            platform: Stored on the instance; not read elsewhere in this class.
            model: Stored on the instance; requests use
                ``args.uipath_model_name`` instead.
            action_space: Only ``"computer_13"`` is accepted by
                :meth:`parse_actions`.
            observation_type: Only ``"screenshot"`` is accepted by
                :meth:`parse_actions`.
            client_password: Sudo password mentioned to the model in the
                task message.
        """
        self.platform = platform
        self.model = model
        self.action_space = action_space
        self.observation_type = observation_type
        self.client_password = client_password
        self.uipath_computer_use_model = UiPathComputerUseV1()
        self.thoughts = []
        self.actions = []
        self.observations = []
        # Steps already taken, replayed to the model as "previousSteps".
        self.uipath_hist = []

    def update_history(self, rsp, img_base64):
        """Append the step from ``rsp`` and its screenshot to the history.

        Args:
            rsp: Model response; ``rsp["step"]`` must provide ``"actions"``,
                ``"description"`` and ``"additional_parameters"``.
            img_base64: Base64 text of the screenshot the step was based on.
        """
        step = rsp["step"]
        self.uipath_hist.append(
            {
                "actions": step["actions"],
                "description": step["description"],
                "additional_parameters": step["additional_parameters"],
                "image": img_base64,
            }
        )

    def predict(self, instruction: str, obs: Dict, args, step_idx) -> tuple:
        """Request the next step from the model and map it to actions.

        Args:
            instruction: Task description.
            obs: Observation; ``obs["screenshot"]`` holds the raw image bytes.
            args: Object providing ``max_steps`` and ``uipath_model_name``.
            step_idx: Zero-based index of the current step.

        Returns:
            A ``(step, actions)`` pair: the ``"step"`` entry of the model
            response and a flat list of parsed actions. The list is empty
            when the mapped actions could not be parsed.

        Raises:
            ValueError: If the model returns an action that
                ``map_uipath_agent_actions_to_osworld`` does not recognize.
        """
        if step_idx >= args.max_steps - 1:
            # Last allowed step: ask the model to conclude instead of acting.
            message = (
                instruction
                + "You have reached the final step of the process.\n"
                "At this point, no further actions can be taken - it may therefore be impossible to complete the task successfully.\n"
                "Conclude by returning a finish action with success or failure, depending on what can be determined from the current state."
            )
        else:
            # Use the configured password rather than a hard-coded one; the
            # text is unchanged for the default client_password.
            message = (
                instruction
                + f"The sudo password is {self.client_password}, if needed."
            )

        img_base64 = base64.b64encode(obs["screenshot"]).decode("utf-8")
        payload = {
            "previousSteps": self.uipath_hist,
            "userTask": message,
            "image": img_base64,
            "model_name": args.uipath_model_name,
        }
        rsp = asyncio.run(
            self.uipath_computer_use_model.predict_request(
                payload, args.uipath_model_name
            )
        )
        self.update_history(rsp, img_base64)

        uipath_actions = map_uipath_agent_actions_to_osworld(rsp["step"]["actions"])
        try:
            actions = self.parse_actions(uipath_actions)
            self.thoughts.append(rsp)
        except ValueError as e:
            print("Failed to parse action from response", e)
            # An empty list (not None) keeps the flattening below and any
            # caller that iterates the result from failing with TypeError.
            actions = []
            self.thoughts.append("")

        # Scroll and drag map to lists of actions, and the parser wraps the
        # decoded list once more, so flatten until the first item is an action.
        while actions and isinstance(actions[0], list):
            actions = [
                action for multi_action in actions for action in multi_action
            ]
        return rsp["step"], actions

    def parse_actions(self, response: str, masks=None):
        """Parse a mapped-actions string and record the result.

        Args:
            response: JSON string produced by
                ``map_uipath_agent_actions_to_osworld``.
            masks: Unused.

        Returns:
            The list returned by ``parse_actions_from_string``; it is also
            appended to ``self.actions``.

        Raises:
            ValueError: If the configured observation type or action space
                is not supported, or the response cannot be parsed.
        """
        if self.observation_type not in ["screenshot"]:
            raise ValueError("Invalid observation type: " + self.observation_type)
        if self.action_space != "computer_13":
            raise ValueError("Invalid action space: " + self.action_space)
        actions = parse_actions_from_string(response)
        self.actions.append(actions)
        return actions

    def reset(self, _logger=None):
        """Clear all histories and (re)bind the module-level ``logger``.

        Args:
            _logger: Logger to install; when ``None`` the
                ``"desktopenv.agent"`` logger is used.
        """
        global logger
        logger = (
            _logger if _logger is not None else logging.getLogger("desktopenv.agent")
        )
        self.thoughts = []
        self.actions = []
        self.observations = []
        self.uipath_hist = []