import base64 import json from typing import Dict, List import re import asyncio import logging from mm_agents.uipath.agent import UiPathComputerUseV1 def parse_actions_from_string(input_string): if input_string.strip() in ["WAIT", "DONE", "FAIL"]: return [input_string.strip()] actions = [] matches = re.findall(r"```json\s+(.*?)\s+```", input_string, re.DOTALL) if matches: try: for match in matches: action_dict = json.loads(match) actions.append(action_dict) return actions except json.JSONDecodeError as e: return f"Failed to parse JSON: {e}" else: matches = re.findall(r"```\s+(.*?)\s+```", input_string, re.DOTALL) if matches: try: for match in matches: action_dict = json.loads(match) actions.append(action_dict) return actions except json.JSONDecodeError as e: return f"Failed to parse JSON: {e}" else: try: action_dict = json.loads(input_string) return [action_dict] except json.JSONDecodeError: raise ValueError("Invalid response format: " + input_string) def map_key(key): key = key.lower() if key == "space": key = " " elif key == "back": key = "backspace" elif key == "super": key = "win" elif key == "arrowdown": key = "down" elif key == "arrowup": key = "up" elif key == "arrowright": key = "right" elif key == "arrowrleft": key = "left" return key def map_uipath_agent_actions_to_osworld(actions): results = [] def handle_click(params): x, y = tuple(params["position"]) if "button" in params: if params["button"] == "right": return {"action_type": "RIGHT_CLICK", "x": x, "y": y} elif params["button"] == "left": return {"action_type": "LEFT_CLICK", "x": x, "y": y} else: raise ValueError(f"Unknown click button: {params['button']}") elif "click_type" in params: if params["click_type"] == "double": return {"action_type": "DOUBLE_CLICK", "x": x, "y": y} elif params["click_type"] == "triple": return {"action_type": "CLICK", "x": x, "y": y, "num_clicks": 3} else: raise ValueError(f"Unknown click type: {params['click_type']}") else: return {"action_type": "CLICK", "x": x, "y": y} def handle_keypress(params): keys = [map_key(k) for k in params["keys"]] if len(keys) == 1: return {"action_type": "PRESS", "key": keys[0]} return {"action_type": "HOTKEY", "keys": keys} def handle_key_event(params, event_type): key = map_key(params["keys"][0]) return {"action_type": event_type, "key": key} for action in actions: method = action["method_type"].lower() params = action["parameters"] match method: case "click": result = handle_click(params) case "type_into": result = {"action_type": "TYPING", "text": params["value"]} case "wait_load_completed": result = "WAIT" case "keypress": result = handle_keypress(params) case "keydown": result = handle_key_event(params, "KEY_DOWN") case "keypup": result = handle_key_event(params, "KEY_UP") case "finish": status_map = {"failure": "FAIL", "success": "DONE"} result = status_map.get(params.get("status"), "DONE") case "scroll": x, y = tuple(params["position"]) if "offset" in params: dx, dy = tuple(params["offset"]) else: dy = 5 if params["direction"] == "up" else -5 dx = 5 if params["direction"] == "left" else -5 result = [ {"action_type": "MOVE_TO", "x": x, "y": y}, {"action_type": "SCROLL", "dx": dx, "dy": dy}, ] case "mouse_move": x, y = tuple(params["position"]) result = {"action_type": "MOVE_TO", "x": x, "y": y} case "drag": path = params["path"] x1, y1 = path[0]["x"], path[0]["y"] x2, y2 = path[1]["x"], path[1]["y"] result = [ {"action_type": "MOVE_TO", "x": x1, "y": y1}, {"action_type": "DRAG_TO", "x": x2, "y": y2}, ] case _: raise ValueError(f"Unknown method type: {method}") results.append(result) return json.dumps(results) class UipathBaseAgent: def __init__( self, platform="ubuntu", model="gpt-5-mini-2025-08-07", action_space="computer_13", observation_type="screenshot", client_password="password", ): self.platform = platform self.model = model self.action_space = action_space self.observation_type = observation_type self.client_password = client_password self.uipath_computer_use_model = UiPathComputerUseV1() self.thoughts = [] self.actions = [] self.observations = [] self.uipath_hist = [] def update_history(self, rsp, img_base64): self.uipath_hist.append( { "actions": rsp["step"]["actions"], "description": rsp["step"]["description"], "additional_parameters": rsp['step']['additional_parameters'], "image": img_base64, } ) def predict(self, instruction: str, obs: Dict, args, step_idx) -> List: if step_idx >= args.max_steps - 1: message = ( instruction + """You have reached the final step of the process. At this point, no further actions can be taken - it may therefore be impossible to complete the task successfully. Conclude by returning a finish action with success or failure, depending on what can be determined from the current state.""" ) else: message = instruction + "The sudo password is password, if needed." img_base64 = base64.b64encode(obs["screenshot"]).decode("utf-8") payload = { "previousSteps": self.uipath_hist, "userTask": message, "image": img_base64, "model_name": args.uipath_model_name, } rsp = asyncio.run( self.uipath_computer_use_model.predict_request( payload, args.uipath_model_name ) ) self.update_history(rsp, img_base64) uipath_actions = map_uipath_agent_actions_to_osworld(rsp["step"]["actions"]) try: actions = self.parse_actions(uipath_actions) self.thoughts.append(rsp) except ValueError as e: print("Failed to parse action from response", e) actions = None self.thoughts.append("") if len(actions) != 0: while actions and isinstance(actions[0], list): actions = [ action for multi_action in actions for action in multi_action ] return rsp["step"], actions def parse_actions(self, response: str, masks=None): if self.observation_type in ["screenshot"]: if self.action_space == "computer_13": actions = parse_actions_from_string(response) else: raise ValueError("Invalid action space: " + self.action_space) self.actions.append(actions) return actions else: raise ValueError("Invalid observation type: " + self.action_space) def reset(self, _logger=None): global logger logger = ( _logger if _logger is not None else logging.getLogger("desktopenv.agent") ) self.thoughts = [] self.actions = [] self.observations = [] self.uipath_hist = []