import os
import subprocess
import time
import uuid
from enum import Enum
from typing import List, Literal, Optional, Tuple

import gymnasium as gym
import numpy as np
from fabric import Connection
from gymnasium import spaces
from PIL import Image

from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
    PythonKeyboardController
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
    PythonMouseController


class Action(Enum):
    """Kinds of input events that ``DesktopEnv.step`` knows how to dispatch."""

    CLICK = 0
    MOUSE_DOWN = 1
    MOUSE_UP = 2
    MOUSE_MOVE = 3
    KEY = 4
    KEY_DOWN = 5
    KEY_UP = 6
    TYPE = 7


VM_TYPE = Literal['ubuntu', 'windows']


class DesktopEnv(gym.Env):
    """DesktopEnv with OpenAI Gym interface.

    Drives a virtual machine through the ``vmrun`` command line tool and forwards
    mouse/keyboard actions to it through the controllers created in
    ``_create_controllers``. Observations are paths to screenshot PNG files.
    """

    def __init__(
            self,
            path_to_vm: str,
            username: str,
            password: Optional[str] = None,
            host: str = "192.168.7.128:5000",
            snapshot_path: str = "base",
            vm_os: VM_TYPE = "ubuntu"
    ):
        """Start the VM if needed, create the controllers and define the spaces.

        Args:
            path_to_vm: The path to the vmx file of your vm.
            username: Guest user name, passed to ``vmrun`` and to the SSH connection.
            password: Guest password; when falsy, ``vmrun`` is invoked without ``-gp``.
            host: Address handed to the SSH connection ("ubuntu") or used as the
                HTTP server of the Python controllers ("windows").
            snapshot_path: Snapshot name used by ``reset`` and ``_save_state``.
            vm_os: Selects which controller implementations are created.
        """
        # The path to the vmx file of your vm
        self.path_to_vm = path_to_vm

        # username and password for your vm
        self.username = username
        self.password = password

        self.host = host
        self.snapshot_path = snapshot_path  # todo: handling the logic of snapshot directory

        # Initialize emulator
        print("Initializing...")
        self._start_emulator()

        # set up controllers
        self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)

        # Get the screen size
        self.screen_width, self.screen_height = self._get_screensize()

        # Define the action and observation space
        self.action_space = spaces.Dict({
            "action_type": spaces.Discrete(len(Action)),
            "click_type": spaces.Discrete(len(MouseClick)),
            "x": spaces.Discrete(self.screen_width),
            "y": spaces.Discrete(self.screen_height),
            "key": spaces.MultiDiscrete([128] * 10),  # max 10 characters, ASCII
            "text": spaces.MultiDiscrete([128] * 10)  # max 10 characters, ASCII
        })

        # NOTE(review): the observations returned by reset/step/render are screenshot
        # file paths, not arrays, and the shape below is (width, height, 3) -- confirm
        # whether (height, width, 3) was intended before relying on this space.
        self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
                                            dtype=np.uint8)

        # Additional setup
        self.metadata = {'render.modes': ['rgb_array']}

    def _get_screensize(self) -> Tuple[int, int]:
        """Return ``(width, height)`` of a freshly captured screenshot."""
        screenshot_path = self._get_obs()
        # Context manager so the underlying file handle is released right away.
        with Image.open(screenshot_path) as img:
            return img.size

    def _create_controllers(self, vm_os: VM_TYPE) -> Tuple[AbstractMouseController, AbstractKeyboardController]:
        """Build the mouse and keyboard controllers matching ``vm_os``.

        Raises:
            NotImplementedError: If ``vm_os`` is neither "ubuntu" nor "windows".
        """
        if vm_os == "ubuntu":
            ssh_connection = Connection(host=self.host, user=self.username,
                                        connect_kwargs={"password": self.password})
            mouse_controller = XDoToolMouseController(ssh_connection)
            keyboard_controller = XDoToolKeyboardController(ssh_connection)
        elif vm_os == "windows":
            mouse_controller = PythonMouseController(http_server=self.host)
            keyboard_controller = PythonKeyboardController(http_server=self.host)
        else:
            raise NotImplementedError(vm_os)

        return mouse_controller, keyboard_controller

    def _start_emulator(self):
        """Block until ``vmrun list`` reports the VM, starting it when it is absent.

        Raises:
            FileNotFoundError: If the ``vmrun`` executable cannot be found.
            Exception: If the ``vmrun start`` command fails (see ``_execute_command``).
        """
        # Drop only a literal "~/" prefix so the remainder can be matched as a
        # substring of the paths printed by vmrun. str.lstrip("~/") would strip any
        # run of '~' and '/' characters instead of that prefix.
        vm_path = self.path_to_vm
        if vm_path.startswith("~/"):
            vm_path = vm_path[2:]

        while True:
            try:
                # Argument list instead of a shell string: no shell is needed here.
                output = subprocess.check_output(["vmrun", "-T", "ws", "list"], stderr=subprocess.STDOUT)
                output = output.decode()
                if vm_path in output:
                    print("VM is running.")
                    break
                else:
                    print("Starting VM...")
                    self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
                    time.sleep(10)
            except subprocess.CalledProcessError as e:
                print(f"Error executing command: {e.output.decode().strip()}")
                # Pause before retrying so a persistent failure does not busy-loop.
                time.sleep(1)

    def _execute_command(self, command: List[str]) -> None:
        """Run ``command`` and raise if it exits with a non-zero status.

        Raises:
            Exception: With the command's stdout and stderr (wrapped in ANSI red)
                when the return code is non-zero.
            subprocess.TimeoutExpired: If the command runs longer than 60 seconds.
        """
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
        if result.returncode != 0:
            raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")

    def _save_state(self):
        """Take a snapshot of the VM named ``self.snapshot_path``."""
        # "ws" and "snapshot" must be separate arguments; without the comma the
        # adjacent literals concatenate into the single token "wssnapshot".
        self._execute_command(["vmrun", "-T", "ws", "snapshot", self.path_to_vm, self.snapshot_path])

    def _get_screenshot(self) -> str:
        """Capture the guest screen into ``tmp/<uuid>/screenshot.png`` and return that path."""
        random_uuid = str(uuid.uuid4())
        os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
        image_path = os.path.join("tmp", random_uuid, "screenshot.png")

        # The guest password flag is only passed when a password was configured.
        credentials = ["-gu", self.username]
        if self.password:
            credentials += ["-gp", self.password]

        self._execute_command(
            ["vmrun", "-T", "ws"] + credentials + ["captureScreen", self.path_to_vm, image_path])

        return image_path

    def _get_obs(self) -> str:
        """Capture a screenshot, draw the cursor onto it and return the file path."""
        screenshot_image_path = self._get_screenshot()
        self._add_cursor(screenshot_image_path)
        return screenshot_image_path

    def _add_cursor(self, img_path: str):
        """Paste the half-size cursor image onto the screenshot at the mouse position.

        The file at ``img_path`` is overwritten in place.
        """
        x, y = self.mouse_controller.get_mouse()

        with Image.open("./desktop_env/assets/cursor.png") as cursor_file:
            cursor_image = cursor_file.resize((cursor_file.width // 2, cursor_file.height // 2))

        # Work on an in-memory copy so the file handle is closed before the same
        # path is written again.
        with Image.open(img_path) as screenshot_file:
            screenshot = screenshot_file.copy()

        # The cursor doubles as its own mask.
        screenshot.paste(cursor_image, (x, y), cursor_image)
        screenshot.save(img_path)

    def reset(self):
        """Revert the VM to ``self.snapshot_path``, restart it and return a new observation."""
        print("Resetting environment...")

        print("Reverting to snapshot to {}...".format(self.snapshot_path))
        self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
        time.sleep(5)

        print("Starting emulator...")
        self._start_emulator()
        print("Emulator started.")

        observation = self._get_obs()
        return observation

    def _mouse_button_event(self, click, event: str) -> None:
        """Forward one mouse event to the mouse controller.

        ``event`` is "click", "down" or "up". Wheel values always scroll, whatever
        the event; any other unrecognised ``click`` value is ignored.
        """
        if click == MouseClick.WHEEL_UP:
            self.mouse_controller.scroll_up()
            return
        if click == MouseClick.WHEEL_DOWN:
            self.mouse_controller.scroll_down()
            return

        if click == MouseClick.LEFT:
            button = "left"
        elif click == MouseClick.MIDDLE:
            button = "middle"
        elif click == MouseClick.RIGHT:
            button = "right"
        else:
            return

        # Resolves to e.g. left_click / middle_down / right_up on the controller.
        getattr(self.mouse_controller, "{}_{}".format(button, event))()

    def step(self, action):
        """Apply one action (or a list of actions) and return the resulting state.

        Args:
            action: A dict with an 'action_type' entry plus the entries that type
                needs ('click_type', 'x'/'y', 'key' or 'text'), or a list of such
                dicts which are applied in order.

        Returns:
            A tuple ``(observation, reward, done, info)``. ``observation`` is the
            path of a new screenshot, ``reward`` is always 0, ``info`` is an empty
            dict and ``done`` is True only when the action has no 'action_type'.
            For a list, the tuple of the last action is returned; for an empty
            list, the current observation is returned without applying anything.
        """
        if isinstance(action, list):
            if not action:
                # Nothing to apply: report the current state instead of failing on
                # unbound result variables.
                return self._get_obs(), 0, False, {}
            for a in action:
                observation, reward, done, info = self.step(a)
            return observation, reward, done, info

        # todo: handle the case when the action is not a single action
        try:
            action_type = Action(action['action_type'])
        except KeyError:
            done = True
            return self._get_obs(), 0, done, {}

        if action_type == Action.CLICK:
            self._mouse_button_event(MouseClick(action['click_type']), "click")
        elif action_type == Action.MOUSE_DOWN:
            self._mouse_button_event(MouseClick(action['click_type']), "down")
        elif action_type == Action.MOUSE_UP:
            self._mouse_button_event(MouseClick(action['click_type']), "up")
        elif action_type == Action.MOUSE_MOVE:
            self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
        elif action_type == Action.KEY:
            self.keyboard_controller.key(action['key'])
        elif action_type == Action.KEY_DOWN:
            self.keyboard_controller.key_down(action['key'])
        elif action_type == Action.KEY_UP:
            self.keyboard_controller.key_up(action['key'])
        elif action_type == Action.TYPE:
            for key in action['text']:
                if key == "\r" or key == "\n":
                    self.keyboard_controller.key("enter")
                else:
                    self.keyboard_controller.key(key)
                # sleep for 0.05 seconds with some random noise; clamp at zero
                # because time.sleep rejects negative values
                time.sleep(max(0.0, 0.05 + np.random.normal(0, 0.01)))

        # Capture new state
        observation = self._get_obs()
        reward = 0  # Define reward calculation
        done = False  # Define episode termination condition
        info = {}
        return observation, reward, done, info

    def render(self, mode='rgb_array'):
        """Return a new observation for mode 'rgb_array'.

        Raises:
            ValueError: For any other ``mode``.
        """
        if mode == 'rgb_array':
            return self._get_obs()
        else:
            raise ValueError('Unsupported render mode: {}'.format(mode))

    def close(self):
        """Stop the VM."""
        # Pass the host type like every other vmrun invocation in this class.
        self._execute_command(["vmrun", "-T", "ws", "stop", self.path_to_vm])