diff --git a/README.md b/README.md index e3818b3..cf9e69d 100644 --- a/README.md +++ b/README.md @@ -22,9 +22,16 @@ 2. `rm -rf ~/screenshot.png` 7. Set up python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/jordansissel/xdotool) +## Windows setup guide + +1. Copy and paste the file `windows_server/main.py` to the windows vm +2. Make sure `mouse` and `keyboard` are installed +3. Run the file `pythonw main.py` +4. `ipconfig /all` and find the ip address + ## Road map (Proposed) -- [ ] Explore VMWare, and whether it can be connected and control through mouse package +- [x] Explore VMWare, and whether it can be connected and control through mouse package - [x] Explore Windows and MacOS, whether it can be installed - MacOS is closed source and cannot be legally installed - Windows is available legally and can be installed diff --git a/desktop_env/controllers/keyboard.py b/desktop_env/controllers/keyboard.py index f818f01..7cb79f8 100644 --- a/desktop_env/controllers/keyboard.py +++ b/desktop_env/controllers/keyboard.py @@ -1,9 +1,10 @@ from abc import ABC, abstractmethod from fabric import Connection -from xdotool import XDoToolController +from .xdotool import XDoToolController +from .python import PythonController -class AbstractMouseController(ABC): +class AbstractKeyboardController(ABC): @abstractmethod def type(self, text: str): raise NotImplementedError @@ -12,7 +13,7 @@ class AbstractMouseController(ABC): def key(self, key: str): raise NotImplementedError -class XDoToolKeyboardController(AbstractMouseController, XDoToolController): +class XDoToolKeyboardController(AbstractKeyboardController, XDoToolController): def __init__(self, ssh_connection: Connection): super().__init__(ssh_connection=ssh_connection) @@ -22,16 +23,13 @@ class XDoToolKeyboardController(AbstractMouseController, XDoToolController): def key(self, key: str): self._execute_xdotool_command(f"key {key}") -class PythonKeyboardController(AbstractMouseController): - def __init__(self, ssh_connection: Connection): - self.ssh_connection = ssh_connection - - def _execute_python_command(self, command: list[str]) -> None: - result = self.ssh_connection.run(f"sudo python3 -c 'import keyboard; keyboard.{command}'", hide=True) - return result.stdout.strip() +class PythonKeyboardController(AbstractKeyboardController, PythonController): + def __init__(self, http_server: str): + super().__init__(http_server=http_server) + self.command = "python -c \"import keyboard; {command}\"" def type(self, text: str): - self._execute_python_command(f"write({text})") + self._execute_python_command(self.command.format(command=f"keyboard.write('{text}')")) def key(self, key: str): - self._execute_python_command(f"press_and_release({key})") \ No newline at end of file + self._execute_python_command(self.command.format(command=f"keyboard.press_and_release('{key}')")) \ No newline at end of file diff --git a/desktop_env/controllers/mouse.py b/desktop_env/controllers/mouse.py index f818f01..45961be 100644 --- a/desktop_env/controllers/mouse.py +++ b/desktop_env/controllers/mouse.py @@ -1,37 +1,144 @@ +from enum import Enum + from abc import ABC, abstractmethod from fabric import Connection -from xdotool import XDoToolController +from .xdotool import XDoToolController +from .python import PythonController +class MouseClick(Enum): + LEFT = 1 + MIDDLE = 2 + RIGHT = 3 + WHEEL_UP = 4 + WHEEL_DOWN = 5 class AbstractMouseController(ABC): @abstractmethod - def type(self, text: str): + def mouse_move(self, x: int, y: int): raise NotImplementedError @abstractmethod - def key(self, key: str): + def left_down(self): + raise NotImplementedError + + @abstractmethod + def left_up(self): + raise NotImplementedError + + @abstractmethod + def left_click(self): raise NotImplementedError -class XDoToolKeyboardController(AbstractMouseController, XDoToolController): + @abstractmethod + def middle_down(self): + raise NotImplementedError + + @abstractmethod + def middle_up(self): + raise NotImplementedError + + @abstractmethod + def middle_click(self): + raise NotImplementedError + + @abstractmethod + def right_down(self): + raise NotImplementedError + + @abstractmethod + def right_up(self): + raise NotImplementedError + + @abstractmethod + def right_click(self): + raise NotImplementedError + + @abstractmethod + def scroll_up(self): + raise NotImplementedError + + @abstractmethod + def scroll_down(self): + raise NotImplementedError + + +class XDoToolMouseController(AbstractMouseController, XDoToolController): def __init__(self, ssh_connection: Connection): super().__init__(ssh_connection=ssh_connection) - def type(self, text: str): - self._execute_xdotool_command(f"type {text}") + def mouse_move(self, x: int, y: int): + self._execute_xdotool_command(f"mousemove {x} {y}") - def key(self, key: str): - self._execute_xdotool_command(f"key {key}") - -class PythonKeyboardController(AbstractMouseController): - def __init__(self, ssh_connection: Connection): - self.ssh_connection = ssh_connection + def left_down(self): + self._execute_xdotool_command(f"mousedown 1") - def _execute_python_command(self, command: list[str]) -> None: - result = self.ssh_connection.run(f"sudo python3 -c 'import keyboard; keyboard.{command}'", hide=True) - return result.stdout.strip() + def left_up(self): + self._execute_xdotool_command(f"mouseup 1") - def type(self, text: str): - self._execute_python_command(f"write({text})") + def left_click(self): + self._execute_xdotool_command(f"click 1") - def key(self, key: str): - self._execute_python_command(f"press_and_release({key})") \ No newline at end of file + def middle_down(self): + self._execute_xdotool_command(f"mousedown 2") + + def middle_up(self): + self._execute_xdotool_command(f"mouseup 2") + + def middle_click(self): + self._execute_xdotool_command(f"click 2") + + def right_down(self): + self._execute_xdotool_command(f"mousedown 3") + + def right_up(self): + self._execute_xdotool_command(f"mouseup 3") + + def right_click(self): + self._execute_xdotool_command(f"click 3") + + def scroll_up(self): + self._execute_xdotool_command(f"click 4") + + def scroll_down(self): + self._execute_xdotool_command(f"click 5") + +class PythonMouseController(AbstractMouseController, PythonController): + def __init__(self, http_server: str): + super().__init__(http_server=http_server) + self.command = "python -c \"import mouse; {command}\"" + + def mouse_move(self, x: int, y: int): + self._execute_python_command(self.command.format(command=f"mouse.move({x}, {y})")) + + def left_down(self): + self._execute_python_command(self.command.format(command="mouse.press(button='left')")) + + def left_up(self): + self._execute_python_command(self.command.format(command="mouse.release(button='left')")) + + def left_click(self): + self._execute_python_command(self.command.format(command="mouse.click(button='left')")) + + def middle_down(self): + self._execute_python_command(self.command.format(command="mouse.press(button='middle')")) + + def middle_up(self): + self._execute_python_command(self.command.format(command="mouse.release(button='middle')")) + + def middle_click(self): + self._execute_python_command(self.command.format(command="mouse.click(button='middle')")) + + def right_down(self): + self._execute_python_command(self.command.format(command="mouse.press(button='right')")) + + def right_up(self): + self._execute_python_command(self.command.format(command="mouse.release(button='right')")) + + def right_click(self): + self._execute_python_command(self.command.format(command="mouse.click(button='right')")) + + def scroll_up(self): + self._execute_python_command(self.command.format(command="mouse.wheel(10)")) + + def scroll_down(self): + self._execute_python_command(self.command.format(command="mouse.wheel(-10)")) \ No newline at end of file diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py new file mode 100644 index 0000000..6b5e627 --- /dev/null +++ b/desktop_env/controllers/python.py @@ -0,0 +1,34 @@ +import requests +import json + +class PythonController: + def __init__(self, http_server: str): + self.http_server = http_server + + def _execute_python_command(self, command: str) -> None: + payload = json.dumps({ + "command": command + }) + headers = { + 'Content-Type': 'application/json' + } + + try: + response = requests.post(self.http_server + "/execute", headers=headers, data=payload) + if response.status_code == 200: + print("Command executed successfully:", response.text) + else: + print("Failed to execute command. Status code:", response.status_code) + except requests.exceptions.RequestException as e: + print("An error occurred while trying to execute the command:", e) + +# example usage +if __name__ == '__main__': + # replace with your actual server URL of the vm + server_url = "http://192.168.7.129:5000" + controller = PythonController(server_url) + + # example commands + python_command = "python -c \"import keyboard; keyboard.write('hello world')\"" + python_command = "python -c \"import mouse; mouse.move(100,100);mouse.right_click()\"" + controller._execute_python_command(python_command) diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index bcb67d3..a08c919 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -1,4 +1,5 @@ from enum import Enum +from typing import Literal import subprocess from fabric import Connection import time @@ -8,6 +9,9 @@ from gymnasium import spaces import numpy as np from PIL import Image +from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController +from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController + class Action(Enum): CLICK = 0 MOUSE_DOWN = 1 @@ -16,24 +20,18 @@ class Action(Enum): KEY = 4 TYPE = 5 -class MouseClick(Enum): - LEFT = 1 - MIDDLE = 2 - RIGHT = 3 - WHEEL_UP = 4 - WHEEL_DOWN = 5 +VM_TYPE = Literal['ubuntu', 'windows'] class DesktopEnv(gym.Env): """DesktopEnv with OpenAI Gym interface.""" def __init__(self, path_to_vm: str, username: str, password: str, - host: str, snapshot_path: str = "snapshot"): + host: str, snapshot_path: str = "snapshot", vm_os: VM_TYPE = "ubuntu"): self.path_to_vm = path_to_vm self.username = username self.password = password self.host = host self.snapshot_path = snapshot_path - self.ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": password}) self.screen_width = 800 self.screen_height = 800 @@ -54,6 +52,22 @@ class DesktopEnv(gym.Env): self._start_emulator() self._wait_for_emulator_load() + # set up controllers + self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os) + + def _create_controllers(self, vm_os: VM_TYPE) -> tuple[AbstractMouseController, AbstractKeyboardController]: + if vm_os == "ubuntu": + ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password}) + mouse_controller = XDoToolMouseController(ssh_connection) + keyboard_controller = XDoToolKeyboardController(ssh_connection) + elif vm_os == "windows": + mouse_controller = PythonMouseController(http_server=self.host) + keyboard_controller = PythonKeyboardController(http_server=self.host) + else: + raise NotImplementedError(vm_os) + + return mouse_controller, keyboard_controller + def _start_emulator(self): self._execute_command(["vmrun", "start", self.path_to_vm]) @@ -133,19 +147,49 @@ class DesktopEnv(gym.Env): def step(self, action): action_type = Action(action['action_type']) if action_type == Action.CLICK: - self._click(MouseClick(action['click_type'])) + click = MouseClick(action['click_type']) + if click == MouseClick.LEFT: + self.mouse_controller.left_click() + elif click == MouseClick.MIDDLE: + self.mouse_controller.middle_click() + elif click == MouseClick.RIGHT: + self.mouse_controller.right_click() + elif click == MouseClick.WHEEL_UP: + self.mouse_controller.scroll_up() + elif click == MouseClick.WHEEL_DOWN: + self.mouse_controller.scroll_down() elif action_type == Action.MOUSE_DOWN: - self._mousedown(MouseClick(action['click_type'])) + click = MouseClick(action['click_type']) + if click == MouseClick.LEFT: + self.mouse_controller.left_down() + elif click == MouseClick.MIDDLE: + self.mouse_controller.middle_down() + elif click == MouseClick.RIGHT: + self.mouse_controller.right_down() + elif click == MouseClick.WHEEL_UP: + self.mouse_controller.scroll_up() + elif click == MouseClick.WHEEL_DOWN: + self.mouse_controller.scroll_down() elif action_type == Action.MOUSE_UP: - self._mouseup(MouseClick(action['click_type'])) + click = MouseClick(action['click_type']) + if click == MouseClick.LEFT: + self.mouse_controller.left_up() + elif click == MouseClick.MIDDLE: + self.mouse_controller.middle_up() + elif click == MouseClick.RIGHT: + self.mouse_controller.right_up() + elif click == MouseClick.WHEEL_UP: + self.mouse_controller.scroll_up() + elif click == MouseClick.WHEEL_DOWN: + self.mouse_controller.scroll_down() elif action_type == Action.MOUSE_MOVE: - self._mouse_move(action['x'], action['y']) + self.mouse_controller.mouse_move(x = action['x'], y = action['y']) elif action_type == Action.KEY: key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string - self.key(key_sequence) + self.keyboard_controller.key(key_sequence) elif action_type == Action.TYPE: text = ''.join(map(chr, action['text'])) # Convert integer array to string - self._type(text) + self.keyboard_controller.type(text) # Capture new state observation = self._get_obs() diff --git a/desktop_env/windows_server/main.py b/desktop_env/windows_server/main.py new file mode 100644 index 0000000..56b7fc2 --- /dev/null +++ b/desktop_env/windows_server/main.py @@ -0,0 +1,29 @@ +from flask import Flask, request, jsonify +import subprocess + +app = Flask(__name__) + +@app.route('/execute', methods=['POST']) +def execute_command(): + data = request.json + # The 'command' key in the JSON request should contain the command to be executed. + command = data.get('command', '') + + # Execute the command without any safety checks. + try: + process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + stdout, stderr = process.communicate() + + return jsonify({ + 'status': 'success', + 'output': stdout.decode(), + 'error': stderr.decode() + }) + except Exception as e: + return jsonify({ + 'status': 'error', + 'message': str(e) + }), 500 + +if __name__ == '__main__': + app.run(debug=True, host="0.0.0.0") diff --git a/main.py b/main.py index 0b20ce1..2cbcd18 100644 --- a/main.py +++ b/main.py @@ -38,10 +38,13 @@ def human_agent(): """ Runs the Gym environment with human input. """ - env = DesktopEnv(path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx", + env = DesktopEnv(path_to_vm="/home/yuri/vmware/Windows 10 x64/Windows 10 x64.vmx", + # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx", username="user", password="password", - host="192.168.7.128") + # host="192.168.7.128", + host="http://192.168.7.129:5000", + vm_os="windows") observation = env.reset() done = False diff --git a/requirements.txt b/requirements.txt index e881093..e390586 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,3 +2,4 @@ numpy Pillow fabric gymnasium +requests diff --git a/screenshot.png b/screenshot.png new file mode 100644 index 0000000..0ea0c0f Binary files /dev/null and b/screenshot.png differ