Merge branch 'main' into zdy

This commit is contained in:
zdy023
2023-12-19 11:06:17 +08:00
111 changed files with 22918 additions and 497 deletions

Binary file not shown.

After

Width:  |  Height:  |  Size: 4.7 KiB

View File

@@ -1,35 +0,0 @@
from abc import ABC, abstractmethod
from fabric import Connection
from .xdotool import XDoToolController
from .python import PythonController
class AbstractKeyboardController(ABC):
    """Interface that every keyboard back-end in this package implements."""

    @abstractmethod
    def type(self, text: str):
        """Type the string ``text``; concrete controllers must override this."""
        raise NotImplementedError

    @abstractmethod
    def key(self, key: str):
        """Send the key named ``key``; concrete controllers must override this."""
        raise NotImplementedError
class XDoToolKeyboardController(AbstractKeyboardController, XDoToolController):
    """Keyboard controller that issues ``xdotool`` commands through an SSH connection."""

    def __init__(self, ssh_connection: Connection):
        """Store ``ssh_connection`` via the cooperative ``super().__init__`` chain."""
        super().__init__(ssh_connection=ssh_connection)

    def type(self, text: str):
        """Type ``text`` with ``xdotool type``.

        The command string ends up inside a shell command line, so the text is
        shell-quoted: unquoted, whitespace would split it into several
        arguments and shell metacharacters would be interpreted by the shell.
        """
        import shlex  # local import: this module's header does not import shlex

        self._execute_xdotool_command(f"type {shlex.quote(text)}")

    def key(self, key: str):
        """Send ``key`` with ``xdotool key``.

        The key specification is passed through unchanged.
        """
        self._execute_xdotool_command(f"key {key}")
class PythonKeyboardController(AbstractKeyboardController, PythonController):
    """Keyboard controller that sends ``keyboard``-module one-liners to the HTTP server."""

    def __init__(self, http_server: str):
        """Remember the server address and the ``python -c`` template used for every call."""
        super().__init__(http_server=http_server)
        # Template for the command line; ``{command}`` is replaced by one keyboard call.
        self.command = 'python -c "import keyboard; {command}"'

    def type(self, text: str):
        """Write ``text`` through ``keyboard.write``."""
        snippet = f"keyboard.write('{text}')"
        full_command = self.command.format(command=snippet)
        self._execute_python_command(full_command)

    def key(self, key: str):
        """Press and release ``key`` through ``keyboard.press_and_release``."""
        snippet = f"keyboard.press_and_release('{key}')"
        full_command = self.command.format(command=snippet)
        self._execute_python_command(full_command)

View File

@@ -1,144 +0,0 @@
from enum import Enum
from abc import ABC, abstractmethod
from fabric import Connection
from .xdotool import XDoToolController
from .python import PythonController
class MouseClick(Enum):
    """Mouse buttons and wheel directions.

    The values match the button numbers that the xdotool-based controller in
    this file passes to ``click`` / ``mousedown`` / ``mouseup``.
    """

    # Physical buttons.
    LEFT = 1
    MIDDLE = 2
    RIGHT = 3
    # Wheel movement, expressed as buttons 4 and 5.
    WHEEL_UP = 4
    WHEEL_DOWN = 5
class AbstractMouseController(ABC):
    """Interface that every mouse back-end in this package implements.

    All methods are abstract; the base implementations only raise
    ``NotImplementedError``.
    """

    # --- pointer position -------------------------------------------------
    @abstractmethod
    def mouse_move(self, x: int, y: int):
        """Move the pointer to ``(x, y)``."""
        raise NotImplementedError

    # --- left button ------------------------------------------------------
    @abstractmethod
    def left_down(self):
        """Press the left button."""
        raise NotImplementedError

    @abstractmethod
    def left_up(self):
        """Release the left button."""
        raise NotImplementedError

    @abstractmethod
    def left_click(self):
        """Click the left button."""
        raise NotImplementedError

    # --- middle button ----------------------------------------------------
    @abstractmethod
    def middle_down(self):
        """Press the middle button."""
        raise NotImplementedError

    @abstractmethod
    def middle_up(self):
        """Release the middle button."""
        raise NotImplementedError

    @abstractmethod
    def middle_click(self):
        """Click the middle button."""
        raise NotImplementedError

    # --- right button -----------------------------------------------------
    @abstractmethod
    def right_down(self):
        """Press the right button."""
        raise NotImplementedError

    @abstractmethod
    def right_up(self):
        """Release the right button."""
        raise NotImplementedError

    @abstractmethod
    def right_click(self):
        """Click the right button."""
        raise NotImplementedError

    # --- wheel ------------------------------------------------------------
    @abstractmethod
    def scroll_up(self):
        """Scroll the wheel up."""
        raise NotImplementedError

    @abstractmethod
    def scroll_down(self):
        """Scroll the wheel down."""
        raise NotImplementedError
class XDoToolMouseController(AbstractMouseController, XDoToolController):
    """Mouse controller that issues ``xdotool`` commands through an SSH connection.

    Buttons are addressed by number: 1 left, 2 middle, 3 right, 4 wheel up,
    5 wheel down.
    """

    def __init__(self, ssh_connection: Connection):
        """Store ``ssh_connection`` via the cooperative ``super().__init__`` chain."""
        super().__init__(ssh_connection=ssh_connection)

    def mouse_move(self, x: int, y: int):
        """Run ``xdotool mousemove x y``."""
        self._execute_xdotool_command(f"mousemove {x} {y}")

    # Left button (1).
    def left_down(self):
        self._execute_xdotool_command("mousedown 1")

    def left_up(self):
        self._execute_xdotool_command("mouseup 1")

    def left_click(self):
        self._execute_xdotool_command("click 1")

    # Middle button (2).
    def middle_down(self):
        self._execute_xdotool_command("mousedown 2")

    def middle_up(self):
        self._execute_xdotool_command("mouseup 2")

    def middle_click(self):
        self._execute_xdotool_command("click 2")

    # Right button (3).
    def right_down(self):
        self._execute_xdotool_command("mousedown 3")

    def right_up(self):
        self._execute_xdotool_command("mouseup 3")

    def right_click(self):
        self._execute_xdotool_command("click 3")

    # Wheel: a scroll step is sent as a click of button 4 (up) or 5 (down).
    def scroll_up(self):
        self._execute_xdotool_command("click 4")

    def scroll_down(self):
        self._execute_xdotool_command("click 5")
class PythonMouseController(AbstractMouseController, PythonController):
    """Mouse controller that sends ``mouse``-module one-liners to the HTTP server."""

    def __init__(self, http_server: str):
        """Remember the server address and the ``python -c`` template used for every call."""
        super().__init__(http_server=http_server)
        # Template for the command line; ``{command}`` is replaced by one mouse call.
        self.command = 'python -c "import mouse; {command}"'

    def _send(self, snippet):
        """Wrap ``snippet`` in the command template and execute it on the server."""
        self._execute_python_command(self.command.format(command=snippet))

    def mouse_move(self, x: int, y: int):
        """Move the pointer to ``(x, y)``."""
        self._send(f"mouse.move({x}, {y})")

    # Left button.
    def left_down(self):
        self._send("mouse.press(button='left')")

    def left_up(self):
        self._send("mouse.release(button='left')")

    def left_click(self):
        self._send("mouse.click(button='left')")

    # Middle button.
    def middle_down(self):
        self._send("mouse.press(button='middle')")

    def middle_up(self):
        self._send("mouse.release(button='middle')")

    def middle_click(self):
        self._send("mouse.click(button='middle')")

    # Right button.
    def right_down(self):
        self._send("mouse.press(button='right')")

    def right_up(self):
        self._send("mouse.release(button='right')")

    def right_click(self):
        self._send("mouse.click(button='right')")

    # Wheel: a fixed delta of 10 per call, positive for up and negative for down.
    def scroll_up(self):
        self._send("mouse.wheel(10)")

    def scroll_down(self):
        self._send("mouse.wheel(-10)")

View File

@@ -1,34 +1,208 @@
import requests
import json
from typing import Any, Dict
import requests
from desktop_env.envs.actions import KEYBOARD_KEYS
class PythonController:
def __init__(self, http_server: str):
def __init__(self, http_server: str, pkgs_prefix: str = "python -c \"import pyautogui; {command}\""):
self.http_server = http_server
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
})
self.pkgs_prefix = pkgs_prefix # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
def get_screenshot(self):
    """
    Request ``/screenshot`` from the server.

    Returns the raw response body on HTTP 200; otherwise prints the status
    code and returns ``None``.
    """
    reply = requests.get(self.http_server + "/screenshot")
    if reply.status_code != 200:
        print("Failed to get screenshot. Status code:", reply.status_code)
        return None
    return reply.content
def get_file(self, file_path: str):
    """
    Request the file at ``file_path`` from the server's ``/file`` endpoint.

    Returns the raw response body on HTTP 200; otherwise prints the status
    code and returns ``None``.
    """
    form = {"file_path": file_path}
    reply = requests.post(self.http_server + "/file", data=form)
    if reply.status_code != 200:
        print("Failed to get file. Status code:", reply.status_code)
        return None
    print("File downloaded successfully")
    return reply.content
def execute_python_command(self, command: str) -> Any:
    """
    Executes a python command on the server.
    It can be used to execute the pyautogui commands, or... any other python command. who knows?

    The command is wrapped with ``self.pkgs_prefix`` and POSTed as JSON to the
    server's ``/execute`` endpoint.

    Returns:
        The decoded JSON body of the response (whatever the status code), or
        ``None`` when the request raised ``requests.exceptions.RequestException``.
    """
    command = self.pkgs_prefix.format(command=command)
    payload = json.dumps({"command": command})
    headers = {
        'Content-Type': 'application/json'
    }
    try:
        response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
        if response.status_code == 200:
            print("Command executed successfully:", response.text)
        else:
            print("Failed to execute command. Status code:", response.status_code)
        # The body is decoded for both outcomes so callers can inspect it.
        return response.json()
    except requests.exceptions.RequestException as e:
        print("An error occurred while trying to execute the command:", e)
        return None
# example usage
if __name__ == '__main__':
# replace with your actual server URL of the vm
server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
def execute_action(self, action: Dict[str, Any]):
    """
    Executes an action on the server computer.

    ``action`` must contain an ``"action_type"`` entry and may contain a
    ``"parameters"`` dict. A missing, empty or ``None`` ``"parameters"`` entry
    means "no parameters". Each supported action type is translated into one
    or more ``pyautogui`` calls sent through ``execute_python_command``.

    Raises:
        Exception: if the action type is unknown, the parameters do not fit
            the action type, or a key name is not in ``KEYBOARD_KEYS``.
    """
    action_type = action["action_type"]
    # Normalise None to {} up front so the membership tests below never see None.
    parameters = action.get("parameters") or {}

    run = self.execute_python_command
    has_x = "x" in parameters
    has_y = "y" in parameters
    has_xy = has_x and has_y

    def unknown_parameters():
        # Build (not raise) the shared error so call sites read `raise ...`.
        return Exception(f"Unknown parameters: {parameters}")

    def validated_key(key):
        # Validation is case-insensitive; the key is forwarded as given.
        if key.lower() not in KEYBOARD_KEYS:
            raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
        return key

    if action_type == "MOVE_TO":
        if not parameters:
            run("pyautogui.moveTo()")
        elif has_xy:
            run(f"pyautogui.moveTo({parameters['x']}, {parameters['y']})")
        else:
            raise unknown_parameters()

    elif action_type == "CLICK":
        has_button = "button" in parameters
        if not parameters:
            run("pyautogui.click()")
        elif has_x != has_y or not (has_button or has_xy):
            # x and y must come as a pair, and at least a button or a
            # position is required once any parameter is given.
            raise unknown_parameters()
        else:
            args = []
            if has_button:
                args.append(f"button='{parameters['button']}'")
            if has_xy:
                args.append(f"x={parameters['x']}")
                args.append(f"y={parameters['y']}")
            if "num_clicks" in parameters:
                args.append(f"clicks={parameters['num_clicks']}")
            run(f"pyautogui.click({', '.join(args)})")

    elif action_type in ("MOUSE_DOWN", "MOUSE_UP"):
        func = "mouseDown" if action_type == "MOUSE_DOWN" else "mouseUp"
        if not parameters:
            run(f"pyautogui.{func}()")
        elif "button" in parameters:
            run(f"pyautogui.{func}(button='{parameters['button']}')")
        else:
            raise unknown_parameters()

    elif action_type in ("RIGHT_CLICK", "DOUBLE_CLICK"):
        func = "rightClick" if action_type == "RIGHT_CLICK" else "doubleClick"
        if not parameters:
            run(f"pyautogui.{func}()")
        elif has_xy:
            run(f"pyautogui.{func}(x={parameters['x']}, y={parameters['y']})")
        else:
            raise unknown_parameters()

    elif action_type == "DRAG_TO":
        # Missing coordinates raise like every other action instead of
        # being a silent no-op.
        if not has_xy:
            raise unknown_parameters()
        run(
            f"pyautogui.dragTo({parameters['x']}, {parameters['y']}, "
            "duration=1.0, button='left', mouseDownUp=True)"
        )

    elif action_type == "SCROLL":
        # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
        if "dx" not in parameters and "dy" not in parameters:
            raise unknown_parameters()
        if "dx" in parameters:
            run(f"pyautogui.hscroll({parameters['dx']})")
        if "dy" in parameters:
            run(f"pyautogui.vscroll({parameters['dy']})")

    elif action_type == "TYPING":
        if "text" not in parameters:
            raise unknown_parameters()
        # NOTE(review): text containing quote characters is interpolated
        # unescaped into the generated command - confirm callers sanitise it.
        text = parameters["text"]
        run(f"pyautogui.typewrite('{text}')")

    elif action_type in ("PRESS", "KEY_DOWN", "KEY_UP"):
        func = {"PRESS": "press", "KEY_DOWN": "keyDown", "KEY_UP": "keyUp"}[action_type]
        if "key" not in parameters:
            raise unknown_parameters()
        key = validated_key(parameters["key"])
        run(f"pyautogui.{func}('{key}')")

    elif action_type == "HOTKEY":
        if "keys" not in parameters:
            raise unknown_parameters()
        keys = parameters["keys"]
        if not isinstance(keys, list):
            raise Exception(f"Keys must be a list of keys")
        for key in keys:
            validated_key(key)
        keys_para_rep = "', '".join(keys)
        run(f"pyautogui.hotkey('{keys_para_rep}')")

    else:
        raise Exception(f"Unknown action type: {action_type}")

View File

@@ -0,0 +1,96 @@
import requests
import json
class SetupController:
    """Sends environment-setup requests to the ``/setup`` endpoints of the HTTP server."""

    def __init__(self, http_server: str):
        """Store the base URL of the setup endpoints (``http_server`` + ``/setup``)."""
        self.http_server = http_server + "/setup"

    def setup(self, config):
        """
        Setup Config:
        {
            download: list[tuple[string]], # a list of tuples of url of file to download and the save path
            ...
        }

        Runs the download, wallpaper and open steps in that order. Each step
        is skipped when ``config`` is empty or lacks the step's key.
        """
        self._download_setup(config)
        self._change_wallpaper(config)
        # self._tidy_desktop(config) todo: implement this
        self._open_setup(config)
        # can add other setup steps

    def _post_json(self, endpoint, payload, failure_message):
        """POST ``payload`` as JSON to ``endpoint`` and print the outcome.

        Request errors are printed rather than raised, so one failed step
        does not abort the remaining setup steps.
        """
        headers = {
            'Content-Type': 'application/json'
        }
        try:
            response = requests.post(self.http_server + endpoint, headers=headers, data=json.dumps(payload))
        except requests.exceptions.RequestException as e:
            print("An error occurred while trying to send the request:", e)
            return
        if response.status_code == 200:
            print("Command executed successfully:", response.text)
        else:
            # Print the status code the label promises, then the body for context.
            print(failure_message, response.status_code, response.text)

    def _download_setup(self, config):
        """Ask the server to download every ``(url, path)`` pair in ``config['download']``.

        Raises:
            Exception: if a pair has an empty URL or path. Pairs before the
                invalid one have already been sent.
        """
        if not config or 'download' not in config:
            return
        for url, path in config['download']:
            if not url or not path:
                raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
            # send request to server to download file
            self._post_json(
                "/download_file",
                {"url": url, "path": path},
                "Failed to download file. Status code:",
            )

    def _change_wallpaper(self, config):
        """Ask the server to set the wallpaper to ``config['wallpaper']``.

        Raises:
            Exception: if the configured path is empty.
        """
        if not config or 'wallpaper' not in config:
            return
        path = config['wallpaper']
        if not path:
            raise Exception(f"Setup Wallpaper - Invalid path ({path}).")
        # send request to server to change wallpaper
        self._post_json(
            "/change_wallpaper",
            {"path": path},
            "Failed to change wallpaper. Status code:",
        )

    def _tidy_desktop(self, config):
        """Placeholder; not implemented yet."""
        raise NotImplementedError

    def _open_setup(self, config):
        """Ask the server to open every path in ``config['open']``.

        Raises:
            Exception: if a path is empty. Paths before the invalid one have
                already been sent.
        """
        if not config or 'open' not in config:
            return
        for path in config['open']:
            if not path:
                raise Exception(f"Setup Open - Invalid path ({path}).")
            # send request to server to open file
            self._post_json(
                "/open_file",
                {"path": path},
                "Failed to open file. Status code:",
            )

View File

@@ -1,9 +0,0 @@
from fabric import Connection
class XDoToolController:
    """Base class that runs ``xdotool`` commands over an SSH connection."""

    def __init__(self, ssh_connection: Connection):
        """Keep the connection used to run commands."""
        self.ssh_connection = ssh_connection

    def _execute_xdotool_command(self, command: str) -> str:
        """Run ``xdotool <command>`` with ``DISPLAY=:0`` and return its stripped stdout.

        ``command`` is the xdotool sub-command and its arguments as a single
        string (for example ``"click 1"``). It is interpolated verbatim into
        the command line passed to ``ssh_connection.run``.
        """
        result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {command}", hide=True)
        return result.stdout.strip()

190
desktop_env/envs/actions.py Normal file
View File

@@ -0,0 +1,190 @@
X_MAX = 1920  # TODO: get the screen resolution
Y_MAX = 1080

# Key names accepted by the keyboard actions below.
KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright']


def _param(value_type, value_range, optional):
    """Describe one parameter: its type, allowed range and whether it is optional."""
    return {"type": value_type, "range": value_range, "optional": optional}


def _xy(optional):
    """Describe an ``x``/``y`` screen position bounded by ``X_MAX`` / ``Y_MAX``."""
    return {
        "x": _param(float, [0, X_MAX], optional),
        "y": _param(float, [0, Y_MAX], optional),
    }


def _button():
    """Describe the optional mouse-button parameter."""
    return {"button": _param(str, ["left", "right", "middle"], True)}


def _key():
    """Describe the required single-key parameter."""
    return {"key": _param(str, KEYBOARD_KEYS, False)}


def _action(action_type, note, parameters):
    """Assemble one ACTION_SPACE entry."""
    return {"action_type": action_type, "note": note, "parameters": parameters}


# Every action the environment understands, with a human-readable note and
# the description of each parameter.
ACTION_SPACE = [
    _action(
        "MOVE_TO",
        "move the cursor to the specified position",
        _xy(False),
    ),
    _action(
        "CLICK",
        "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
        {**_button(), **_xy(True), "num_clicks": _param(int, [1, 2, 3], True)},
    ),
    _action(
        "MOUSE_DOWN",
        "press the left button if the button not specified, otherwise press the specified button",
        _button(),
    ),
    _action(
        "MOUSE_UP",
        "release the left button if the button not specified, otherwise release the specified button",
        _button(),
    ),
    _action(
        "RIGHT_CLICK",
        "right click at the current position if x and y are not specified, otherwise right click at the specified position",
        _xy(True),
    ),
    _action(
        "DOUBLE_CLICK",
        "double click at the current position if x and y are not specified, otherwise double click at the specified position",
        _xy(True),
    ),
    _action(
        "DRAG_TO",
        "drag the cursor to the specified position with the left button pressed",
        _xy(False),
    ),
    _action(
        "SCROLL",
        "scroll the mouse wheel up or down",
        {"dx": _param(int, None, False), "dy": _param(int, None, False)},
    ),
    _action(
        "TYPING",
        "type the specified text",
        {"text": _param(str, None, False)},
    ),
    _action(
        "PRESS",
        "press the specified key and release it",
        _key(),
    ),
    _action(
        "KEY_DOWN",
        "press the specified key",
        _key(),
    ),
    _action(
        "KEY_UP",
        "release the specified key",
        _key(),
    ),
    _action(
        "HOTKEY",
        "press the specified key combination",
        {"keys": _param(list, [KEYBOARD_KEYS], False)},
    ),
]

View File

@@ -1,203 +1,186 @@
from enum import Enum
from typing import Literal
from __future__ import annotations
import os
import subprocess
from fabric import Connection
import time
import uuid
import platform
from typing import List
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from PIL import Image
import requests
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
from desktop_env.controllers.python import PythonController
from desktop_env.controllers.setup import SetupController
from desktop_env.evaluators import eval_funcs
class Action(Enum):
    """Kinds of action, numbered from 0 in declaration order."""

    # Mouse actions.
    CLICK = 0
    MOUSE_DOWN = 1
    MOUSE_UP = 2
    MOUSE_MOVE = 3
    # Keyboard actions.
    KEY = 4
    TYPE = 5
VM_TYPE = Literal['ubuntu', 'windows']
def _execute_command(command: List[str]) -> None:
if command[:4] == ["vmrun", "-T", "ws", "start"]:
p = subprocess.Popen(command)
p.wait()
else:
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
if result.returncode != 0:
raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
return result.stdout
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "snapshot", vm_os: VM_TYPE = "ubuntu"):
def __init__(
self,
path_to_vm: str,
snapshot_path: str = "base",
instruction: str = None,
config: dict = None,
evaluator: dict = None,
action_space: str = "computer_13",
):
# Initialize environment variables
self.path_to_vm = path_to_vm
self.username = username
self.password = password
self.host = host
self.snapshot_path = snapshot_path
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
self.action_space = spaces.Dict({
"action_type": spaces.Discrete(len(Action)),
"click_type": spaces.Discrete(len(MouseClick)),
"x": spaces.Discrete(self.screen_width),
"y": spaces.Discrete(self.screen_height),
"key": spaces.MultiDiscrete([128] * 10), # max 10 characters, ASCII
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
# Initialize emulator and controller
print("Initializing...")
self._start_emulator()
self._wait_for_emulator_load()
self.host = f"http://{self._get_vm_ip()}:5000"
self.controller = PythonController(http_server=self.host)
self.setup_controller = SetupController(http_server=self.host)
self.instruction = instruction
self.config = config
self.evaluator = evaluator
# set up controllers
self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)
def _create_controllers(self, vm_os: VM_TYPE) -> tuple[AbstractMouseController, AbstractKeyboardController]:
if vm_os == "ubuntu":
ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password})
mouse_controller = XDoToolMouseController(ssh_connection)
keyboard_controller = XDoToolKeyboardController(ssh_connection)
elif vm_os == "windows":
mouse_controller = PythonMouseController(http_server=self.host)
keyboard_controller = PythonKeyboardController(http_server=self.host)
else:
raise NotImplementedError(vm_os)
return mouse_controller, keyboard_controller
# mode: human or machine
assert action_space in ["computer_13", "pyautogui"]
self.action_space = action_space
# todo: define the action space and the observation space as gym did, or extend theirs
def _start_emulator(self):
self._execute_command(["vmrun", "start", self.path_to_vm])
def _wait_for_emulator_load(self):
while True:
try:
output = subprocess.check_output("vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
output = output.decode()
if self.path_to_vm.lstrip("~/") in output:
print("VM is running.")
return
break
else:
print("Waiting for VM to start...")
time.sleep(5)
print("Starting VM...")
_execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(3)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
return
def _execute_command(self, command: list[str]) -> None:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error executing command: {command}")
print(stderr.decode())
return None
else:
return stdout.decode()
def _execute_xdotool_command(self, command: list[str]) -> None:
result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {command}", hide=True)
return result.stdout.strip()
def _get_vm_ip(self):
    """Ask ``vmrun`` for the guest's IP address, retrying up to 10 times.

    Returns:
        The stripped output of ``vmrun -T ws getGuestIPAddress``.

    Raises:
        Exception: if every attempt fails.
    """
    max_retries = 10
    print("Getting IP Address...")
    for _ in range(max_retries):
        try:
            output = _execute_command(["vmrun", "-T", "ws", "getGuestIPAddress", self.path_to_vm]).strip()
            print(f"IP address: {output}")
            return output
        except Exception:
            # Catch Exception rather than everything, so Ctrl-C and
            # SystemExit still interrupt the retry loop.
            time.sleep(5)
            print("Retrying...")
    raise Exception("Failed to get VM IP address!")
def _save_state(self):
    """Take a VM snapshot named ``self.snapshot_path`` with ``vmrun``."""
    # "ws" and "snapshot" must be separate list items; without the comma the
    # adjacent literals concatenate to "wssnapshot", which is passed as the
    # value of -T and leaves no sub-command.
    _execute_command(["vmrun", "-T", "ws", "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
    """Fetch a screenshot through the controller and save it to disk.

    The image is written to ``tmp/<random uuid>/screenshot.png``.

    Returns:
        The path of the saved image.

    Raises:
        Exception: if the controller returned no screenshot data.
    """
    # Get the screenshot first: the controller returns None on failure, and
    # writing None would raise an obscure TypeError after the file was
    # already created.
    screenshot = self.controller.get_screenshot()
    if screenshot is None:
        raise Exception("Failed to get screenshot from the VM.")

    random_uuid = str(uuid.uuid4())
    os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
    image_path = os.path.join("tmp", random_uuid, "screenshot.png")

    # Save the screenshot to the image_path
    with open(image_path, "wb") as f:
        f.write(screenshot)

    return image_path
def _get_obs(self):
print("OBS 1")
screenshot_image_path = self._get_screenshot()
print("OBS 2")
with Image.open(screenshot_image_path) as img:
return np.array(img)
return screenshot_image_path
def reset(self):
input("Reset #1 PE")
#self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
input("Revert to snapshot #2 PE")
def reset(self, seed=None, options=None):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
_execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
time.sleep(5)
print("Starting emulator...")
self._start_emulator()
input("Started emulator #3 PE")
self._wait_for_emulator_load()
observation = self._get_obs()
print("Emulator started.")
print("Setting up environment...")
self.setup_controller.setup(self.config)
time.sleep(5)
print("Environment setup complete.")
observation = self._get_obs()
return observation
def step(self, action):
action_type = Action(action['action_type'])
if action_type == Action.CLICK:
click = MouseClick(action['click_type'])
if click == MouseClick.LEFT:
self.mouse_controller.left_click()
elif click == MouseClick.MIDDLE:
self.mouse_controller.middle_click()
elif click == MouseClick.RIGHT:
self.mouse_controller.right_click()
elif click == MouseClick.WHEEL_UP:
self.mouse_controller.scroll_up()
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_DOWN:
click = MouseClick(action['click_type'])
if click == MouseClick.LEFT:
self.mouse_controller.left_down()
elif click == MouseClick.MIDDLE:
self.mouse_controller.middle_down()
elif click == MouseClick.RIGHT:
self.mouse_controller.right_down()
elif click == MouseClick.WHEEL_UP:
self.mouse_controller.scroll_up()
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_UP:
click = MouseClick(action['click_type'])
if click == MouseClick.LEFT:
self.mouse_controller.left_up()
elif click == MouseClick.MIDDLE:
self.mouse_controller.middle_up()
elif click == MouseClick.RIGHT:
self.mouse_controller.right_up()
elif click == MouseClick.WHEEL_UP:
self.mouse_controller.scroll_up()
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)
elif action_type == Action.TYPE:
text = ''.join(map(chr, action['text'])) # Convert integer array to string
self.keyboard_controller.type(text)
def step(self, action, pause=0.5):
# fixme: add reminding logic here, decide if the action is valid for the current action_space
if self.action_space == "computer_13":
# the set of all possible actions defined in the action representation
self.controller.execute_action(action)
elif self.action_space == "pyautogui":
# the set of all possible python commands insides `pyautogui`
self.controller.execute_python_command(action)
# Capture new state
observation = self._get_obs()
reward = 0 # Define reward calculation
done = False # Define episode termination condition
# todo: maybe for the better here we need to add a logic to wait until the rendering is done
time.sleep(pause)
observation = {
"screenshot": self._get_obs(),
"instruction": self.instruction
}
reward = 0 # todo: Define reward calculation for each example
done = False # todo: Define episode termination condition for each example
info = {}
return observation, reward, done, info
def evaluate(self):
    """
    Evaluate whether the task is successfully completed.

    Every entry of ``self.evaluator["paths"]`` is copied to a local file
    under ``tmp/<random uuid>/tmp.xlsx``. The function registered in
    ``eval_funcs`` under ``self.evaluator["func"]`` is then called with those
    local paths as keyword arguments, and its result is returned.

    Raises:
        NotImplementedError: if a file entry has an unsupported ``type``.
        Exception: if a ``vm_file`` could not be fetched from the VM.
    """

    def copy_file_to_local(_file_info):
        random_uuid = str(uuid.uuid4())
        os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
        _path = os.path.join("tmp", random_uuid, "tmp.xlsx")

        if _file_info["type"] == "cloud_file":
            url = _file_info["path"]
            # The context manager closes the streamed response even when
            # raise_for_status() or a write fails.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()

                with open(_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
        elif _file_info["type"] == "vm_file":
            # fixme: stream this part maybe as well
            file = self.controller.get_file(_file_info["path"])
            if file is None:
                # The controller returns None on failure; raise a clear error
                # instead of letting f.write(None) raise a TypeError.
                raise Exception(f"Failed to get file from the VM: {_file_info['path']}")
            with open(_path, "wb") as f:
                f.write(file)
        else:
            raise NotImplementedError

        return _path

    # todo: make this more flexible by refactoring
    eval_func = eval_funcs[self.evaluator["func"]]
    eval_func_vars = {}

    for var_name, file_info in self.evaluator["paths"].items():
        path = copy_file_to_local(file_info)
        eval_func_vars[var_name] = path

    return eval_func(**eval_func_vars)
def render(self, mode='rgb_array'):
if mode == 'rgb_array':
return self._get_obs()
@@ -205,4 +188,4 @@ class DesktopEnv(gym.Env):
raise ValueError('Unsupported render mode: {}'.format(mode))
def close(self):
# Shuts the environment down by issuing `vmrun stop` for the VM at self.path_to_vm.
# NOTE(review): the two calls below look like the removed and the added side of a
# single diff line (the method call `self._execute_command` versus a module-level
# `_execute_command` helper) — confirm which one belongs in the final file.
self._execute_command(["vmrun", "stop", self.path_to_vm])
_execute_command(["vmrun", "stop", self.path_to_vm])

View File

@@ -0,0 +1,5 @@
from .table import compare_table
# Registry of evaluation functions. Callers look an entry up by its exact key
# string; the key spells out the function's signature, and the parameter names
# shown in it (`expected`, `actual`) are the keyword names the function accepts.
eval_funcs = {
"compare_table(expected, actual)": compare_table
}

View File

View File

@@ -0,0 +1,14 @@
def compare_table(expected, actual):
    """Return 1 if two Excel files load to equal DataFrames, otherwise 0.

    Args:
        expected: Path of the reference workbook (anything
            ``pandas.read_excel`` accepts).
        actual: Path of the workbook being checked.

    Returns:
        int: 1 when ``DataFrame.equals`` reports the two tables equal, else 0.
    """
    # pandas is imported at call time rather than when this module is imported.
    import pandas as pd

    # Both files are loaded with read_excel's default arguments.
    expected_df, actual_df = pd.read_excel(expected), pd.read_excel(actual)

    if expected_df.equals(actual_df):
        return 1
    return 0
if __name__ == '__main__':
    # Manual check: fill in the two workbook paths before running this file
    # directly; both are empty placeholders as committed.
    path1, path2 = "", ""
    print(compare_table(path1, path2))

190
desktop_env/server/main.py Normal file
View File

@@ -0,0 +1,190 @@
import os
from pathlib import Path
import platform
import subprocess
import requests
import Xlib.display
import pyautogui
from PIL import ImageGrab, Image
from flask import Flask, request, jsonify, send_file
# Flask application object; every @app.route handler below registers on it.
app = Flask(__name__)
# Disable the pause pyautogui otherwise inserts after each of its calls.
pyautogui.PAUSE = 0
# NOTE(review): presumably zeroes pyautogui's extra macOS-only settle delay for
# input events — confirm against the pyautogui source before relying on it.
pyautogui.DARWIN_CATCH_UP_TIME = 0
@app.route('/execute', methods=['POST'])
def execute_command():
    """Run the shell command given in the JSON body and return its output.

    Request body: a JSON object whose ``command`` entry holds the command.

    Responses:
        200: ``{"status": "success", "output": <stdout>, "error": <stderr>}``.
            "success" only means the command was run; a non-zero exit status
            is not reported as an error.
        400: the body is not a JSON object or ``command`` is missing/empty.
        500: running the command raised an exception.
    """
    # silent=True yields None instead of raising when the body is absent or is
    # not valid JSON, so malformed requests get the JSON 400 below rather than
    # an unhandled AttributeError on `data.get`.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data.get('command'):
        return jsonify({
            'status': 'error',
            'message': 'A JSON body with a non-empty "command" is required'
        }), 400

    # The 'command' key in the JSON request contains the command to be executed.
    command = data['command']

    # SECURITY: the command is handed to the shell verbatim, without any safety
    # checks. This endpoint is remote code execution by design; it must only be
    # reachable from trusted clients.
    try:
        result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500

    return jsonify({
        'status': 'success',
        'output': result.stdout,
        'error': result.stderr
    })
@app.route('/screenshot', methods=['GET'])
def capture_screen_with_cursor():
    """Capture the screen (with a cursor drawn in) and return it as a PNG.

    On Windows and Linux the screenshot is taken with pyautogui and a cursor
    image is pasted at the current pointer position; the cursor image is
    downloaded once and kept at ``screenshots/cursor.png``. On macOS the
    ``screencapture -C`` utility is used.

    Responses:
        200: the PNG image.
        501: JSON error, the host platform is not supported.
    Exceptions from the download, the capture or ``screencapture`` propagate
    to Flask and surface as a 500.
    """
    # fixme: when running on virtual machines, the cursor is not captured, don't know why

    file_path = os.path.join("screenshots", "screenshot.png")
    user_platform = platform.system()

    # Ensure the screenshots directory exists
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    # fixme: This is a temporary fix for the cursor not being captured on Windows and Linux
    if user_platform in ("Windows", "Linux"):
        cursor_path = os.path.join("screenshots", "cursor.png")
        if not os.path.exists(cursor_path):
            cursor_url = "https://vip.helloimg.com/images/2023/12/02/oQPzmt.png"
            # Fail before creating the file: a cached error page would otherwise
            # break Image.open on every later request, because the download is
            # skipped once cursor.png exists.
            response = requests.get(cursor_url, timeout=10)
            response.raise_for_status()
            with open(cursor_path, 'wb') as file:
                file.write(response.content)

        screenshot = pyautogui.screenshot()
        cursor_x, cursor_y = pyautogui.position()
        cursor = Image.open(cursor_path)
        # make the cursor smaller
        cursor = cursor.resize((int(cursor.width / 1.5), int(cursor.height / 1.5)))
        # The cursor image doubles as its own paste mask.
        screenshot.paste(cursor, (cursor_x, cursor_y), cursor)
        screenshot.save(file_path)
    # elif user_platform == "Linux":
    #     # Use xlib to prevent scrot dependency for Linux
    #     screen = Xlib.display.Display().screen()
    #     size = screen.width_in_pixels, screen.height_in_pixels
    #     screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
    #     screenshot.save(file_path)
    elif user_platform == "Darwin":  # (Mac OS)
        # Use the screencapture utility to capture the screen with the cursor.
        # check=True: a failed capture must not fall through to sending a stale file.
        subprocess.run(["screencapture", "-C", file_path], check=True)
    else:
        message = f"The platform you're using ({user_platform}) is not currently supported"
        print(message)
        # Nothing was captured, so report the problem instead of sending
        # whatever screenshot.png may be left over from an earlier request.
        return jsonify({"error": message}), 501

    # The file was written relative to the working directory, while Flask
    # resolves relative send_file paths against the application root; pass an
    # absolute path so both always refer to the same file.
    return send_file(os.path.abspath(file_path), mimetype='image/png')
@app.route('/file', methods=['POST'])
def get_file():
    """Send the file named by the ``file_path`` form field as an attachment.

    Responses:
        200: the file contents.
        400: JSON error, the ``file_path`` form field is missing.
        404: JSON error, ``send_file`` raised ``FileNotFoundError``.

    The supplied path is passed to ``send_file`` unchanged.
    """
    # Guard clause: the filename must come with the POST form data.
    if 'file_path' not in request.form:
        return jsonify({"error": "file_path is required"}), 400

    requested = request.form['file_path']
    try:
        return send_file(requested, as_attachment=True)
    except FileNotFoundError:
        # If the file is not found, return a 404 error
        return jsonify({"error": "File not found"}), 404
@app.route('/platform', methods=['GET'])
def get_platform():
    """Return the value of ``platform.system()`` as the response body."""
    system_name = platform.system()
    return system_name
@app.route('/cursor_position', methods=['GET'])
def get_cursor_position():
    """Return the current pointer position as the JSON array ``[x, y]``.

    A bare ``(x, y)`` tuple cannot be returned from a view: Flask reads a
    two-item tuple as ``(body, status)`` and rejects an integer body, so the
    coordinates are serialised explicitly.
    """
    # Read the position once so x and y come from the same sample.
    position = pyautogui.position()
    return jsonify([int(position.x), int(position.y)])
@app.route("/setup/change_wallpaper", methods=['POST'])
def change_wallpaper():
    """Set the desktop wallpaper to the image at the ``path`` given in the JSON body.

    Responses:
        200: the platform command succeeded.
        400: no ``path`` supplied (including a missing or non-object JSON body).
        404: ``path`` does not exist on this machine.
        500: the platform-specific call failed.
        501: the host platform is not Windows, Linux or macOS.
    """
    # silent=True: a missing/invalid JSON body becomes None and is reported as
    # "Path not supplied!" instead of crashing on `.get`.
    data = request.get_json(silent=True)
    path = data.get('path', None) if isinstance(data, dict) else None

    if not path:
        return "Path not supplied!", 400

    path = Path(path)

    if not path.exists():
        return f"File not found: {path}", 404

    try:
        user_platform = platform.system()
        if user_platform == "Windows":
            # Imported here because ctypes.windll only exists on Windows.
            import ctypes
            # 20 = SPI_SETDESKWALLPAPER, 3 = SPIF_UPDATEINIFILE | SPIF_SENDCHANGE.
            # The call returns zero on failure.
            if not ctypes.windll.user32.SystemParametersInfoW(20, 0, str(path), 3):
                raise OSError("SystemParametersInfoW reported failure")
        elif user_platform == "Linux":
            # check=True so a failing gsettings call is reported, not ignored.
            subprocess.run(
                ["gsettings", "set", "org.gnome.desktop.background", "picture-uri", f"file://{path}"],
                check=True)
        elif user_platform == "Darwin":  # (Mac OS)
            subprocess.run(
                ["osascript", "-e", f'tell application "Finder" to set desktop picture to POSIX file "{path}"'],
                check=True)
        else:
            # Nothing was changed; do not claim success.
            return f"Unsupported platform: {user_platform}", 501
        return "Wallpaper changed successfully"
    except Exception as e:
        return f"Failed to change wallpaper. Error: {e}", 500
@app.route("/setup/download_file", methods=['POST'])
def download_file():
    """Download ``url`` to ``path`` on this machine, retrying up to three times.

    Request body: a JSON object with ``url`` and ``path``. Missing parent
    directories of ``path`` are created.

    Responses:
        200: the file was downloaded.
        400: ``url`` or ``path`` is missing (including a missing or
            non-object JSON body).
        500: every attempt raised ``requests.RequestException``; the message
            carries the last error.
    """
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        data = {}
    url = data.get('url', None)
    path = data.get('path', None)

    if not url or not path:
        return "Path or URL not supplied!", 400

    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)

    max_retries = 3
    # The name bound by `except ... as e` is deleted when the handler ends, so
    # the error is kept in a separate variable for the final message.
    last_error = None
    for attempt in range(max_retries):
        try:
            # The context manager releases the connection even when the
            # transfer fails part-way through.
            with requests.get(url, stream=True) as response:
                response.raise_for_status()

                with open(path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:
                            f.write(chunk)
            return "File downloaded successfully"

        except requests.RequestException as e:
            last_error = e
            print(f"Failed to download {url}. Retrying... ({max_retries - attempt - 1} attempts left)")

    return f"Failed to download {url}. No retries left. Error: {last_error}", 500
@app.route("/setup/open_file", methods=['POST'])
def open_file():
    """Open the file at ``path`` (from the JSON body) with the OS default application.

    Responses:
        200: the open request was issued.
        400: no ``path`` supplied (including a missing or non-object JSON body).
        404: ``path`` does not exist on this machine.
        500: launching the opener failed.
    """
    data = request.get_json(silent=True)
    path = data.get('path', None) if isinstance(data, dict) else None

    if not path:
        return "Path not supplied!", 400

    path = Path(path)

    if not path.exists():
        return f"File not found: {path}", 404

    try:
        user_platform = platform.system()
        if user_platform == "Windows":
            # os.startfile exists only on Windows.
            os.startfile(path)
        else:
            # macOS ships `open`; other platforms are assumed to provide
            # `xdg-open`. Popen does not wait for the launched program, so the
            # request returns immediately like os.startfile does.
            opener = "open" if user_platform == "Darwin" else "xdg-open"
            subprocess.Popen([opener, str(path)])
        return "File opened successfully"
    except Exception as e:
        return f"Failed to open {path}. Error: {e}", 500
if __name__ == '__main__':
    # Start Flask's built-in server when this file is run directly.
    # host="0.0.0.0" listens on every network interface and debug=True turns
    # on Flask's debug mode, so run this only on an isolated/trusted network.
    app.run(host="0.0.0.0", debug=True)

View File

@@ -0,0 +1,6 @@
python3-xlib==0.15
PyAutoGUI==0.9.54
Pillow==10.1.0
git+https://github.com/moses-palmer/pynput.git@refs/pull/541/head # to make sure that it works on Apple Silicon
requests
flask

View File

@@ -1,29 +0,0 @@
from flask import Flask, request, jsonify
import subprocess
# Flask application object; the @app.route handler below registers on it.
app = Flask(__name__)
@app.route('/execute', methods=['POST'])
def execute_command():
    """Run the shell command given in the JSON body and return its output.

    Request body: a JSON object whose ``command`` entry holds the command.

    Responses:
        200: ``{"status": "success", "output": <stdout>, "error": <stderr>}``.
            "success" only means the command was run; the exit status is not
            inspected.
        400: the body is not a JSON object or ``command`` is missing/empty.
        500: starting or reading from the process raised an exception.
    """
    # silent=True yields None instead of raising for a missing or invalid JSON
    # body, so such requests get the JSON 400 below rather than an unhandled
    # AttributeError on `data.get`.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or not data.get('command'):
        return jsonify({
            'status': 'error',
            'message': 'A JSON body with a non-empty "command" is required'
        }), 400

    # The 'command' key in the JSON request contains the command to be executed.
    command = data['command']

    # SECURITY: the command is handed to the shell verbatim, without any safety
    # checks. This endpoint is remote code execution by design; it must only be
    # reachable from trusted clients.
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500

    # errors="replace": output that is not valid UTF-8 is still returned
    # instead of raising UnicodeDecodeError.
    return jsonify({
        'status': 'success',
        'output': stdout.decode(errors="replace"),
        'error': stderr.decode(errors="replace")
    })
if __name__ == '__main__':
    # Start Flask's built-in server when this file is run directly.
    # host="0.0.0.0" listens on every network interface and debug=True turns
    # on Flask's debug mode, so run this only on an isolated/trusted network.
    app.run(host="0.0.0.0", debug=True)