def execute_action(self, action: Dict[str, Any]):
    """
    Executes an action on the server computer.

    The action is translated into one or more ``pyautogui`` statements, each
    of which is forwarded through ``self.execute_python_command``. All
    validation happens before the first statement is sent, so a rejected
    action never leaves the remote machine half-executed.

    Args:
        action: A dict with a mandatory ``"action_type"`` entry and an
            optional ``"parameters"`` dict (a missing, empty or ``None``
            value means "no parameters"). Recognised action types and the
            parameters they read:

            * ``MOVE_TO``, ``RIGHT_CLICK``, ``DOUBLE_CLICK``: nothing, or
              ``x`` and ``y``.
            * ``CLICK``: nothing, ``button``, ``x`` and ``y``, or all three.
            * ``MOUSE_DOWN``, ``MOUSE_UP``: nothing, or ``button``.
            * ``DRAG_TO``: ``x`` and ``y`` (drags with the left button).
            * ``SCROLL``: ``dx`` and/or ``dy``.
            * ``TYPING``: ``text``.
            * ``PRESS``, ``KEY_DOWN``, ``KEY_UP``: ``key`` (its lower-cased
              form must be in ``KEYBOARD_KEYS``).
            * ``HOTKEY``: ``keys``, a list of such keys.

    Raises:
        KeyError: If ``action`` has no ``"action_type"`` entry.
        ValueError: If the action type is unknown, the parameters do not
            match the action type, or a key is not in ``KEYBOARD_KEYS``.
            ``ValueError`` is an ``Exception`` subclass, so callers that
            catch ``Exception`` keep working.
    """
    action_type = action["action_type"]
    # ``or {}`` also normalises an explicit ``"parameters": None``; the old
    # ``parameters == {} or None`` test never matched ``None``.
    parameters = action.get("parameters") or {}

    def unknown_parameters() -> Exception:
        # Build (not raise) the shared "bad parameters" error.
        return ValueError(f"Unknown parameters: {parameters}")

    def quote(value: Any) -> str:
        # Render a value as a Python string literal. ``repr`` escapes quotes
        # and backslashes, so text such as "it's" or the "'" key can no
        # longer break (or inject into) the generated command.
        return repr(str(value))

    def require(name: str) -> Any:
        # Fetch a mandatory parameter or reject the action.
        if name not in parameters:
            raise unknown_parameters()
        return parameters[name]

    def validated_key(key: str) -> str:
        # Accept only keys whose lower-cased form is listed in KEYBOARD_KEYS.
        if key.lower() not in KEYBOARD_KEYS:
            raise ValueError(f"Key must be one of {KEYBOARD_KEYS}")
        return key

    has_point = "x" in parameters and "y" in parameters
    x = parameters.get("x")
    y = parameters.get("y")

    # Action types that share a branch, mapped to their pyautogui function.
    point_clicks = {"RIGHT_CLICK": "rightClick", "DOUBLE_CLICK": "doubleClick"}
    button_actions = {"MOUSE_DOWN": "mouseDown", "MOUSE_UP": "mouseUp"}
    key_actions = {"PRESS": "press", "KEY_DOWN": "keyDown", "KEY_UP": "keyUp"}

    if action_type == "MOVE_TO":
        if not parameters:
            commands = ["pyautogui.moveTo()"]
        elif has_point:
            commands = [f"pyautogui.moveTo({x}, {y})"]
        else:
            raise unknown_parameters()

    elif action_type == "CLICK":
        arguments = []
        if "button" in parameters:
            arguments.append(f"button={quote(parameters['button'])}")
        if has_point:
            arguments.append(f"x={x}, y={y}")
        # Reject a lone ``x`` or ``y``, and parameter dicts that contain
        # none of the recognised names.
        lone_coordinate = ("x" in parameters) != ("y" in parameters)
        if lone_coordinate or (parameters and not arguments):
            raise unknown_parameters()
        commands = [f"pyautogui.click({', '.join(arguments)})"]

    elif action_type in button_actions:
        function = button_actions[action_type]
        if not parameters:
            commands = [f"pyautogui.{function}()"]
        elif "button" in parameters:
            commands = [f"pyautogui.{function}(button={quote(parameters['button'])})"]
        else:
            raise unknown_parameters()

    elif action_type in point_clicks:
        function = point_clicks[action_type]
        if not parameters:
            commands = [f"pyautogui.{function}()"]
        elif has_point:
            commands = [f"pyautogui.{function}(x={x}, y={y})"]
        else:
            raise unknown_parameters()

    elif action_type == "DRAG_TO":
        # Previously a missing target was silently ignored; fail like every
        # other action does instead of pretending the drag happened.
        if not has_point:
            raise unknown_parameters()
        commands = [f"pyautogui.dragTo({x}, {y}, button='left')"]

    elif action_type == "SCROLL":
        # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
        commands = []
        if "dx" in parameters:
            commands.append(f"pyautogui.hscroll({parameters['dx']})")
        if "dy" in parameters:
            commands.append(f"pyautogui.vscroll({parameters['dy']})")
        if not commands:
            raise unknown_parameters()

    elif action_type == "TYPING":
        commands = [f"pyautogui.typewrite({quote(require('text'))})"]

    elif action_type in key_actions:
        key = validated_key(require("key"))
        commands = [f"pyautogui.{key_actions[action_type]}({quote(key)})"]

    elif action_type == "HOTKEY":
        keys = require("keys")
        if not isinstance(keys, list):
            raise ValueError("Keys must be a list of keys")
        quoted_keys = ", ".join(quote(validated_key(key)) for key in keys)
        commands = [f"pyautogui.hotkey({quoted_keys})"]

    else:
        raise ValueError(f"Unknown action type: {action_type}")

    # Everything validated: only now talk to the remote machine.
    for command in commands:
        self.execute_python_command(command)