Fix conflicts

2023-12-16 21:32:43 +08:00
parent 7ab3799d30 fe2e5332a7
commit 30064ff816
43 changed files with 4124 additions and 631 deletions
--- a/README.md
+++ b/README.md
@@ -1,7 +1,8 @@
-# DesktopEnv: A Learning Environment for Human-like Computer Task Mastery
+# DesktopEnv: An Environment towards Human-like Computer Task Mastery
 ## Setup guide
 ### For members of the team
 1. Download OS image
   1. Download kubuntu from <https://kubuntu.org/getkubuntu/>
   2. Download ubuntu from <https://ubuntu.com/download/desktop>
@@ -22,7 +23,8 @@
   2. `rm -rf ~/screenshot.png`
 7. Set up Python and install [mouse](https://github.com/boppreh/mouse/), [keyboard](https://github.com/boppreh/keyboard) and [xdotool](https://github.com/jordansissel/xdotool)
-
+### For users of the environment
 todo
 ## Road map (Proposed)
--- a/SERVER_SETUP.md
+++ b/SERVER_SETUP.md
@@ -1,23 +1,6 @@
 # Server Setup Guide
- [Linux](#linux)
+1. Copy and paste the file `server/main.py` to the windows vm
- [Windows](#windows)
+2. Install the requirements `pip install -r requirements.txt`
 ## Linux
 <https://averagelinuxuser.com/ssh-into-virtualbox/>
 1. `sudo apt install openssh-server`
 2. `sudo systemctl enable ssh --now`
 3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
 4. `ip a` - find ip address
 5. `ssh <username>@<ip_address>`
 6. On host, run `ssh-copy-id <username>@<ip_address>`
 ## Windows
 1. Copy and paste the file `windows_server/main.py` to the windows vm
 2. Make sure `mouse` and `keyboard` are installed
 3. Run the file `python main.py`
 4. `ipconfig /all` and find the ip address
--- a/desktop_env/assets/cursor.png
+++ b/desktop_env/assets/cursor.png
--- a/desktop_env/controllers/keyboard.py
+++ b/desktop_env/controllers/keyboard.py
@@ -1,56 +0,0 @@
 from abc import ABC, abstractmethod
 from fabric import Connection
 from .xdotool import XDoToolController
 from .python import PythonController
 class AbstractKeyboardController(ABC):
     """Interface that every keyboard backend has to implement."""
     @abstractmethod
     def type(self, text: str):
         """Enter ``text`` on the controlled machine."""
         raise NotImplementedError()
     @abstractmethod
     def key(self, key: str):
         """Press and release ``key``."""
         raise NotImplementedError()
     @abstractmethod
     def key_down(self, key: str):
         """Press ``key`` without releasing it."""
         raise NotImplementedError()
     @abstractmethod
     def key_up(self, key: str):
         """Release ``key``."""
         raise NotImplementedError()
 class XDoToolKeyboardController(AbstractKeyboardController, XDoToolController):
     """Keyboard backend that sends xdotool keyboard commands through the SSH connection."""
     def __init__(self, ssh_connection: Connection):
         super().__init__(ssh_connection=ssh_connection)
     @staticmethod
     def _quote(argument: str) -> str:
         """Return ``argument`` escaped so that the remote shell sees it as one word."""
         import shlex  # function-scope import: keeps the file's import block untouched
         return shlex.quote(argument)
     def type(self, text: str):
         """Type ``text``; it is shell-quoted so spaces and metacharacters arrive verbatim."""
         self._execute_xdotool_command(f"type {self._quote(text)}")
     def key(self, key: str):
         """Press and release ``key`` (xdotool ``key``)."""
         self._execute_xdotool_command(f"key {self._quote(key)}")
     def key_down(self, key: str):
         """Press ``key`` without releasing it (xdotool ``keydown``)."""
         self._execute_xdotool_command(f"keydown {self._quote(key)}")
     def key_up(self, key: str):
         """Release ``key`` (xdotool ``keyup``)."""
         self._execute_xdotool_command(f"keyup {self._quote(key)}")
 class PythonKeyboardController(AbstractKeyboardController, PythonController):
     """Keyboard backend that runs ``keyboard`` library one-liners via the HTTP server."""
     def __init__(self, http_server: str):
         super().__init__(http_server=http_server)
         # template for the command line; {command} is replaced by one keyboard.* call
         self.command = "python -c \"import keyboard; {command}\""
     def type(self, text: str):
         """Run ``keyboard.write`` with ``text``."""
         snippet = f"keyboard.write('{text}')"
         self._execute_python_command(self.command.format(command=snippet))
     def key(self, key: str):
         """Run ``keyboard.press_and_release`` with ``key``."""
         snippet = f"keyboard.press_and_release('{key}')"
         self._execute_python_command(self.command.format(command=snippet))
     def key_down(self, key: str):
         """Run ``keyboard.press`` with ``key``."""
         snippet = f"keyboard.press('{key}')"
         self._execute_python_command(self.command.format(command=snippet))
     def key_up(self, key: str):
         """Run ``keyboard.release`` with ``key``."""
         snippet = f"keyboard.release('{key}')"
         self._execute_python_command(self.command.format(command=snippet))
--- a/desktop_env/controllers/mouse.py
+++ b/desktop_env/controllers/mouse.py
@@ -1,144 +0,0 @@
 from enum import Enum
 from abc import ABC, abstractmethod
 from fabric import Connection
 from .xdotool import XDoToolController
 from .python import PythonController
 class MouseClick(Enum):
     """Mouse button numbers; wheel movement is modelled as buttons 4 and 5."""
     # physical buttons
     LEFT = 1
     MIDDLE = 2
     RIGHT = 3
     # wheel directions
     WHEEL_UP = 4
     WHEEL_DOWN = 5
 class AbstractMouseController(ABC):
     """Interface that every mouse backend has to implement."""
     @abstractmethod
     def mouse_move(self, x: int, y: int):
         """Move the pointer to ``(x, y)``."""
         raise NotImplementedError()
     @abstractmethod
     def left_down(self):
         """Press the left button."""
         raise NotImplementedError()
     @abstractmethod
     def left_up(self):
         """Release the left button."""
         raise NotImplementedError()
     @abstractmethod
     def left_click(self):
         """Click the left button."""
         raise NotImplementedError()
     @abstractmethod
     def middle_down(self):
         """Press the middle button."""
         raise NotImplementedError()
     @abstractmethod
     def middle_up(self):
         """Release the middle button."""
         raise NotImplementedError()
     @abstractmethod
     def middle_click(self):
         """Click the middle button."""
         raise NotImplementedError()
     @abstractmethod
     def right_down(self):
         """Press the right button."""
         raise NotImplementedError()
     @abstractmethod
     def right_up(self):
         """Release the right button."""
         raise NotImplementedError()
     @abstractmethod
     def right_click(self):
         """Click the right button."""
         raise NotImplementedError()
     @abstractmethod
     def scroll_up(self):
         """Scroll the wheel up."""
         raise NotImplementedError()
     @abstractmethod
     def scroll_down(self):
         """Scroll the wheel down."""
         raise NotImplementedError()
 class XDoToolMouseController(AbstractMouseController, XDoToolController):
     """Mouse backend that sends xdotool pointer commands through the SSH connection."""
     def __init__(self, ssh_connection: Connection):
         super().__init__(ssh_connection=ssh_connection)
     def _button_command(self, verb: str, button: MouseClick):
         """Send ``<verb> <n>`` where ``n`` is the numeric value of ``button``."""
         self._execute_xdotool_command(f"{verb} {button.value}")
     def mouse_move(self, x: int, y: int):
         """Move the pointer to ``(x, y)`` (xdotool ``mousemove``)."""
         self._execute_xdotool_command(f"mousemove {x} {y}")
     def left_down(self):
         self._button_command("mousedown", MouseClick.LEFT)
     def left_up(self):
         self._button_command("mouseup", MouseClick.LEFT)
     def left_click(self):
         self._button_command("click", MouseClick.LEFT)
     def middle_down(self):
         self._button_command("mousedown", MouseClick.MIDDLE)
     def middle_up(self):
         self._button_command("mouseup", MouseClick.MIDDLE)
     def middle_click(self):
         self._button_command("click", MouseClick.MIDDLE)
     def right_down(self):
         self._button_command("mousedown", MouseClick.RIGHT)
     def right_up(self):
         self._button_command("mouseup", MouseClick.RIGHT)
     def right_click(self):
         self._button_command("click", MouseClick.RIGHT)
     def scroll_up(self):
         # scrolling is sent as a click of button 4 (WHEEL_UP)
         self._button_command("click", MouseClick.WHEEL_UP)
     def scroll_down(self):
         # scrolling is sent as a click of button 5 (WHEEL_DOWN)
         self._button_command("click", MouseClick.WHEEL_DOWN)
 class PythonMouseController(AbstractMouseController, PythonController):
     """Mouse backend that runs ``mouse`` library one-liners via the HTTP server."""
     def __init__(self, http_server: str):
         super().__init__(http_server=http_server)
         # template for the command line; {command} is replaced by one mouse.* call
         self.command = "python -c \"import mouse; {command}\""
     def _run(self, snippet: str):
         """Wrap ``snippet`` in the command template and execute it."""
         self._execute_python_command(self.command.format(command=snippet))
     def mouse_move(self, x: int, y: int):
         self._run(f"mouse.move({x}, {y})")
     def left_down(self):
         self._run("mouse.press(button='left')")
     def left_up(self):
         self._run("mouse.release(button='left')")
     def left_click(self):
         self._run("mouse.click(button='left')")
     def middle_down(self):
         self._run("mouse.press(button='middle')")
     def middle_up(self):
         self._run("mouse.release(button='middle')")
     def middle_click(self):
         self._run("mouse.click(button='middle')")
     def right_down(self):
         self._run("mouse.press(button='right')")
     def right_up(self):
         self._run("mouse.release(button='right')")
     def right_click(self):
         self._run("mouse.click(button='right')")
     def scroll_up(self):
         # a positive wheel delta is used for "up", a negative one for "down"
         self._run("mouse.wheel(10)")
     def scroll_down(self):
         self._run("mouse.wheel(-10)")
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -1,34 +1,208 @@
 import requests
 import json
 from typing import Any, Dict
 import requests
 from desktop_env.envs.actions import KEYBOARD_KEYS
 class PythonController:
-    def __init__(self, http_server: str):
+    def __init__(self, http_server: str, pkgs_prefix: str = "python -c \"import pyautogui; {command}\""):
        self.http_server = http_server
-    
+        self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
-    def _execute_python_command(self, command: str) -> None:
+
-        payload = json.dumps({
+    def get_screenshot(self):
-            "command": command
+        """
-        })
+        Gets a screenshot from the server. With the cursor.
        """
        response = requests.get(self.http_server + "/screenshot")
        if response.status_code == 200:
            return response.content
        else:
            print("Failed to get screenshot. Status code:", response.status_code)
            return None
    def get_file(self, file_path: str):
        """
        Asks the server's /file endpoint for ``file_path``.
        Returns the response body on HTTP 200, otherwise prints the status code and returns None.
        """
        reply = requests.post(self.http_server + "/file", data={"file_path": file_path})
        if reply.status_code != 200:
            print("Failed to get file. Status code:", reply.status_code)
            return None
        print("File downloaded successfully")
        return reply.content
    def execute_python_command(self, command: str) -> None:
        """
        Executes a python command on the server.
        It can be used to execute the pyautogui commands, or... any other python command. who knows?
        """
        command = self.pkgs_prefix.format(command=command)
        payload = json.dumps({"command": command})
        headers = {
            'Content-Type': 'application/json'
        }
-        
+
        try:
            response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
            if response.status_code == 200:
                print("Command executed successfully:", response.text)
            else:
                print("Failed to execute command. Status code:", response.status_code)
            return response.json()
        except requests.exceptions.RequestException as e:
            print("An error occurred while trying to execute the command:", e)
-# example usage
+    def execute_action(self, action: Dict[str, Any]):
-if __name__ == '__main__':
+        """
-    # replace with your actual server URL of the vm
+        Executes an action on the server computer.
-    server_url = "http://192.168.7.129:5000"  
+        """
    controller = PythonController(server_url)
-    # example commands    
+        action_type = action["action_type"]
-    python_command = "python -c \"import keyboard; keyboard.write('hello world')\""
+        parameters = action["parameters"] if "parameters" in action else {}
-    python_command = "python -c \"import mouse; mouse.move(100,100);mouse.right_click()\""
+
-    controller._execute_python_command(python_command)
+        if action_type == "MOVE_TO":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.moveTo()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.moveTo({x}, {y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "CLICK":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.click()")
            elif "button" in parameters and "x" in parameters and "y" in parameters:
                button = parameters["button"]
                x = parameters["x"]
                y = parameters["y"]
                if "num_clicks" in parameters:
                    num_clicks = parameters["num_clicks"]
                    self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y}, clicks={num_clicks})")
                else:
                    self.execute_python_command(f"pyautogui.click(button='{button}', x={x}, y={y})")
            elif "button" in parameters and "x" not in parameters and "y" not in parameters:
                button = parameters["button"]
                if "num_clicks" in parameters:
                    num_clicks = parameters["num_clicks"]
                    self.execute_python_command(f"pyautogui.click(button='{button}', clicks={num_clicks})")
                else:
                    self.execute_python_command(f"pyautogui.click(button='{button}')")
            elif "button" not in parameters and "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                if "num_clicks" in parameters:
                    num_clicks = parameters["num_clicks"]
                    self.execute_python_command(f"pyautogui.click(x={x}, y={y}, clicks={num_clicks})")
                else:
                    self.execute_python_command(f"pyautogui.click(x={x}, y={y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "MOUSE_DOWN":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.mouseDown()")
            elif "button" in parameters:
                button = parameters["button"]
                self.execute_python_command(f"pyautogui.mouseDown(button='{button}')")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "MOUSE_UP":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.mouseUp()")
            elif "button" in parameters:
                button = parameters["button"]
                self.execute_python_command(f"pyautogui.mouseUp(button='{button}')")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "RIGHT_CLICK":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.rightClick()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.rightClick(x={x}, y={y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "DOUBLE_CLICK":
            if parameters == {} or None:
                self.execute_python_command(f"pyautogui.doubleClick()")
            elif "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.doubleClick(x={x}, y={y})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "DRAG_TO":
            if "x" in parameters and "y" in parameters:
                x = parameters["x"]
                y = parameters["y"]
                self.execute_python_command(f"pyautogui.dragTo({x}, {y}, duration=1.0, button='left', mouseDownUp=True)")
        elif action_type == "SCROLL":
            # todo: check if it is related to the operating system, as https://github.com/TheDuckAI/DuckTrack/blob/main/ducktrack/playback.py pointed out
            if "dx" in parameters and "dy" in parameters:
                dx = parameters["dx"]
                dy = parameters["dy"]
                self.execute_python_command(f"pyautogui.hscroll({dx})")
                self.execute_python_command(f"pyautogui.vscroll({dy})")
            elif "dx" in parameters and "dy" not in parameters:
                dx = parameters["dx"]
                self.execute_python_command(f"pyautogui.hscroll({dx})")
            elif "dx" not in parameters and "dy" in parameters:
                dy = parameters["dy"]
                self.execute_python_command(f"pyautogui.vscroll({dy})")
            else:
                raise Exception(f"Unknown parameters: {parameters}")
        elif action_type == "TYPING":
            if "text" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            text = parameters["text"]
            self.execute_python_command(f"pyautogui.typewrite('{text}')")
        elif action_type == "PRESS":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            self.execute_python_command(f"pyautogui.press('{key}')")
        elif action_type == "KEY_DOWN":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            self.execute_python_command(f"pyautogui.keyDown('{key}')")
        elif action_type == "KEY_UP":
            if "key" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            key = parameters["key"]
            if key.lower() not in KEYBOARD_KEYS:
                raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            self.execute_python_command(f"pyautogui.keyUp('{key}')")
        elif action_type == "HOTKEY":
            if "keys" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            keys = parameters["keys"]
            if not isinstance(keys, list):
                raise Exception(f"Keys must be a list of keys")
            for key in keys:
                if key.lower() not in KEYBOARD_KEYS:
                    raise Exception(f"Key must be one of {KEYBOARD_KEYS}")
            keys_para_rep = "', '".join(keys)
            self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')")
        else:
            raise Exception(f"Unknown action type: {action_type}")
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -0,0 +1,96 @@
 import requests
 import json
 class SetupController:
     """Client for the server's ``/setup`` endpoints (download, wallpaper, open)."""
     def __init__(self, http_server: str):
         # every request is sent relative to <http_server>/setup
         self.http_server = http_server + "/setup"
     def setup(self, config):
         """
         Runs the setup steps in order: download, wallpaper, open.
         Steps whose key is missing from ``config`` (or all of them, if ``config`` is empty) are skipped.
         Setup Config:
         {
             download: list[tuple[string]], # a list of tuples of url of file to download and the save path
             wallpaper: string, # path sent to the change_wallpaper endpoint
             open: list[string], # paths sent to the open_file endpoint
             ...
         }
         """
         self._download_setup(config)
         self._change_wallpaper(config)
         # self._tidy_desktop(config) todo: implement this
         self._open_setup(config)
         # can add other setup steps
     def _post_json(self, endpoint: str, payload: dict, failure_message: str) -> None:
         """
         POSTs ``payload`` as JSON to ``endpoint`` below the setup URL and prints the outcome.
         Best effort: request errors and non-200 replies are reported with print, never raised.
         """
         headers = {'Content-Type': 'application/json'}
         try:
             response = requests.post(self.http_server + endpoint, headers=headers, data=json.dumps(payload))
         except requests.exceptions.RequestException as e:
             print("An error occurred while trying to send the request:", e)
             return
         if response.status_code == 200:
             print("Command executed successfully:", response.text)
         else:
             # report the real status code next to the body (the label used to be followed by the body only)
             print(f"{failure_message} Status code:", response.status_code, response.text)
     def _download_setup(self, config):
         """Asks the server to download every ``(url, path)`` pair in ``config['download']``; raises Exception on an empty url or path."""
         if not config or 'download' not in config:
             return
         for url, path in config['download']:
             if not url or not path:
                 raise Exception(f"Setup Download - Invalid URL ({url}) or path ({path}).")
             self._post_json("/download_file", {"url": url, "path": path}, "Failed to download file.")
     def _change_wallpaper(self, config):
         """Asks the server to set the wallpaper to ``config['wallpaper']``; raises Exception on an empty path."""
         if not config or 'wallpaper' not in config:
             return
         path = config['wallpaper']
         if not path:
             raise Exception(f"Setup Wallpaper - Invalid path ({path}).")
         self._post_json("/change_wallpaper", {"path": path}, "Failed to change wallpaper.")
     def _tidy_desktop(self, config):
         raise NotImplementedError
     def _open_setup(self, config):
         """Asks the server to open every path in ``config['open']``; raises Exception on an empty path."""
         if not config or 'open' not in config:
             return
         for path in config['open']:
             if not path:
                 raise Exception(f"Setup Open - Invalid path ({path}).")
             self._post_json("/open_file", {"path": path}, "Failed to open file.")
--- a/desktop_env/controllers/xdotool.py
+++ b/desktop_env/controllers/xdotool.py
@@ -1,11 +0,0 @@
 from fabric import Connection
 from typing import List
 class XDoToolController:
     """Base class that runs xdotool commands through an SSH connection object."""
     def __init__(self, ssh_connection: "Connection"):
         self.ssh_connection = ssh_connection
     def _execute_xdotool_command(self, command: str) -> str:
         """
         Runs ``xdotool <command>`` with ``DISPLAY=:0`` via ``ssh_connection.run`` (output hidden).
         ``command`` is the already-formatted argument string, e.g. ``"key a"``.
         Returns the command's stdout with surrounding whitespace stripped.
         """
         result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {command}", hide=True)
         return result.stdout.strip()
--- a/desktop_env/envs/actions.py
+++ b/desktop_env/envs/actions.py
@@ -0,0 +1,190 @@
 # Upper bounds used as the "range" of the x / y parameters in ACTION_SPACE.
 X_MAX = 1920  # TODO: get the screen resolution
 Y_MAX = 1080
 KEYBOARD_KEYS = ['\t', '\n', '\r', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', 'accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright']
 def _param(kind, allowed, optional):
     """Build one parameter spec with the keys "type", "range" and "optional"."""
     return {"type": kind, "range": allowed, "optional": optional}
 def _xy(optional):
     """Build the "x" / "y" coordinate specs, bounded by X_MAX and Y_MAX."""
     return {
         "x": _param(float, [0, X_MAX], optional),
         "y": _param(float, [0, Y_MAX], optional),
     }
 def _button():
     """Build the optional mouse "button" spec."""
     return {"button": _param(str, ["left", "right", "middle"], True)}
 def _key():
     """Build the mandatory "key" spec restricted to KEYBOARD_KEYS."""
     return {"key": _param(str, KEYBOARD_KEYS, False)}
 def _action(action_type, note, parameters):
     """Build one ACTION_SPACE entry."""
     return {"action_type": action_type, "note": note, "parameters": parameters}
 # Description of every supported action: its name, a human-readable note and its parameter specs.
 ACTION_SPACE = [
     _action("MOVE_TO", "move the cursor to the specified position", _xy(False)),
     _action(
         "CLICK",
         "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
         {**_button(), **_xy(True), "num_clicks": _param(int, [1, 2, 3], True)},
     ),
     _action(
         "MOUSE_DOWN",
         "press the left button if the button not specified, otherwise press the specified button",
         _button(),
     ),
     _action(
         "MOUSE_UP",
         "release the left button if the button not specified, otherwise release the specified button",
         _button(),
     ),
     _action(
         "RIGHT_CLICK",
         "right click at the current position if x and y are not specified, otherwise right click at the specified position",
         _xy(True),
     ),
     _action(
         "DOUBLE_CLICK",
         "double click at the current position if x and y are not specified, otherwise double click at the specified position",
         _xy(True),
     ),
     _action(
         "DRAG_TO",
         "drag the cursor to the specified position with the left button pressed",
         _xy(False),
     ),
     _action(
         "SCROLL",
         "scroll the mouse wheel up or down",
         {"dx": _param(int, None, False), "dy": _param(int, None, False)},
     ),
     _action("TYPING", "type the specified text", {"text": _param(str, None, False)}),
     _action("PRESS", "press the specified key and release it", _key()),
     _action("KEY_DOWN", "press the specified key", _key()),
     _action("KEY_UP", "release the specified key", _key()),
     _action(
         "HOTKEY",
         "press the specified key combination",
         {"keys": _param(list, [KEYBOARD_KEYS], False)},
     ),
 ]
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,78 +1,61 @@
-from enum import Enum
+from __future__ import annotations
-from typing import Literal, List, Tuple
+
 import os
 import subprocess
 from fabric import Connection
 import time
 import uuid
 import platform
 from typing import List
 import gymnasium as gym
-from gymnasium import spaces
+import requests
 import numpy as np
 from PIL import Image
-from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
+from desktop_env.controllers.python import PythonController
-from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
+from desktop_env.controllers.setup import SetupController
-
+from desktop_env.evaluators import eval_funcs
 class Action(Enum):
     """Kinds of action, numbered 0-7."""
     # mouse actions
     CLICK = 0
     MOUSE_DOWN = 1
     MOUSE_UP = 2
     MOUSE_MOVE = 3
     # keyboard actions
     KEY = 4
     KEY_DOWN = 5
     KEY_UP = 6
     TYPE = 7
-VM_TYPE = Literal['ubuntu', 'windows']
+def _execute_command(command: List[str]) -> None:
    if command[:4] == ["vmrun", "-T", "ws", "start"]:
        p = subprocess.Popen(command)
        p.wait()
    else:
        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
        if result.returncode != 0:
            raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
        return result.stdout
 class DesktopEnv(gym.Env):
    """DesktopEnv with OpenAI Gym interface."""
-    def __init__(self, path_to_vm: str, username: str, password: str,
+    def __init__(
-                 host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
+            self,
            path_to_vm: str,
            snapshot_path: str = "base",
            instruction: str = None,
            config: dict = None,
            evaluator: dict = None,
            action_space: str = "computer_13",
    ):
        # Initialize environment variables
        self.path_to_vm = path_to_vm
        self.username = username
        self.password = password
        self.host = host
        self.snapshot_path = snapshot_path  # todo: handling the logic of snapshot directory
-        self.screen_width = 800
+        # Initialize emulator and controller
        self.screen_height = 800
        # Define the action and observation space
        self.action_space = spaces.Dict({
            "action_type": spaces.Discrete(len(Action)),
            "click_type": spaces.Discrete(len(MouseClick)),
            "x": spaces.Discrete(self.screen_width),
            "y": spaces.Discrete(self.screen_height),
            "key": spaces.MultiDiscrete([128] * 10),  # max 10 characters, ASCII
            "text": spaces.MultiDiscrete([128] * 10)  # max 10 characters, ASCII
        })
        self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
        # Additional setup
        self.metadata = {'render.modes': ['rgb_array']}
        # Initialize emulator
        print("Initializing...")
        self._start_emulator()
        self.host = f"http://{self._get_vm_ip()}:5000"
        self.controller = PythonController(http_server=self.host)
        self.setup_controller = SetupController(http_server=self.host)
        self.instruction = instruction
        self.config = config
        self.evaluator = evaluator
-        # set up controllers
+        # mode: human or machine
-        self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)
+        assert action_space in ["computer_13", "pyautogui"]
-
+        self.action_space = action_space
-    def _create_controllers(self, vm_os: VM_TYPE) -> Tuple[AbstractMouseController, AbstractKeyboardController]:
+        # todo: define the action space and the observation space as gym did, or extend theirs
        if vm_os == "ubuntu":
            ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password})
            mouse_controller = XDoToolMouseController(ssh_connection)
            keyboard_controller = XDoToolKeyboardController(ssh_connection)
        elif vm_os == "windows":
            mouse_controller = PythonMouseController(http_server=self.host)
            keyboard_controller = PythonKeyboardController(http_server=self.host)
        else:
            raise NotImplementedError(vm_os)
        return mouse_controller, keyboard_controller
    def _start_emulator(self):
        while True:
@@ -84,108 +67,120 @@ class DesktopEnv(gym.Env):
                    break
                else:
                    print("Starting VM...")
-                    self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
+                    _execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
-                    time.sleep(5)
+                    time.sleep(3)
            except subprocess.CalledProcessError as e:
                print(f"Error executing command: {e.output.decode().strip()}")
-    def _execute_command(self, command: List[str]) -> None:
+    def _get_vm_ip(self):
-        process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
+        max_retries = 10
-        stdout, stderr = process.communicate()
+        print("Getting IP Address...")
-        if process.returncode != 0:
+        for _ in range(max_retries):
-            print(f"Error executing command: {command}")
+            try:
-            return None
+                output = _execute_command(["vmrun", "-T", "ws", "getGuestIPAddress", self.path_to_vm]).strip()
-        else:
+                print(f"IP address: {output}")
-            return stdout.decode()
+                return output
            except:
                time.sleep(5)
                print("Retrying...")
        raise Exception("Failed to get VM IP address!")
    def _save_state(self):
-        self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
+        _execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
    def _get_screenshot(self):
-        image_path = "./screenshot.png"
+        random_uuid = str(uuid.uuid4())
-        self._execute_command(
+        os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
-            ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
+        image_path = os.path.join("tmp", random_uuid, "screenshot.png")
-             image_path])
+
        # Get the screenshot and save to the image_path
        screenshot = self.controller.get_screenshot()
        with open(image_path, "wb") as f:
            f.write(screenshot)
        return image_path
    def _get_obs(self):
        screenshot_image_path = self._get_screenshot()
-        with Image.open(screenshot_image_path) as img:
+        return screenshot_image_path
            return np.array(img)
-    def reset(self):
+    def reset(self, seed=None, options=None):
        print("Resetting environment...")
        print("Reverting to snapshot to {}...".format(self.snapshot_path))
-        self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
+        _execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
        time.sleep(5)
        print("Starting emulator...")
        self._start_emulator()
        print("Emulator started.")
        print("Setting up environment...")
        self.setup_controller.setup(self.config)
        time.sleep(5)
        print("Environment setup complete.")
        observation = self._get_obs()
        return observation
-    def step(self, action):
+    def step(self, action, pause=0.5):
-        action_type = Action(action['action_type'])
+        # fixme: add reminding logic here, decide if the action is valid for the current action_space
-        if action_type == Action.CLICK:
+        if self.action_space == "computer_13":
-            click = MouseClick(action['click_type'])
+            # the set of all possible actions defined in the action representation
-            if click == MouseClick.LEFT:
+            self.controller.execute_action(action)
-                self.mouse_controller.left_click()
+        elif self.action_space == "pyautogui":
-            elif click == MouseClick.MIDDLE:
+            # the set of all possible python commands insides `pyautogui`
-                self.mouse_controller.middle_click()
+            self.controller.execute_python_command(action)
            elif click == MouseClick.RIGHT:
                self.mouse_controller.right_click()
            elif click == MouseClick.WHEEL_UP:
                self.mouse_controller.scroll_up()
            elif click == MouseClick.WHEEL_DOWN:
                self.mouse_controller.scroll_down()
        elif action_type == Action.MOUSE_DOWN:
            click = MouseClick(action['click_type'])
            if click == MouseClick.LEFT:
                self.mouse_controller.left_down()
            elif click == MouseClick.MIDDLE:
                self.mouse_controller.middle_down()
            elif click == MouseClick.RIGHT:
                self.mouse_controller.right_down()
            elif click == MouseClick.WHEEL_UP:
                self.mouse_controller.scroll_up()
            elif click == MouseClick.WHEEL_DOWN:
                self.mouse_controller.scroll_down()
        elif action_type == Action.MOUSE_UP:
            click = MouseClick(action['click_type'])
            if click == MouseClick.LEFT:
                self.mouse_controller.left_up()
            elif click == MouseClick.MIDDLE:
                self.mouse_controller.middle_up()
            elif click == MouseClick.RIGHT:
                self.mouse_controller.right_up()
            elif click == MouseClick.WHEEL_UP:
                self.mouse_controller.scroll_up()
            elif click == MouseClick.WHEEL_DOWN:
                self.mouse_controller.scroll_down()
        elif action_type == Action.MOUSE_MOVE:
            self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
        elif action_type == Action.KEY:
            key_sequence = ''.join(map(chr, action['key']))  # Convert integer array to string
            self.keyboard_controller.key(key_sequence)
        elif action_type == Action.KEY_DOWN:
            key_sequence = ''.join(map(chr, action['key']))  # Convert integer array to string
            self.keyboard_controller.key_down(key_sequence)
        elif action_type == Action.KEY_UP:
            key_sequence = ''.join(map(chr, action['key']))  # Convert integer array to string
            self.keyboard_controller.key_up(key_sequence)
        elif action_type == Action.TYPE:
            text = ''.join(map(chr, action['text']))  # Convert integer array to string
            self.keyboard_controller.type(text)
-        # Capture new state
+        # todo: maybe for the better here we need to add a logic to wait until the rendering is done
-        observation = self._get_obs()
+        time.sleep(pause)
-        reward = 0  # Define reward calculation
+        observation = {
-        done = False  # Define episode termination condition
+            "screenshot": self._get_obs(),
            "instruction": self.instruction
        }
        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
        info = {}
        return observation, reward, done, info
    def evaluate(self):
        """
        Evaluate whether the task is successfully completed.

        The function registered in ``eval_funcs`` under ``self.evaluator["func"]``
        is called with one keyword argument per entry of
        ``self.evaluator["paths"]``; each argument is the path of a local copy
        of the referenced file.

        Returns:
            Whatever the selected evaluation function returns.

        Raises:
            TypeError: if ``self.evaluator`` is not a mapping with ``func`` and
                ``paths`` (e.g. it is still a placeholder string).
            KeyError: if the evaluator names a function missing from ``eval_funcs``.
            NotImplementedError: if a file entry has an unsupported ``type``.
        """

        def copy_file_to_local(_file_info):
            # Each file gets its own uuid-named directory under ./tmp.
            random_uuid = str(uuid.uuid4())
            os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
            _path = os.path.join("tmp", random_uuid, "tmp.xlsx")

            if _file_info["type"] == "cloud_file":
                url = _file_info["path"]
                # A streamed response keeps its connection open until it is
                # closed; the context manager releases it even on failure.
                with requests.get(url, stream=True) as response:
                    response.raise_for_status()

                    with open(_path, 'wb') as f:
                        for chunk in response.iter_content(chunk_size=8192):
                            if chunk:
                                f.write(chunk)
            elif _file_info["type"] == "vm_file":
                # fixme: stream this part maybe as well
                file = self.controller.get_file(_file_info["path"])
                with open(_path, "wb") as f:
                    f.write(file)
            else:
                raise NotImplementedError

            return _path

        # todo: make this more flexible by refactoring
        try:
            func_name = self.evaluator["func"]
            file_infos = self.evaluator["paths"]
        except TypeError as err:
            # Same exception type as before, but with a message that says
            # what is actually wrong with the example's evaluator entry.
            raise TypeError(
                "evaluator must be a mapping with 'func' and 'paths', got {!r}".format(self.evaluator)
            ) from err

        eval_func = eval_funcs[func_name]
        eval_func_vars = {}
        for var_name, file_info in file_infos.items():
            path = copy_file_to_local(file_info)
            eval_func_vars[var_name] = path

        return eval_func(**eval_func_vars)
    def render(self, mode='rgb_array'):
        if mode == 'rgb_array':
            return self._get_obs()
@@ -193,4 +188,4 @@ class DesktopEnv(gym.Env):
            raise ValueError('Unsupported render mode: {}'.format(mode))
    def close(self):
-        self._execute_command(["vmrun", "stop", self.path_to_vm])
+        _execute_command(["vmrun", "stop", self.path_to_vm])
--- a/desktop_env/evaluators/init.py
+++ b/desktop_env/evaluators/init.py
@@ -0,0 +1,5 @@
 from .table import compare_table
 eval_funcs = {
    "compare_table(expected, actual)": compare_table
 }
--- a/desktop_env/evaluators/replay.py
+++ b/desktop_env/evaluators/replay.py
--- a/desktop_env/evaluators/table.py
+++ b/desktop_env/evaluators/table.py
@@ -0,0 +1,14 @@
 def compare_table(expected, actual):
    """Return 1 if the two Excel files load into equal DataFrames, otherwise 0."""
    import pandas as pd

    expected_df = pd.read_excel(expected)
    actual_df = pd.read_excel(actual)

    # DataFrame.equals yields a bool; callers receive it as an integer score.
    return int(expected_df.equals(actual_df))
 if __name__ == '__main__':
    path1 = ""
    path2 = ""
    print(compare_table(path1, path2))
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -0,0 +1,184 @@
 import os
 from pathlib import Path
 import platform
 import subprocess
 import requests
 import Xlib.display
 import pyautogui
 from PIL import ImageGrab, Image
 from flask import Flask, request, jsonify, send_file
 app = Flask(__name__)
 pyautogui.PAUSE = 0
 pyautogui.DARWIN_CATCH_UP_TIME = 0
@app.route('/execute', methods=['POST'])
 def execute_command():
    data = request.json
    # The 'command' key in the JSON request should contain the command to be executed.
    command = data.get('command', '')
    # Execute the command without any safety checks.
    try:
        result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
        return jsonify({
            'status': 'success',
            'output': result.stdout,
            'error': result.stderr
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
@app.route('/screenshot', methods=['GET'])
 def capture_screen_with_cursor():
    file_path = os.path.join("screenshots", "screenshot.png")
    user_platform = platform.system()
    # Ensure the screenshots directory exists
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    if user_platform == "Windows":
        def _download_image(url, path):
            response = requests.get(url)
            with open(path, 'wb') as file:
                file.write(response.content)
        cursor_path = os.path.join("screenshots", "cursor.png")
        if not os.path.exists(cursor_path):
            cursor_url = "https://vip.helloimg.com/images/2023/12/02/oQPzmt.png"
            _download_image(cursor_url, cursor_path)
        screenshot = pyautogui.screenshot()
        cursor_x, cursor_y = pyautogui.position()
        cursor = Image.open(cursor_path)
        screenshot.paste(cursor, (cursor_x, cursor_y), cursor)
        screenshot.save(file_path)
    elif user_platform == "Linux":
        # Use xlib to prevent scrot dependency for Linux
        screen = Xlib.display.Display().screen()
        size = screen.width_in_pixels, screen.height_in_pixels
        screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
        screenshot.save(file_path)
    elif user_platform == "Darwin":  # (Mac OS)
        # Use the screencapture utility to capture the screen with the cursor
        subprocess.run(["screencapture", "-C", file_path])
    else:
        print(f"The platform you're using ({user_platform}) is not currently supported")
    return send_file(file_path, mimetype='image/png')
@app.route('/file', methods=['POST'])
 def get_file():
    # Retrieve filename from the POST request
    if 'file_path' in request.form:
        file_path = request.form['file_path']
    else:
        return jsonify({"error": "file_path is required"}), 400
    try:
        # Check if the file exists and send it to the user
        return send_file(file_path, as_attachment=True)
    except FileNotFoundError:
        # If the file is not found, return a 404 error
        return jsonify({"error": "File not found"}), 404
@app.route('/platform', methods=['GET'])
 def get_platform():
    return platform.system()
@app.route('/cursor_position', methods=['GET'])
 def get_cursor_position():
    return pyautogui.position().x, pyautogui.position().y
@app.route("/setup/change_wallpaper", methods=['POST'])
 def change_wallpaper():
    data = request.json
    path = data.get('path', None)
    if not path:
        return "Path not supplied!", 400
    path = Path(path)
    if not path.exists():
        return f"File not found: {path}", 404
    try:
        user_platform = platform.system()
        if user_platform == "Windows":
            import ctypes
            ctypes.windll.user32.SystemParametersInfoW(20, 0, str(path), 3)
        elif user_platform == "Linux":
            import subprocess
            subprocess.run(["gsettings", "set", "org.gnome.desktop.background", "picture-uri", f"file://{path}"])
        elif user_platform == "Darwin":  # (Mac OS)
            import subprocess
            subprocess.run(
                ["osascript", "-e", f'tell application "Finder" to set desktop picture to POSIX file "{path}"'])
        return "Wallpaper changed successfully"
    except Exception as e:
        return f"Failed to change wallpaper. Error: {e}", 500
@app.route("/setup/download_file", methods=['POST'])
 def download_file():
    data = request.json
    url = data.get('url', None)
    path = data.get('path', None)
    if not url or not path:
        return "Path or URL not supplied!", 400
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    max_retries = 3
    for i in range(max_retries):
        try:
            response = requests.get(url, stream=True)
            response.raise_for_status()
            with open(path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)
            return "File downloaded successfully"
        except requests.RequestException as e:
            print(f"Failed to download {url}. Retrying... ({max_retries - i - 1} attempts left)")
    return f"Failed to download {url}. No retries left. Error: {e}", 500
@app.route("/setup/open_file", methods=['POST'])
 def open_file():
    data = request.json
    path = data.get('path', None)
    if not path:
        return "Path not supplied!", 400
    path = Path(path)
    if not path.exists():
        return f"File not found: {path}", 404
    try:
        os.startfile(path)
        return "File opened successfully"
    except Exception as e:
        return f"Failed to open {path}. Error: {e}", 500
 if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0")
--- a/desktop_env/server/requirements.txt
+++ b/desktop_env/server/requirements.txt
@@ -0,0 +1,5 @@
 python3-xlib==0.15
 PyAutoGUI==0.9.54
 Pillow==10.1.0
 git+https://github.com/moses-palmer/pynput.git@refs/pull/541/head # to make sure that it works on Apple Silicon
 requests
--- a/desktop_env/windows_server/main.py
+++ b/desktop_env/windows_server/main.py
@@ -1,29 +0,0 @@
 from flask import Flask, request, jsonify
 import subprocess
 app = Flask(__name__)
@app.route('/execute', methods=['POST'])
 def execute_command():
    data = request.json
    # The 'command' key in the JSON request should contain the command to be executed.
    command = data.get('command', '')
    # Execute the command without any safety checks.
    try:
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        stdout, stderr = process.communicate()
        return jsonify({
            'status': 'success',
            'output': stdout.decode(),
            'error': stderr.decode()
        })
    except Exception as e:
        return jsonify({
            'status': 'error',
            'message': str(e)
        }), 500
 if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0")
--- a/evaluation_examples/README.md
+++ b/evaluation_examples/README.md
@@ -0,0 +1,24 @@
 # Evaluation examples
 Here we put the data examples to benchmark the ability of agents when interacting with GUI.
 The examples are stored in `./examples`, where each data item is formatted as:
 ```
 {
    "id": "uid", # unique id
    "snapshot": "snapshot_id", # the snapshot id of the environment, with some data already there and apps already opened, or just desktop
    "instruction": "natural_language_instruction", # the natural language instruction of the task, what we want the agent to do
    "source": "website_url", # where we know this example, some forum, or some website, or some paper
    "config": {xxx}, # the scripts that set up the download and open-file actions, i.e. the initial state of a task
    "trajectory": "trajectory_directory", # the trajectory directory, which contains the action sequence file, the screenshots and the recording video
    "related_apps": ["app1", "app2", ...], # the related apps, which are opened during the task
    "evaluator": "evaluation_dir", # the directory of the evaluator, which contains the evaluation script for this example
 …
 }
 ```
 The `./trajectories` file contains the annotated trajectories for each data item in `./examples` for finishing the task.
 For now, it is under construction, and only tested on Windows 10. Please:
 - Modify the path accordingly to run the evaluation;
 - Remind us if some parts are overfit to our environment.
--- a/evaluation_examples/examples/0bf05a7d-b28b-44d2-955a-50b41e24012a.json
+++ b/evaluation_examples/examples/0bf05a7d-b28b-44d2-955a-50b41e24012a.json
@@ -0,0 +1,22 @@
 {
  "id": "0bf05a7d-b28b-44d2-955a-50b41e24012a",
  "snapshot": "libreoffice_calc",
  "instruction": "I would like to pad all the numbers in the 'Old ID' column with zeros in front, to fill them up to seven digits in the 'New 7 Digit ID' column.",
  "source": "https://www.youtube.com/shorts/FPAQaDTS8VY",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\Customers_New_7digit_Id.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Customers_New_7digit_Id.xlsx"
    ]
  },
  "trajectory": "trajectories/0bf05a7d-b28b-44d2-955a-50b41e24012a",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/2bd59342-0664-4ccb-ba87-79379096cc08.json
+++ b/evaluation_examples/examples/2bd59342-0664-4ccb-ba87-79379096cc08.json
@@ -0,0 +1,22 @@
 {
  "id": "2bd59342-0664-4ccb-ba87-79379096cc08",
  "snapshot": "libreoffice_calc",
  "instruction": "Make sparkline chart line by line",
  "source": "https://www.youtube.com/shorts/L3Z-F1QTQFY",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\OrderId_Month_Chart.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\OrderId_Month_Chart.xlsx"
    ]
  },
  "trajectory": "trajectories/2bd59342-0664-4ccb-ba87-79379096cc08",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json
+++ b/evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json
@@ -0,0 +1,34 @@
 {
  "id": "37608790-6147-45d0-9f20-1137bb35703d",
  "snapshot": "libreoffice_calc",
  "instruction": "Help me fill the columns of First Name, Last Name and Rank",
  "source": "https://www.youtube.com/shorts/uzPo_CPCHH8",
  "config": {
    "download": [
      [
        "https://drive.usercontent.google.com/download?id=1wDqap5cBfxnlqTNrZG61k_wDWTujl6AU&export=download&authuser=0&confirm=t&uuid=fd183b89-76b7-4dc5-880e-1045ed769562&at=APZUnTWp9RMafMg0xohhBWazN3YD:1701785710674",
        "C:\\Users\\tianbaox\\Desktop\\Employee_Roles_and_Ranks.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Employee_Roles_and_Ranks.xlsx"
    ]
  },
  "trajectory": "trajectories/37608790-6147-45d0-9f20-1137bb35703d",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": {
    "func": "compare_table(expected, actual)",
    "paths": {
      "expected": {
        "type": "cloud_file",
        "path": "https://drive.usercontent.google.com/download?id=1dxpiUqP_CVvQp5tddxlwO3Cp1BqJ-ZDE&export=download&authuser=0&confirm=t&uuid=ccd204c7-07ce-4fdf-a5d4-a7e4f37b9ce6&at=APZUnTVBs7TgrVrDXpkiU8S7WbQo:1702360836747"
      },
      "actual": {
        "type": "vm_file",
        "path": "C:\\Users\\tianbaox\\Desktop\\Employee_Roles_and_Ranks.xlsx"
      }
    }
  }
 }
--- a/evaluation_examples/examples/7a4e4bc8-922c-4c84-865c-25ba34136be1.json
+++ b/evaluation_examples/examples/7a4e4bc8-922c-4c84-865c-25ba34136be1.json
@@ -0,0 +1,22 @@
 {
  "id": "7a4e4bc8-922c-4c84-865c-25ba34136be1",
  "snapshot": "libreoffice_calc",
  "instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
  "source": "https://www.youtube.com/shorts/bvUhr1AHs44",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\Name_Order_Id_move_column.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Name_Order_Id_move_column.xlsx"
    ]
  },
  "trajectory": "trajectories/7a4e4bc8-922c-4c84-865c-25ba34136be1",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
+++ b/evaluation_examples/examples/7b802dad-6e0f-4204-9815-d4e3f57627d8.json
@@ -0,0 +1,22 @@
 {
  "id": "7b802dad-6e0f-4204-9815-d4e3f57627d8",
  "snapshot": "libreoffice_calc",
  "instruction": "I would like to sort this table based on cell color, placing all the rows marked with pink at the beginning, while keeping their order among themselves unchanged.",
  "source": "https://www.youtube.com/shorts/Of-lzeP1usE",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\Customer_Sort_by_cell_color.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Customer_Sort_by_cell_color.xlsx"
    ]
  },
  "trajectory": "trajectories/7b802dad-6e0f-4204-9815-d4e3f57627d8",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/7efeb4b1-3d19-4762-b163-63328d66303b.json
+++ b/evaluation_examples/examples/7efeb4b1-3d19-4762-b163-63328d66303b.json
@@ -0,0 +1,22 @@
 {
  "id": "7efeb4b1-3d19-4762-b163-63328d66303b",
  "snapshot": "libreoffice_calc",
  "instruction": "Fill in the Serial Numbers in \"Serial #\" column",
  "source": "https://www.youtube.com/shorts/4jzXfZNhfmk",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\Order_Sales_Serial#.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Order_Sales_Serial#.xlsx"
    ]
  },
  "trajectory": "trajectories/",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/a9f325aa-8c05-4e4f-8341-9e4358565f4f.json
+++ b/evaluation_examples/examples/a9f325aa-8c05-4e4f-8341-9e4358565f4f.json
@@ -0,0 +1,22 @@
 {
  "id": "a9f325aa-8c05-4e4f-8341-9e4358565f4f",
  "snapshot": "libreoffice_calc",
  "instruction": "Clean the messy movie titles and put them in the cleaned column",
  "source": "https://www.youtube.com/shorts/A0gmEBRKXWs",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\"
    ]
  },
  "trajectory": "trajectories/a9f325aa-8c05-4e4f-8341-9e4358565f4f",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/d681960f-7bc3-4286-9913-a8812ba3261a.json
+++ b/evaluation_examples/examples/d681960f-7bc3-4286-9913-a8812ba3261a.json
@@ -0,0 +1,34 @@
 {
  "id": "d681960f-7bc3-4286-9913-a8812ba3261a",
  "snapshot": "libreoffice_calc",
  "instruction": "According to the green table shown above, calculate and give each student a grade",
  "source": "https://www.youtube.com/shorts/d7U1S_IsTVM",
  "config": {
    "download": [
      [
        "https://drive.usercontent.google.com/download?id=1wodZjx1KjThUsrtF6ZJaCTy1fQX4E9vA&export=download&authuser=0&confirm=t&uuid=d07ca312-1abc-40f2-81cd-d06e27119854&at=APZUnTWwjnxsHQYapSvpLR8NmlfV:1701785087048",
        "C:\\Users\\tianbaox\\Desktop\\Student_Grades_and_Remarks.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Student_Grades_and_Remarks.xlsx"
    ]
  },
  "trajectory": "trajectories/d681960f-7bc3-4286-9913-a8812ba3261a",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": {
    "func": "compare_table(expected, actual)",
    "paths": {
      "expected": {
        "type": "cloud_file",
        "path": "https://drive.usercontent.google.com/download?id=1kfEHJH1n0yCsQp443IIFvdD9uWv0DWMr&export=download&authuser=0&confirm=t&uuid=d9907f65-8d39-4ecc-8747-b4ed7e6011f5&at=APZUnTXpPAnlh5sD6q-R8oQtqL6g:1702362952170"
      },
      "actual": {
        "type": "vm_file",
        "path": "C:\\Users\\tianbaox\\Desktop\\Student_Grades_and_Remarks.xlsx"
      }
    }
  }
 }
--- a/evaluation_examples/examples/eb03d19a-b88d-4de4-8a64-ca0ac66f426b.json
+++ b/evaluation_examples/examples/eb03d19a-b88d-4de4-8a64-ca0ac66f426b.json
@@ -0,0 +1,22 @@
 {
  "id": "eb03d19a-b88d-4de4-8a64-ca0ac66f426b",
  "snapshot": "libreoffice_calc",
  "instruction": "Traverse the table and paste it below",
  "source": "https://www.youtube.com/shorts/t9JLUaT55UQ",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\"
    ]
  },
  "trajectory": "trajectories/eb03d19a-b88d-4de4-8a64-ca0ac66f426b",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/ecb0df7a-4e8d-4a03-b162-053391d3afaf.json
+++ b/evaluation_examples/examples/ecb0df7a-4e8d-4a03-b162-053391d3afaf.json
@@ -0,0 +1,22 @@
 {
  "id": "ecb0df7a-4e8d-4a03-b162-053391d3afaf",
  "snapshot": "libreoffice_calc",
  "instruction": "Enable each cell in the column\"Pass/Fail/Held\" is a drop down list",
  "source": "https://www.youtube.com/shorts/tXOovKn0H68",
  "config": {
    "download": [
      [
        "",
        "C:\\Users\\tianbaox\\Desktop\\"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\"
    ]
  },
  "trajectory": "trajectories/ecb0df7a-4e8d-4a03-b162-053391d3afaf",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/evaluation_examples/examples/f9584479-3d0d-4c79-affa-9ad7afdd8850.json
+++ b/evaluation_examples/examples/f9584479-3d0d-4c79-affa-9ad7afdd8850.json
@@ -0,0 +1,34 @@
 {
  "id": "f9584479-3d0d-4c79-affa-9ad7afdd8850",
  "snapshot": "libreoffice_calc",
  "instruction": "Fill the missing row and column which show the total value",
  "source": "https://youtube.com/shorts/feldd-Pn48c?si=9xJiem2uAHm6Jshb",
  "config": {
    "download": [
      [
        "https://drive.usercontent.google.com/download?id=1rwhniaClEkF8XFzdfaNUA6GmAiy4syMZ&export=download&authuser=0&confirm=t&uuid=6fdd5b04-85f4-45e1-ad74-368f8f2a82ab&at=APZUnTUP-JxPxLfNls6jXWghblQ5:1701766091851",
        "C:\\Users\\tianbaox\\Desktop\\Quarterly_Product_Sales_by_Zone.xlsx"
      ]
    ],
    "open": [
      "C:\\Users\\tianbaox\\Desktop\\Quarterly_Product_Sales_by_Zone.xlsx"
    ]
  },
  "trajectory": "trajectories/f9584479-3d0d-4c79-affa-9ad7afdd8850",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": {
    "func": "compare_table(expected, actual)",
    "paths": {
      "expected": {
        "type": "cloud_file",
        "path": "https://drive.usercontent.google.com/download?id=17f1wZuJPvUEc5at_Fy3c18VFdOk0x7xz&export=download&authuser=0&confirm=t&uuid=6d2edffd-0ce0-426e-9820-8af25b4667f3&at=APZUnTVh7JS85dwZBaV2hytWQgDK:1702361510956"
      },
      "actual": {
        "type": "vm_file",
        "path": "C:\\Users\\tianbaox\\Desktop\\Quarterly_Product_Sales_by_Zone.xlsx"
      }
    }
  }
 }
--- a/evaluation_examples/examples/template.json
+++ b/evaluation_examples/examples/template.json
@@ -0,0 +1,13 @@
 {
  "id": "",
  "snapshot": "libreoffice_calc",
  "instruction": "",
  "source": "",
  "config": {
  },
  "trajectory": "trajectories/",
  "related_apps": [
    "libreoffice calc"
  ],
  "evaluator": "evaluation_dir"
 }
--- a/main.py
+++ b/main.py
@@ -1,56 +1,51 @@
-from pprint import pprint
+import json
-from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
+from desktop_env.envs.desktop_env import DesktopEnv
 def get_human_action():
    """
    Prompts the human player for an action and returns a structured action.

    The action type is asked for by ``Action`` member name and re-asked until a
    valid name is entered. Depending on the type, the click type, the mouse
    coordinates, the key or the text is requested as well.

    Returns:
        A dict with ``action_type`` plus, where applicable, ``click_type``,
        ``x``/``y``, ``key`` or ``text`` (the latter two as lists of code points).
    """
    print("\nAvailable actions:", [action.name for action in Action])
    valid_action_values = [action.value for action in Action]
    action_type = None
    while action_type not in valid_action_values:
        # Strip the user's answer; previously the prompt text was stripped instead.
        action_name = input("Enter the type of action: ").strip()
        try:
            action_type = Action[action_name].value
        except KeyError:
            # An unknown name asks again instead of crashing the session.
            print("Unknown action:", action_name)
    action = {"action_type": action_type}
    if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
        print("\n Available clicks:", [action.name for action in MouseClick])
        click_type = input("Enter click type: ")
        action["click_type"] = MouseClick[click_type].value
    if action_type == Action.MOUSE_MOVE.value:
        x = int(input("Enter x-coordinate for mouse move: "))
        y = int(input("Enter y-coordinate for mouse move: "))
        action["x"] = x
        action["y"] = y
    if action_type == Action.KEY.value:
        key = input("Enter the key to press: ")
        action["key"] = [ord(c) for c in key]
    if action_type == Action.TYPE.value:
        text = input("Enter the text to type: ")
        action["text"] = [ord(c) for c in text]
    return action
 def human_agent():
    """
    Runs the Gym environment with human input.
    """
-    env = DesktopEnv(path_to_vm="/home/yuri/vmware/Windows 10 x64/Windows 10 x64.vmx",
+
-                    #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
+    with open("evaluation_examples/examples/37608790-6147-45d0-9f20-1137bb35703d.json", "r") as f:
-                 username="user",
+        example = json.load(f)
-                 password="password",
+
-                #  host="192.168.7.128",
+    env = DesktopEnv(
-                 host="http://192.168.7.129:5000",
+        # path_to_vm=r"""C:\Users\tianbaox\Downloads\Windows 10 x64\Windows 10 x64.vmx""",
-                 vm_os="windows")
+        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        action_space="computer_13",
        snapshot_path="base_setup3",
        instruction=example["instruction"],
        config=example["config"],
        evaluator=example["evaluator"]
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    done = False
-    while not done:
+    trajectory = [
-        action = get_human_action()
+        {
-        observation, reward, done, info = env.step(action)
+            "action_type": "MOVE_TO",
            "parameters": {
                "x": 754,
                "y": 1057
            }
        },
        {"action_type": "CLICK", "parameters": {"button": "right", "num_clicks": 1}}
    ]
    for i in range(len(trajectory)):
        # action = get_human_action()
        # action = {
        #     "action_type": 0,
        #     "click_type": 3,
        # }
        print(trajectory[i])
        observation, reward, done, info = env.step(trajectory[i], pause=5)
        print("Observation:", observation)
        print("Reward:", reward)
        print("Info:", info)
@@ -61,8 +56,12 @@ def human_agent():
            print("The episode is done.")
            break
    result = env.evaluate()
    print("Result:", result)
    env.close()
    print("Environment closed.")
 if __name__ == "__main__":
    human_agent()
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,8 +1,12 @@
 # fixme: Needs to be rewritten for the new action space
 import os
 import re
 import base64
 from desktop_env.envs.desktop_env import Action, MouseClick
-import json5
+import json
 import requests
 from mm_agents.gpt_4v_prompt import SYS_PROMPT
 # Function to encode the image
@@ -11,6 +15,38 @@ def encode_image(image_path):
        return base64.b64encode(image_file.read()).decode('utf-8')
 def parse_actions_from_string(input_string):
    """Extract every JSON action object contained in a model response.

    Three response shapes are tried, in this order:

    1. one or more fenced blocks tagged ``json``;
    2. one or more untagged fenced blocks;
    3. the whole string as a single JSON document.

    The first shape that is present decides the result; later shapes are
    not consulted once a fence of an earlier shape was found.

    :param input_string: raw text returned by the model.
    :return: list with one parsed JSON value per fenced block (or a
        one-element list when the whole string is the JSON document).
    :raises ValueError: if the selected content is not valid JSON. Earlier
        versions returned an error *string* for the fenced shapes, which
        callers iterating over the result would then mistake for a list of
        actions; every failure is now reported the same way.
    """
    fence_patterns = (
        r'```json\s+(.*?)\s+```',  # explicitly tagged fences win
        r'```\s+(.*?)\s+```',      # otherwise accept untagged fences
    )
    for pattern in fence_patterns:
        matches = re.findall(pattern, input_string, re.DOTALL)
        if not matches:
            continue
        try:
            return [json.loads(match) for match in matches]
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON: {e}") from e

    # No fence at all: the response itself must be the JSON document.
    try:
        return [json.loads(input_string)]
    except json.JSONDecodeError as e:
        raise ValueError("Invalid response format: " + input_string) from e
 class GPT4v_Agent:
    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
        self.instruction = instruction
@@ -22,18 +58,13 @@ class GPT4v_Agent:
            "Authorization": f"Bearer {api_key}"
        }
        # load prompt from file
        self.prompt = ""
        with open("gpt_4v_prompt.txt", "r") as f:
            self.prompt = f.read()
        self.trajectory = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
-                        "text": self.prompt
+                        "text": SYS_PROMPT
                    },
                ]
            }
@@ -56,6 +87,12 @@ class GPT4v_Agent:
                }
            ]
        })
        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
            if len(self.trajectory[i]["content"]) > 1:
                traj_to_show.append("screenshot_obs")
        print("Trajectory:", traj_to_show)
        payload = {
            "model": self.model,
            "messages": self.trajectory,
@@ -63,11 +100,15 @@ class GPT4v_Agent:
        }
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
-        action = self.parse_action(response.json()['choices'][0]['message']['content'])
+        try:
            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
        except:
            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
            actions = None
-        return action
+        return actions
-    def parse_action(self, response: str):
+    def parse_actions(self, response: str):
        # response example
        """
        ```json
@@ -79,12 +120,7 @@ class GPT4v_Agent:
        """
        # parse from the response
-        if response.startswith("```json"):
+        actions = parse_actions_from_string(response)
            action = json5.loads(response[7:-3])
        elif response.startswith("```"):
            action = json5.loads(response[3:-3])
        else:
            action = json5.loads(response)
        # add action into the trajectory
        self.trajectory.append({
@@ -98,25 +134,28 @@ class GPT4v_Agent:
        })
        # parse action
-        parsed_action = {}
+        parsed_actions = []
-        action_type = Action[action['action_type']].value
+        for action in actions:
-        parsed_action["action_type"] = action_type
+            parsed_action = {}
            action_type = Action[action['action_type']].value
            parsed_action["action_type"] = action_type
-        if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
+            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-            parsed_action["click_type"] = MouseClick[action['click_type']].value
+                parsed_action["click_type"] = MouseClick[action['click_type']].value
-        if action_type == Action.MOUSE_MOVE.value:
+            if action_type == Action.MOUSE_MOVE.value:
-            parsed_action["x"] = action["x"]
+                parsed_action["x"] = action["x"]
-            parsed_action["y"] = action["y"]
+                parsed_action["y"] = action["y"]
-        # fixme: could these two actions be merged??
+            if action_type == Action.KEY.value:
-        if action_type == Action.KEY.value:
+                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys
            parsed_action["key"] = [ord(c) for c in action["key"]]
-        if action_type == Action.TYPE.value:
+            if action_type == Action.TYPE.value:
-            parsed_action["text"] = [ord(c) for c in action["text"]]
+                parsed_action["text"] = action["text"]
-        return parsed_action
+            parsed_actions.append(parsed_action)
        return parsed_actions
 if __name__ == '__main__':
@@ -125,4 +164,3 @@ if __name__ == '__main__':
    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
    print(agent.predict(obs="stackoverflow.png"))
--- a/mm_agents/gpt_4v_prompt_action.py
+++ b/mm_agents/gpt_4v_prompt_action.py
@@ -0,0 +1,54 @@
 # System prompt for the action-dictionary variant of the agent: it lists the
 # action classes and shows the JSON shape expected for each of them.
 # The backtick near the end is written as `\\`` so the prompt text still
 # contains a backslash followed by a backtick without relying on an invalid
 # escape sequence (which newer Python versions warn about).
 # NOTE(review): the last lines both allow "multiple actions at one step" and
 # ask for "one action for each step" - confirm which one is intended.
 SYS_PROMPT = """
 You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
 For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
 Here is the description of the action space:
 Firstly you need to predict the class of your action, select from one below:
 - **MOUSE_MOVE**: move the mouse to a specific position
 - **CLICK**: click on the screen
 - **MOUSE_DOWN**: press the mouse button
 - **MOUSE_UP**: release the mouse button
 - **KEY**: press a key on the keyboard
 - **KEY_DOWN**: press a key on the keyboard
 - **KEY_UP**: release a key on the keyboard
 - **TYPE**: type a string on the keyboard
 Then you need to predict the parameters of your action:
 - For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
 for example, format as:
 ```
 {
  "action_type": "MOUSE_MOVE",
  "x": 1319.11,
  "y": 65.06
 }
 ```
 - For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
 for example, format as:
 ```
 {
  "action_type": "CLICK",
  "click_type": "LEFT"
 }
 ```
 - For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
 for example, format as:
 ```
 {
  "action_type": "KEY",
  "key": "ctrl+c"
 }
 ```
 - For TYPE, you need to specify the text you want to type
 for example, format as:
 ```
 {
  "action_type": "TYPE",
  "text": "hello world"
 }
 ```
 For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\\`).
 You can predict multiple actions at one step, but you should only return one action for each step.
 You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
 """
--- a/mm_agents/gpt_4v_prompt_code.py
+++ b/mm_agents/gpt_4v_prompt_code.py
@@ -0,0 +1,8 @@
 # System prompt for the code-generation variant of the agent: the model is
 # told to answer each screenshot with pyautogui Python code, or `None` when
 # it cannot perform the action.
 SYS_PROMPT = """
 You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
 For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
 You are required to use `pyautogui` to perform the action. 
 Return one line or multiple lines of python code to perform the action each time, be time efficient.
 Return `None` if you cannot perform the action.
 """
--- a/mm_agents/sam_test.py
+++ b/mm_agents/sam_test.py
@@ -0,0 +1,124 @@
 import torch
 from PIL import Image
 import requests
 from transformers import SamModel, SamProcessor
 import numpy as np
 import matplotlib.pyplot as plt
 import os
 os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
 def show_mask(mask, ax, random_color=False):
    """Draw ``mask`` on ``ax`` as an RGBA overlay with alpha 0.6.

    :param mask: array-like whose last two dimensions are height and width;
        it is reshaped to ``(h, w, 1)`` and multiplied by the colour.
    :param ax: object exposing ``imshow`` (e.g. a matplotlib Axes).
    :param random_color: pick a random RGB colour instead of the fixed blue.
    """
    rgba = (
        np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
        if random_color
        else np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
    )
    height, width = mask.shape[-2:]
    # Broadcasting (h, w, 1) against (1, 1, 4) yields an (h, w, 4) image.
    ax.imshow(mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1))
 def show_box(box, ax):
    """Add a green, unfilled rectangle for ``box`` to ``ax``.

    :param box: indexable as ``[x0, y0, x1, y1]`` (two opposite corners).
    :param ax: object exposing ``add_patch`` (e.g. a matplotlib Axes).
    """
    left, top = box[0], box[1]
    width = box[2] - box[0]
    height = box[3] - box[1]
    outline = plt.Rectangle((left, top), width, height, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2)
    ax.add_patch(outline)
 def show_boxes_on_image(raw_image, boxes):
    """Open a 10x10 figure, show ``raw_image`` and outline every box in ``boxes``.

    :param raw_image: image accepted by ``plt.imshow``.
    :param boxes: iterable of boxes in the form ``show_box`` expects.
    """
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    for rect in boxes:
        current_axes = plt.gca()
        show_box(rect, current_axes)
    plt.axis('on')
    plt.show()
 def show_points_on_image(raw_image, input_points, input_labels=None):
    """Open a 10x10 figure, show ``raw_image`` and mark ``input_points`` on it.

    :param raw_image: image accepted by ``plt.imshow``.
    :param input_points: sequence of points, converted with ``np.array`` and
        indexed as a 2-D array.
    :param input_labels: optional per-point labels; when omitted every point
        gets label 1.
    """
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    points = np.array(input_points)
    labels = np.ones_like(points[:, 0]) if input_labels is None else np.array(input_labels)
    show_points(points, labels, plt.gca())
    plt.axis('on')
    plt.show()
 def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
    """Open a 10x10 figure, show ``raw_image`` and draw both points and boxes.

    This function used to be defined twice in a row with identical bodies;
    the second definition merely shadowed the first, so a single copy is kept.

    :param raw_image: image accepted by ``plt.imshow``.
    :param boxes: iterable of boxes in the form ``show_box`` expects.
    :param input_points: sequence of points, converted with ``np.array`` and
        indexed as a 2-D array.
    :param input_labels: optional per-point labels; when omitted every point
        gets label 1.
    """
    plt.figure(figsize=(10, 10))
    plt.imshow(raw_image)
    input_points = np.array(input_points)
    if input_labels is None:
        labels = np.ones_like(input_points[:, 0])
    else:
        labels = np.array(input_labels)
    show_points(input_points, labels, plt.gca())
    for box in boxes:
        show_box(box, plt.gca())
    plt.axis('on')
    plt.show()
 def show_points(coords, labels, ax, marker_size=375):
    """Scatter ``coords`` on ``ax``: label 1 as green stars, label 0 as red stars.

    :param coords: 2-D array of points; column 0 is plotted as x, column 1 as y.
    :param labels: array compared against 1 and 0 to select rows of ``coords``.
    :param ax: object exposing ``scatter`` (e.g. a matplotlib Axes).
    :param marker_size: value passed as ``s`` to ``scatter``.
    """
    for label_value, colour in ((1, 'green'), (0, 'red')):
        selected = coords[labels == label_value]
        ax.scatter(selected[:, 0], selected[:, 1], color=colour, marker='*', s=marker_size,
                   edgecolor='white', linewidth=1.25)
 def show_masks_on_image(raw_image, masks, scores):
    """Show each predicted mask over ``raw_image`` in its own subplot.

    :param raw_image: image convertible with ``np.array``.
    :param masks: tensor of masks; a 4-D input is squeezed first. Each mask
        must support ``.cpu().detach()``.
    :param scores: tensor of scores, squeezed when its first dimension is 1;
        its last dimension gives the number of subplots.
    """
    if masks.ndim == 4:
        masks = masks.squeeze()
    if scores.shape[0] == 1:
        scores = scores.squeeze()

    panel_count = scores.shape[-1]
    _, panels = plt.subplots(1, panel_count, figsize=(15, 15))

    for i, (mask, score) in enumerate(zip(masks, scores)):
        panel = panels[i]
        host_mask = mask.cpu().detach()
        panel.imshow(np.array(raw_image))
        show_mask(host_mask, panel)
        panel.title.set_text(f"Mask {i + 1}, Score: {score.item():.3f}")
        panel.axis("off")
    plt.show()
 # Demo script: run the facebook/sam-vit-huge checkpoint on one sample image
 # and plot the resulting masks with their scores.
 # Use CUDA when torch reports it as available, otherwise stay on the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
 processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
 # The sample image is fetched over HTTP and converted to RGB.
 img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
 raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
 plt.imshow(raw_image)
 inputs = processor(raw_image, return_tensors="pt").to(device)
 # Inference only: no gradients are needed.
 with torch.no_grad():
    outputs = model(**inputs)
 # The predicted masks and both size tensors are moved to the CPU before
 # post-processing (presumably rescaling masks to the original image size -
 # confirm against the transformers documentation).
 masks = processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
 )
 scores = outputs.iou_scores
 show_masks_on_image(raw_image, masks[0], scores)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,16 @@
-numpy
+numpy~=1.24.3
-Pillow
+Pillow~=10.1.0
 fabric
-gymnasium
+gymnasium~=0.28.1
-requests
+requests~=2.31.0
-transformers
+transformers~=4.35.2
-torch
+torch==2.1.1+cu118
 accelerate
 opencv-python~=4.8.1.78
 matplotlib~=3.7.4
 pynput~=1.7.6
 pyautogui~=0.9.54
 psutil~=5.9.6
 tqdm~=4.65.0
 pandas~=2.0.3
 flask~=3.0.0
--- a/screenshot.png
+++ b/screenshot.png
--- a/utils/complex_clicking.json
+++ b/utils/complex_clicking.json
--- a/utils/complex_clicking.jsonl
+++ b/utils/complex_clicking.jsonl
--- a/utils/ducktrack.py
+++ b/utils/ducktrack.py
@@ -3,76 +3,97 @@ import sys, pathlib;
 sys.path.append(str(pathlib.Path(__file__).parents[1]))
 import os
 import math
 import json
 import numpy as np
 from typing import List
-from desktop_env.envs.desktop_env import Action, MouseClick
+from copy import deepcopy
 # Key-name substitutions applied to an event's "name" before it is emitted
 # as the "key" parameter of an action (presumably pynput names mapped to
 # pyautogui names, going by the identifier - confirm).
 pynput2pyautogui_key = {
    "alt_l": "altleft",
    "alt_r": "altright",
 }
 # Key names treated as command (non-text) keys: a "press" whose name is in
 # this list is not folded into typed text by the converter.
 COMMAND_KEYS = ['accept', 'add', 'alt', 'altleft', 'altright', 'apps', 'backspace', 'browserback', 'browserfavorites', 'browserforward', 'browserhome', 'browserrefresh', 'browsersearch', 'browserstop', 'capslock', 'clear', 'convert', 'ctrl', 'ctrlleft', 'ctrlright', 'decimal', 'del', 'delete', 'divide', 'down', 'end', 'enter', 'esc', 'escape', 'execute', 'f1', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f2', 'f20', 'f21', 'f22', 'f23', 'f24', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'final', 'fn', 'hanguel', 'hangul', 'hanja', 'help', 'home', 'insert', 'junja', 'kana', 'kanji', 'launchapp1', 'launchapp2', 'launchmail', 'launchmediaselect', 'left', 'modechange', 'multiply', 'nexttrack', 'nonconvert', 'num0', 'num1', 'num2', 'num3', 'num4', 'num5', 'num6', 'num7', 'num8', 'num9', 'numlock', 'pagedown', 'pageup', 'pause', 'pgdn', 'pgup', 'playpause', 'prevtrack', 'print', 'printscreen', 'prntscrn', 'prtsc', 'prtscr', 'return', 'right', 'scrolllock', 'select', 'separator', 'shift', 'shiftleft', 'shiftright', 'sleep', 'stop', 'subtract', 'tab', 'up', 'volumedown', 'volumemute', 'volumeup', 'win', 'winleft', 'winright', 'yen', 'command', 'option', 'optionleft', 'optionright', 'alt_l', 'alt_r']
 # Key names replaced by the literal character they produce when typed text
 # is being accumulated.
 typingkey2str = {
    "space" : " ",
 }
 class DuckTrackEventActionConverter:
-    def __init__(self, human_readable: str, compress_move: bool = True):
+    def __init__(self, ):
-        self.human_readable = human_readable
+        """"""
        self.compress_move = compress_move
-    def enum_to_str(self, enum):
+    ### Enumerations ###
-        """Converts an enum to its string representation if HUMAN_READABLE is True, otherwise returns its value."""
+    def move_event_to_action(self, event: dict, action_space: str = "computer_13"):
-        return enum.name if self.human_readable else enum.value
+        """Converts a mouse move event to its corresponding action."""
        if action_space == "computer_13":
            return {
                "action_type": "MOVE_TO",
                "parameters": {
                    "x": event["x"],
                    "y": event["y"]
                }
            }
        elif action_space == "pyautogui":
            return "pyautogui.moveTo({}, {})".format(event["x"], event["y"])
-    def compress_mouse_move(self, data: List[dict], index: int):
+    def click_event_to_action(self, event: dict, action_space: str = "computer_13"):
-        """Compresses consecutive mouse move events into first and last move events."""
+        """Converts a mouse click event to its corresponding action."""
-        first_move, last_move = data[index], data[index]
+        action = {
-        while index < len(data) and data[index]["action"] == "move":
+            "action_type": None,
-            last_move = data[index]
+            "parameters": {
-            index += 1
+                "button": None
-        return first_move, last_move, index
+            }
        }
    def move_event_to_action(self, event: dict):
        return {"action_type": self.enum_to_str(Action.MOUSE_MOVE),
                "x": event["x"],
                "y": event["y"]}
    def click_event_to_action(self, event: dict):
        action = {}
        mouse_button = event["button"]
        mouse_pressed = event["pressed"]
        if mouse_pressed:
-            action["action_type"] = self.enum_to_str(Action.MOUSE_DOWN)
+            action["action_type"] = "MOUSE_DOWN"
        elif not mouse_pressed:
-            action["action_type"] = self.enum_to_str(Action.MOUSE_UP)
+            action["action_type"] = "MOUSE_UP"
        else:
            raise NotImplementedError(mouse_pressed)
-        if mouse_button == "left":
+        if mouse_button in ["left", "right", "middle"]:
-            action["click_type"] = self.enum_to_str(MouseClick.LEFT)
+            action["parameters"]["button"] = mouse_button
        elif mouse_button == "right":
            action["click_type"] = self.enum_to_str(MouseClick.RIGHT)
        elif mouse_button == "middle":
            action["click_type"] = self.enum_to_str(MouseClick.MIDDLE)
        else:
            raise NotImplementedError(mouse_button)
        return action
-    def press_event_to_action(self, event: dict):
+    def press_event_to_action(self, event: dict, action_space: str = "computer_13"):
-        return {"action_type": self.enum_to_str(Action.KEY_DOWN),
+        """Converts a key down event to its corresponding action."""
-                "key": [ord(c) for c in event["name"]]}
+        # NOTE: the `key down`, `press` have the same meaning here, while different in pyautogui
        return {
            "action_type": "KEY_DOWN",
            "parameters": {
                "key": event["name"] if event["name"] not in pynput2pyautogui_key else pynput2pyautogui_key[
                    event["name"]]
            }
        }
-    def release_event_to_action(self, event: dict):
+    def release_event_to_action(self, event: dict, action_space: str = "computer_13"):
-        return {"action_type": self.enum_to_str(Action.KEY_UP),
+        """Converts a key release event to its corresponding action."""
-                "key": [ord(c) for c in event["name"]]}
+        return {
            "action_type": "KEY_UP",
            "parameters": {
                "key": event["name"] if event["name"] not in pynput2pyautogui_key else pynput2pyautogui_key[
                    event["name"]]
            }
        }
-    def scroll_event_to_action(self, event: dict):
+    def scroll_event_to_action(self, event: dict, action_space: str = "computer_13"):
-        # TODO: need to confirm if df < 0 means scroll up or down
+        """Converts a scroll event to its corresponding action."""
-        if event["dy"] < 0:
+        return {
-            down = False
+            "action_type": "SCROLL",
-        else:
+            "parameters": {
-            down = True
+                "dx": event["dx"],
                "dy": event["dy"]
            }
        }
-        return {"action_type": self.enum_to_str(Action.CLICK),
+    def event_to_action(self, event: dict, action_space: str = "computer_13"):
                "click_type": self.enum_to_str(MouseClick.WHEEL_DOWN) if down else self.enum_to_str(
                    MouseClick.WHEEL_UP)}
    def event_to_action(self, event: dict):
        """Converts an event to its corresponding action based on the event type."""
        if event["action"] == "move":
            return self.move_event_to_action(event)
@@ -87,114 +108,243 @@ class DuckTrackEventActionConverter:
        else:
            raise NotImplementedError(event["action"])
-    def ducktrack_event_file_to_action(self, ducktrack_event_file: str, out_file: str, compress_move: bool = None):
+    ### Compressing ###
    def compress_mouse_move(self, data: List[dict], index: int):
        """Collapse the run of consecutive "move" events that starts at ``index``.

        Returns a tuple ``(event, next_index)``: ``event`` is the last "move"
        event of the run (or ``data[index]`` itself when that event is not a
        move) and ``next_index`` is the position of the first event after the
        run.
        """
        latest = data[index]
        total = len(data)
        cursor = index
        while cursor < total:
            candidate = data[cursor]
            if candidate["action"] != "move":
                break
            latest = candidate
            cursor += 1
        return latest, cursor
    def compress_scroll(self, data: List[dict], index: int):
        """Merge a run of same-direction scroll events into one scroll event.

        Starting at ``index``, consecutive "scroll" events are merged for as
        long as the signs of their ``dx`` and ``dy`` match those of the first
        event of the run.

        Returns a tuple ``(event, next_index)``: ``event`` is a copy of the
        last event of the run whose ``dx``/``dy`` hold the summed deltas, and
        ``next_index`` is the position of the first event after the run.

        Two defects of the previous version are fixed: the first event's
        deltas were added twice (the totals were seeded with them and the
        loop then visited the same event again), and the merged totals were
        written back into the caller's event dict.
        """
        first = data[index]
        sign_dx, sign_dy = np.sign(first["dx"]), np.sign(first["dy"])
        run_start = index
        total_dx, total_dy = 0, 0
        last_scroll = first
        while (index < len(data)
               and data[index]["action"] == "scroll"
               and np.sign(data[index]["dx"]) == sign_dx
               and np.sign(data[index]["dy"]) == sign_dy):
            last_scroll = data[index]
            total_dx += last_scroll["dx"]
            total_dy += last_scroll["dy"]
            index += 1
        if index == run_start:
            # data[index] is not a scroll event: there is nothing to merge.
            return first, index
        # Work on a copy so the recorded events stay untouched.
        merged = dict(last_scroll)
        merged["dx"], merged["dy"] = total_dx, total_dy
        return merged, index
    ### Converting ###
    def ducktrack_event_file_to_action(self, ducktrack_event_file: str, out_file: str, compress_move: bool = True, compress_scroll: bool = True, compress_click: bool = True,compress_drag: bool = True, compress_press_key: bool = True, compress_typing: bool = True):
        """Converts DuckTrack event data to a list of actions and saves them to a file."""
        if not os.path.exists(ducktrack_event_file):
            raise FileNotFoundError(ducktrack_event_file)
        # set to default
        if compress_move is None:
            compress_move = self.compress_move
        with open(ducktrack_event_file, 'r') as file:
-            data = [json.loads(line) for line in file]
+            events = [json.loads(line) for line in file]
-        result = {"action": [], "event": []}
+        # Save the compressed actions in a list
        result = []
        index = 0
        presses_to_skip = 0
        releases_to_skip = 0
        move_to_skip = 0
        keys_pressed = []
        # Compress the mouse move events
-        while index < len(data):
+        while index < len(events):
-            event = data[index]
+
-            if event["action"] == "move" and compress_move:
+            event = events[index]            
-                first_move, last_move, index = self.compress_mouse_move(data, index)
+
-                result["action"].extend([self.event_to_action(last_move)])
+            def do_mouse_press(button: str, _index: int):
-                result["event"].extend([last_move])
+
-            else:
+                num_clicks = 0
-                result["action"].append(self.event_to_action(event))
+                mouse_pressed = True
-                result["event"].append(event)
+                skip_move = 0
                click_x, click_y = event["x"], event["y"]
                for j, next_event in enumerate(events[index + 1:]):
                    # make sure the time between mouse clicks is less than 500ms
                    if next_event["time_stamp"] - event["time_stamp"] > 0.5:
                        if num_clicks > 0:
                            if result[-1:][0]["action_type"] == "MOVE_TO":
                                result.pop()
                            result.append({
                                "action_type": "CLICK",
                                "parameters": {
                                    "button": button,
                                    "x" : click_x,
                                    "y" : click_y,
                                    "num_clicks": num_clicks
                                }
                            })
                            return num_clicks-1, num_clicks, _index, skip_move 
                        break
                    if "x" in next_event and "y" in next_event:
                        # if the mouse moves out of the click radius/rectangle, it is not a click sequence
                        if math.sqrt((next_event["y"] - event["y"]) ** 2 +
                                     (next_event["x"] - event["x"]) ** 2) > 4:
                            if num_clicks > 0:
                                if result[-1:][0]["action_type"] == "MOVE_TO":
                                    result.pop()
                                result.append({
                                    "action_type": "CLICK",
                                    "parameters": {
                                        "button": button,
                                        "x" : click_x,
                                        "y" : click_y,
                                        "num_clicks": num_clicks
                                    }
                                })
                                return num_clicks-1, num_clicks, _index, skip_move 
                            break
                    if next_event["action"] == "click"  and compress_click:
                        if not next_event["pressed"]:
                            num_clicks += 1
                            mouse_pressed = False
                            if num_clicks == 3:
                                if result[-1:][0]["action_type"] == "MOVE_TO":
                                    result.pop()
                                result.append({
                                        "action_type": "CLICK",
                                        "parameters": {
                                            "button": button,
                                            "x" : click_x,
                                            "y" : click_y,
                                            "num_clicks": 3
                                        }
                                    })
                                return 2, 3, _index, skip_move
                        elif next_event["pressed"]:
                            mouse_pressed = True
                        else:
                            raise NotImplementedError(next_event["pressed"])
                    elif next_event["action"] != "click" and not mouse_pressed:
                        if next_event["action"] == "move":
                            if next_event["x"] == click_x and next_event["y"] == click_y:
                                skip_move += 1
                                continue
                        if result[-1:][0]["action_type"] == "MOVE_TO":
                            result.pop()
                        result.append({
                            "action_type": "CLICK",
                            "parameters": {
                                "button": button,
                                "x" : click_x,
                                "y" : click_y,
                                "num_clicks": num_clicks
                            }
                        })
                        return num_clicks-1, num_clicks, _index, skip_move                      
                    # Compress {MOUSE_DOWN, MOVE, MOUSE_UP} into DRAG_TO event    
                    elif next_event["action"] == "move" and compress_drag:
                        if next_event["x"] == click_x and next_event["y"] == click_y:
                            skip_move += 1
                            continue
                        last_move, _index = self.compress_mouse_move(events, _index+1)
                        result.append({
                            "action_type": "DRAG_TO",
                            "parameters": {
                                "x": last_move["x"],
                                "y": last_move["y"]
                            }
                        })
                        return 0, 1, _index, skip_move           
                result.append({
                    "action_type": "MOUSE_DOWN",
                    "parameters": {
                        "button": button
                    }
                })
                return 0, 0, _index, skip_move
            if event["action"] == "move":
                if move_to_skip > 0:
                    move_to_skip -= 1
                    index += 1
                    continue
                if compress_move:
                    last_move, index = self.compress_mouse_move(events, index)
                    result.extend([self.event_to_action(last_move)])
            elif event["action"] == "scroll" and compress_scroll:
                last_scroll, index = self.compress_scroll(events, index)
                result.extend([self.event_to_action(last_scroll)])
            elif event["action"] == "click":
                button = event["button"]
                if event["pressed"]:
                    if presses_to_skip == 0:
                        presses, releases, index, moves = do_mouse_press(button, index)
                        presses_to_skip += presses
                        releases_to_skip += releases
                        move_to_skip += moves
                    else:
                        presses_to_skip -= 1
                else:
                    if releases_to_skip == 0:
                        result.append({
                            "action_type": "MOUSE_UP",
                            "parameters": {
                                "button": button
                            }
                        })
                    else:
                        releases_to_skip -= 1
                index += 1
-
+            elif event["action"] == "press" and event["name"] not in COMMAND_KEYS and compress_typing:
-        # Compress the key down and key up actions
+                typing_words = ""
-        # todo: handling the key down and key up events
+                while index < len(events) and events[index]["action"] in ["press", "release"] and events[index]["name"] not in COMMAND_KEYS:
-        _new_actions = []
+                    if events[index]["action"] == "press":
-        _action = list(result["action"])
+                        keys_pressed.append(events[index]["name"])
-        idx = 0
+                        typing_words += events[index]["name"] if events[index]["name"] not in typingkey2str else typingkey2str[events[index]["name"]]
-
+                    elif events[index]["action"] == "release":
-        while True:
+                        keys_pressed.remove(events[index]["name"])
-            if idx >= len(_action):
+                    index += 1
-                break
+                if len(typing_words) > 1:
-
+                    result.append({
-            if _action[idx]["action_type"] == self.enum_to_str(Action.KEY_DOWN):
+                        "action_type": "TYPING",
-                typed_text = []
+                        "parameters": {
-                while idx < len(_action) and _action[idx]["action_type"] in [self.enum_to_str(Action.KEY_DOWN), self.enum_to_str(Action.KEY_UP)] and len(_action[idx]["key"]) == 1:
+                            "text": typing_words
-                    if _action[idx]["action_type"] == self.enum_to_str(Action.KEY_DOWN):
+                        }
-                        typed_text.append(chr(_action[idx]["key"][0]))
+                    })
                    idx += 1
                if typed_text:
                    _new_actions.append({"action_type": self.enum_to_str(Action.TYPE), "text": typed_text})
                else:
-                    _new_actions.append(_action[idx])
+                    result.append({
-                    idx += 1
+                        "action_type": "PRESS",
                        "parameters": {
                            "key": typing_words
                        }
                    })
            elif event["action"] == "press" and compress_press_key:
                keys_pressed.append(event["name"])
                result.append({
                            "action_type": "PRESS",
                            "parameters": {
                                "key": event["name"] if event["name"] not in pynput2pyautogui_key else pynput2pyautogui_key[
                                    event["name"]]
                            }
                        })
                index += 1
            elif event["action"] == "release" and compress_press_key:
                keys_pressed.remove(event["name"])
                index += 1
            else:
-                _new_actions.append(_action[idx])
+                result.append(self.event_to_action(event))
-                idx += 1
+                index += 1
        result["action"] = _new_actions
        # Compress the scroll up and scroll down events
        # todo: handling the key down and key up events
        _new_actions = []
        _action = list(result["action"])
        idx = 0
        while True:
            if idx >= len(_action):
                break
            if _action[idx]["action_type"] == self.enum_to_str(Action.CLICK) and _action[idx]["click_type"] in [self.enum_to_str(MouseClick.WHEEL_UP), self.enum_to_str(MouseClick.WHEEL_DOWN)]:
                typed_text = []
                while idx < len(_action) and _action[idx]["action_type"] == self.enum_to_str(Action.CLICK) and _action[idx]["click_type"] in [self.enum_to_str(MouseClick.WHEEL_UP), self.enum_to_str(MouseClick.WHEEL_DOWN)]:
                    if _action[idx]["click_type"] == self.enum_to_str(MouseClick.WHEEL_UP):
                        typed_text.append("UP")
                        idx += 1
                    elif _action[idx]["click_type"] == self.enum_to_str(MouseClick.WHEEL_DOWN):
                        typed_text.append("DOWN")
                        idx += 1
                _new_actions.append({"action_type": self.enum_to_str(Action.CLICK), "click_type": "SCROLL", "text": typed_text})
            else:
                _new_actions.append(_action[idx])
                idx += 1
        result["action"] = _new_actions
        # Compress the mouse down and mouse up actions
        # todo: handling the key down and key up events
        _new_actions = []
        _action = list(result["action"])
        idx = 0
        while True:
            if idx >= len(_action):
                break
            if _action[idx]["action_type"] == self.enum_to_str(Action.MOUSE_DOWN):
                if idx + 1 < len(_action) and _action[idx+1]["action_type"] == self.enum_to_str(Action.MOUSE_UP):
                    _new_actions.append({"action_type": self.enum_to_str(Action.CLICK), "click_type": _action[idx]["click_type"]})
                    idx += 2
                else:
                    _new_actions.append(_action[idx])
                    idx += 1
            else:
                _new_actions.append(_action[idx])
                idx += 1
        result["action"] = _new_actions
        with open(out_file, "w") as f:
            json.dump(result, f)
 if __name__ == "__main__":
-    converter = DuckTrackEventActionConverter(human_readable=True)
+    converter = DuckTrackEventActionConverter()
-    converter.ducktrack_event_file_to_action(ducktrack_event_file="sample.jsonl",
+    converter.ducktrack_event_file_to_action(
-                                             out_file="output.json",
+        ducktrack_event_file="complex_clicking.jsonl",
-                                             compress_move=True)
+                out_file="complex_clicking5.json",
        compress_move=True,
        compress_scroll=True,
        compress_click=True,
        compress_drag=True,
        compress_press_key=True,
        compress_typing=True,
    )
--- a/utils/events_calc.json
+++ b/utils/events_calc.json
@@ -0,0 +1,111 @@
 [
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 152,
      "y": 259
    }
  },
  {
    "action_type": "MOUSE_DOWN",
    "parameters": {
      "button": "left"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 464,
      "y": 317
    }
  },
  {
    "action_type": "MOUSE_UP",
    "parameters": {
      "button": "left"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 466,
      "y": 317
    }
  },
  {
    "action_type": "KEY_DOWN",
    "parameters": {
      "key": "altleft"
    }
  },
  {
    "action_type": "KEY_DOWN",
    "parameters": {
      "key": "="
    }
  },
  {
    "action_type": "KEY_UP",
    "parameters": {
      "key": "="
    }
  },
  {
    "action_type": "KEY_UP",
    "parameters": {
      "key": "altleft"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 709,
      "y": 1047
    }
  },
  {
    "action_type": "MOUSE_DOWN",
    "parameters": {
      "button": "left"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 709,
      "y": 1047
    }
  },
  {
    "action_type": "MOUSE_UP",
    "parameters": {
      "button": "left"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 717,
      "y": 304
    }
  },
  {
    "action_type": "MOUSE_DOWN",
    "parameters": {
      "button": "left"
    }
  },
  {
    "action_type": "MOVE_TO",
    "parameters": {
      "x": 717,
      "y": 304
    }
  },
  {
    "action_type": "MOUSE_UP",
    "parameters": {
      "button": "left"
    }
  }
 ]
--- a/utils/events_calc.jsonl
+++ b/utils/events_calc.jsonl
@@ -0,0 +1,423 @@
 {"time_stamp": 21028.2899763, "action": "move", "x": 686, "y": 306}
 {"time_stamp": 21028.2965794, "action": "move", "x": 684, "y": 306}
 {"time_stamp": 21028.3046644, "action": "move", "x": 678, "y": 306}
 {"time_stamp": 21028.3126807, "action": "move", "x": 670, "y": 306}
 {"time_stamp": 21028.3208329, "action": "move", "x": 661, "y": 306}
 {"time_stamp": 21028.3288313, "action": "move", "x": 645, "y": 306}
 {"time_stamp": 21028.336626, "action": "move", "x": 625, "y": 306}
 {"time_stamp": 21028.3445457, "action": "move", "x": 603, "y": 305}
 {"time_stamp": 21028.3527487, "action": "move", "x": 574, "y": 303}
 {"time_stamp": 21028.3606394, "action": "move", "x": 544, "y": 301}
 {"time_stamp": 21028.3688565, "action": "move", "x": 508, "y": 300}
 {"time_stamp": 21028.3768381, "action": "move", "x": 471, "y": 298}
 {"time_stamp": 21028.3848709, "action": "move", "x": 430, "y": 296}
 {"time_stamp": 21028.3926563, "action": "move", "x": 389, "y": 296}
 {"time_stamp": 21028.4009164, "action": "move", "x": 348, "y": 296}
 {"time_stamp": 21028.4089388, "action": "move", "x": 313, "y": 296}
 {"time_stamp": 21028.4171707, "action": "move", "x": 280, "y": 296}
 {"time_stamp": 21028.4245847, "action": "move", "x": 252, "y": 294}
 {"time_stamp": 21028.4328148, "action": "move", "x": 225, "y": 294}
 {"time_stamp": 21028.4406678, "action": "move", "x": 208, "y": 294}
 {"time_stamp": 21028.4486998, "action": "move", "x": 192, "y": 294}
 {"time_stamp": 21028.4568529, "action": "move", "x": 177, "y": 294}
 {"time_stamp": 21028.4647334, "action": "move", "x": 163, "y": 293}
 {"time_stamp": 21028.4729702, "action": "move", "x": 153, "y": 293}
 {"time_stamp": 21028.4808044, "action": "move", "x": 143, "y": 293}
 {"time_stamp": 21028.4889062, "action": "move", "x": 135, "y": 293}
 {"time_stamp": 21028.4967676, "action": "move", "x": 130, "y": 293}
 {"time_stamp": 21028.5050544, "action": "move", "x": 124, "y": 293}
 {"time_stamp": 21028.5127317, "action": "move", "x": 120, "y": 293}
 {"time_stamp": 21028.520827, "action": "move", "x": 117, "y": 293}
 {"time_stamp": 21028.5289378, "action": "move", "x": 114, "y": 293}
 {"time_stamp": 21028.5371078, "action": "move", "x": 111, "y": 293}
 {"time_stamp": 21028.545514, "action": "move", "x": 107, "y": 293}
 {"time_stamp": 21028.5527022, "action": "move", "x": 104, "y": 292}
 {"time_stamp": 21028.5605384, "action": "move", "x": 100, "y": 292}
 {"time_stamp": 21028.5686583, "action": "move", "x": 96, "y": 291}
 {"time_stamp": 21028.5766951, "action": "move", "x": 90, "y": 291}
 {"time_stamp": 21028.5847502, "action": "move", "x": 85, "y": 291}
 {"time_stamp": 21028.5926223, "action": "move", "x": 79, "y": 290}
 {"time_stamp": 21028.6007454, "action": "move", "x": 74, "y": 290}
 {"time_stamp": 21028.6088707, "action": "move", "x": 70, "y": 289}
 {"time_stamp": 21028.6166501, "action": "move", "x": 67, "y": 289}
 {"time_stamp": 21028.6249259, "action": "move", "x": 66, "y": 289}
 {"time_stamp": 21028.6647889, "action": "move", "x": 66, "y": 289}
 {"time_stamp": 21028.6728642, "action": "move", "x": 68, "y": 288}
 {"time_stamp": 21028.6807781, "action": "move", "x": 70, "y": 286}
 {"time_stamp": 21028.6888295, "action": "move", "x": 74, "y": 285}
 {"time_stamp": 21028.6971027, "action": "move", "x": 77, "y": 284}
 {"time_stamp": 21028.7046499, "action": "move", "x": 81, "y": 282}
 {"time_stamp": 21028.7129405, "action": "move", "x": 86, "y": 281}
 {"time_stamp": 21028.7205325, "action": "move", "x": 91, "y": 279}
 {"time_stamp": 21028.7285422, "action": "move", "x": 98, "y": 278}
 {"time_stamp": 21028.7366509, "action": "move", "x": 104, "y": 275}
 {"time_stamp": 21028.7448279, "action": "move", "x": 110, "y": 275}
 {"time_stamp": 21028.7527897, "action": "move", "x": 116, "y": 273}
 {"time_stamp": 21028.7609718, "action": "move", "x": 120, "y": 272}
 {"time_stamp": 21028.7688693, "action": "move", "x": 124, "y": 271}
 {"time_stamp": 21028.7766846, "action": "move", "x": 128, "y": 270}
 {"time_stamp": 21028.7848371, "action": "move", "x": 131, "y": 270}
 {"time_stamp": 21028.7927773, "action": "move", "x": 133, "y": 268}
 {"time_stamp": 21028.8007498, "action": "move", "x": 134, "y": 268}
 {"time_stamp": 21028.8088143, "action": "move", "x": 136, "y": 268}
 {"time_stamp": 21028.8168157, "action": "move", "x": 137, "y": 268}
 {"time_stamp": 21028.8246469, "action": "move", "x": 139, "y": 268}
 {"time_stamp": 21028.8327817, "action": "move", "x": 140, "y": 268}
 {"time_stamp": 21028.8408239, "action": "move", "x": 141, "y": 268}
 {"time_stamp": 21028.8488115, "action": "move", "x": 142, "y": 267}
 {"time_stamp": 21028.8571578, "action": "move", "x": 143, "y": 267}
 {"time_stamp": 21028.8646641, "action": "move", "x": 144, "y": 267}
 {"time_stamp": 21028.8741985, "action": "move", "x": 145, "y": 267}
 {"time_stamp": 21028.8809717, "action": "move", "x": 146, "y": 267}
 {"time_stamp": 21028.8888646, "action": "move", "x": 146, "y": 267}
 {"time_stamp": 21028.961049, "action": "move", "x": 146, "y": 266}
 {"time_stamp": 21029.0249854, "action": "move", "x": 147, "y": 265}
 {"time_stamp": 21029.0328138, "action": "move", "x": 147, "y": 264}
 {"time_stamp": 21029.0407582, "action": "move", "x": 147, "y": 264}
 {"time_stamp": 21029.0487772, "action": "move", "x": 148, "y": 263}
 {"time_stamp": 21029.0569372, "action": "move", "x": 148, "y": 263}
 {"time_stamp": 21029.065073, "action": "move", "x": 149, "y": 262}
 {"time_stamp": 21029.0729933, "action": "move", "x": 150, "y": 262}
 {"time_stamp": 21029.0888149, "action": "move", "x": 150, "y": 261}
 {"time_stamp": 21029.0971595, "action": "move", "x": 151, "y": 260}
 {"time_stamp": 21029.10458, "action": "move", "x": 151, "y": 260}
 {"time_stamp": 21029.1126284, "action": "move", "x": 151, "y": 260}
 {"time_stamp": 21029.1208764, "action": "move", "x": 151, "y": 259}
 {"time_stamp": 21029.1287413, "action": "move", "x": 152, "y": 259}
 {"time_stamp": 21029.1611214, "action": "move", "x": 152, "y": 259}
 {"time_stamp": 21029.1614723, "action": "click", "x": 152, "y": 259, "button": "left", "pressed": true}
 {"time_stamp": 21029.2168134, "action": "move", "x": 152, "y": 259}
 {"time_stamp": 21029.2248681, "action": "move", "x": 154, "y": 259}
 {"time_stamp": 21029.2327317, "action": "move", "x": 156, "y": 260}
 {"time_stamp": 21029.2408222, "action": "move", "x": 158, "y": 262}
 {"time_stamp": 21029.2487515, "action": "move", "x": 163, "y": 263}
 {"time_stamp": 21029.2568152, "action": "move", "x": 169, "y": 266}
 {"time_stamp": 21029.2649126, "action": "move", "x": 174, "y": 270}
 {"time_stamp": 21029.2727425, "action": "move", "x": 183, "y": 273}
 {"time_stamp": 21029.2807226, "action": "move", "x": 190, "y": 276}
 {"time_stamp": 21029.2887741, "action": "move", "x": 200, "y": 279}
 {"time_stamp": 21029.296883, "action": "move", "x": 209, "y": 282}
 {"time_stamp": 21029.304834, "action": "move", "x": 220, "y": 285}
 {"time_stamp": 21029.3131548, "action": "move", "x": 233, "y": 287}
 {"time_stamp": 21029.3207916, "action": "move", "x": 244, "y": 290}
 {"time_stamp": 21029.3290871, "action": "move", "x": 256, "y": 292}
 {"time_stamp": 21029.3366508, "action": "move", "x": 268, "y": 293}
 {"time_stamp": 21029.3445108, "action": "move", "x": 279, "y": 294}
 {"time_stamp": 21029.3529213, "action": "move", "x": 288, "y": 297}
 {"time_stamp": 21029.3607282, "action": "move", "x": 298, "y": 297}
 {"time_stamp": 21029.3691604, "action": "move", "x": 307, "y": 297}
 {"time_stamp": 21029.3769931, "action": "move", "x": 316, "y": 298}
 {"time_stamp": 21029.3850192, "action": "move", "x": 324, "y": 300}
 {"time_stamp": 21029.3927881, "action": "move", "x": 331, "y": 301}
 {"time_stamp": 21029.4007925, "action": "move", "x": 336, "y": 302}
 {"time_stamp": 21029.4088638, "action": "move", "x": 342, "y": 304}
 {"time_stamp": 21029.4167924, "action": "move", "x": 346, "y": 304}
 {"time_stamp": 21029.4251047, "action": "move", "x": 349, "y": 304}
 {"time_stamp": 21029.4328699, "action": "move", "x": 352, "y": 306}
 {"time_stamp": 21029.4409293, "action": "move", "x": 355, "y": 306}
 {"time_stamp": 21029.4487136, "action": "move", "x": 356, "y": 307}
 {"time_stamp": 21029.4568755, "action": "move", "x": 358, "y": 308}
 {"time_stamp": 21029.4647053, "action": "move", "x": 361, "y": 309}
 {"time_stamp": 21029.4728173, "action": "move", "x": 363, "y": 310}
 {"time_stamp": 21029.4806011, "action": "move", "x": 365, "y": 311}
 {"time_stamp": 21029.4889321, "action": "move", "x": 367, "y": 312}
 {"time_stamp": 21029.4967544, "action": "move", "x": 370, "y": 313}
 {"time_stamp": 21029.5049087, "action": "move", "x": 374, "y": 314}
 {"time_stamp": 21029.5129759, "action": "move", "x": 377, "y": 316}
 {"time_stamp": 21029.5210278, "action": "move", "x": 381, "y": 317}
 {"time_stamp": 21029.5286154, "action": "move", "x": 386, "y": 317}
 {"time_stamp": 21029.5371491, "action": "move", "x": 390, "y": 318}
 {"time_stamp": 21029.5449815, "action": "move", "x": 393, "y": 319}
 {"time_stamp": 21029.5526305, "action": "move", "x": 397, "y": 319}
 {"time_stamp": 21029.5604721, "action": "move", "x": 400, "y": 319}
 {"time_stamp": 21029.5690371, "action": "move", "x": 402, "y": 319}
 {"time_stamp": 21029.5772927, "action": "move", "x": 405, "y": 319}
 {"time_stamp": 21029.5846161, "action": "move", "x": 406, "y": 319}
 {"time_stamp": 21029.5928399, "action": "move", "x": 407, "y": 319}
 {"time_stamp": 21029.6007032, "action": "move", "x": 408, "y": 319}
 {"time_stamp": 21029.609118, "action": "move", "x": 409, "y": 319}
 {"time_stamp": 21029.6166036, "action": "move", "x": 411, "y": 320}
 {"time_stamp": 21029.6249215, "action": "move", "x": 412, "y": 320}
 {"time_stamp": 21029.6327262, "action": "move", "x": 414, "y": 320}
 {"time_stamp": 21029.6408018, "action": "move", "x": 415, "y": 320}
 {"time_stamp": 21029.649463, "action": "move", "x": 418, "y": 320}
 {"time_stamp": 21029.6575693, "action": "move", "x": 420, "y": 320}
 {"time_stamp": 21029.6650956, "action": "move", "x": 423, "y": 320}
 {"time_stamp": 21029.6729346, "action": "move", "x": 426, "y": 320}
 {"time_stamp": 21029.6808747, "action": "move", "x": 429, "y": 320}
 {"time_stamp": 21029.688616, "action": "move", "x": 432, "y": 320}
 {"time_stamp": 21029.6970675, "action": "move", "x": 435, "y": 320}
 {"time_stamp": 21029.7049324, "action": "move", "x": 438, "y": 320}
 {"time_stamp": 21029.7130458, "action": "move", "x": 439, "y": 320}
 {"time_stamp": 21029.7207522, "action": "move", "x": 440, "y": 320}
 {"time_stamp": 21029.7289775, "action": "move", "x": 442, "y": 320}
 {"time_stamp": 21029.7366577, "action": "move", "x": 443, "y": 320}
 {"time_stamp": 21029.7444825, "action": "move", "x": 445, "y": 320}
 {"time_stamp": 21029.7526551, "action": "move", "x": 447, "y": 320}
 {"time_stamp": 21029.7604951, "action": "move", "x": 448, "y": 320}
 {"time_stamp": 21029.7686569, "action": "move", "x": 450, "y": 319}
 {"time_stamp": 21029.7775496, "action": "move", "x": 451, "y": 319}
 {"time_stamp": 21029.7849685, "action": "move", "x": 451, "y": 319}
 {"time_stamp": 21029.7929356, "action": "move", "x": 452, "y": 319}
 {"time_stamp": 21029.8007005, "action": "move", "x": 452, "y": 319}
 {"time_stamp": 21029.8170717, "action": "move", "x": 453, "y": 319}
 {"time_stamp": 21029.8248574, "action": "move", "x": 453, "y": 318}
 {"time_stamp": 21029.8330359, "action": "move", "x": 454, "y": 318}
 {"time_stamp": 21029.8407804, "action": "move", "x": 454, "y": 318}
 {"time_stamp": 21029.8487615, "action": "move", "x": 455, "y": 318}
 {"time_stamp": 21029.8648369, "action": "move", "x": 455, "y": 318}
 {"time_stamp": 21029.8726477, "action": "move", "x": 456, "y": 318}
 {"time_stamp": 21029.8809607, "action": "move", "x": 457, "y": 317}
 {"time_stamp": 21029.8888473, "action": "move", "x": 457, "y": 317}
 {"time_stamp": 21029.9048933, "action": "move", "x": 458, "y": 317}
 {"time_stamp": 21029.9129577, "action": "move", "x": 458, "y": 317}
 {"time_stamp": 21029.9208533, "action": "move", "x": 459, "y": 317}
 {"time_stamp": 21029.9286645, "action": "move", "x": 459, "y": 317}
 {"time_stamp": 21029.9368461, "action": "move", "x": 461, "y": 317}
 {"time_stamp": 21029.9448712, "action": "move", "x": 461, "y": 317}
 {"time_stamp": 21029.953212, "action": "move", "x": 462, "y": 317}
 {"time_stamp": 21029.9608238, "action": "move", "x": 463, "y": 317}
 {"time_stamp": 21029.9686821, "action": "move", "x": 463, "y": 317}
 {"time_stamp": 21029.9768342, "action": "move", "x": 464, "y": 317}
 {"time_stamp": 21030.361149, "action": "move", "x": 464, "y": 317}
 {"time_stamp": 21030.3613383, "action": "click", "x": 464, "y": 317, "button": "left", "pressed": false}
 {"time_stamp": 21030.9690893, "action": "move", "x": 465, "y": 317}
 {"time_stamp": 21030.9770331, "action": "move", "x": 465, "y": 317}
 {"time_stamp": 21030.9933165, "action": "move", "x": 466, "y": 317}
 {"time_stamp": 21031.8410512, "action": "press", "name": "alt_l"}
 {"time_stamp": 21032.1375784, "action": "press", "name": "="}
 {"time_stamp": 21032.2331653, "action": "release", "name": "="}
 {"time_stamp": 21032.4009051, "action": "release", "name": "alt_l"}
 {"time_stamp": 21033.1212821, "action": "move", "x": 466, "y": 317}
 {"time_stamp": 21033.1289659, "action": "move", "x": 467, "y": 320}
 {"time_stamp": 21033.1370348, "action": "move", "x": 471, "y": 325}
 {"time_stamp": 21033.1456134, "action": "move", "x": 475, "y": 332}
 {"time_stamp": 21033.1531721, "action": "move", "x": 482, "y": 340}
 {"time_stamp": 21033.1605014, "action": "move", "x": 490, "y": 349}
 {"time_stamp": 21033.1692663, "action": "move", "x": 498, "y": 359}
 {"time_stamp": 21033.1771117, "action": "move", "x": 508, "y": 371}
 {"time_stamp": 21033.1850449, "action": "move", "x": 521, "y": 383}
 {"time_stamp": 21033.1929826, "action": "move", "x": 535, "y": 399}
 {"time_stamp": 21033.201192, "action": "move", "x": 546, "y": 415}
 {"time_stamp": 21033.2089185, "action": "move", "x": 555, "y": 434}
 {"time_stamp": 21033.216848, "action": "move", "x": 563, "y": 452}
 {"time_stamp": 21033.2246769, "action": "move", "x": 570, "y": 469}
 {"time_stamp": 21033.2328685, "action": "move", "x": 574, "y": 485}
 {"time_stamp": 21033.2407514, "action": "move", "x": 577, "y": 503}
 {"time_stamp": 21033.2488102, "action": "move", "x": 578, "y": 518}
 {"time_stamp": 21033.2569003, "action": "move", "x": 578, "y": 534}
 {"time_stamp": 21033.2654896, "action": "move", "x": 580, "y": 552}
 {"time_stamp": 21033.2730147, "action": "move", "x": 580, "y": 571}
 {"time_stamp": 21033.2808888, "action": "move", "x": 582, "y": 592}
 {"time_stamp": 21033.2890461, "action": "move", "x": 583, "y": 617}
 {"time_stamp": 21033.2968868, "action": "move", "x": 586, "y": 643}
 {"time_stamp": 21033.3050093, "action": "move", "x": 588, "y": 665}
 {"time_stamp": 21033.3129685, "action": "move", "x": 591, "y": 694}
 {"time_stamp": 21033.3210515, "action": "move", "x": 592, "y": 716}
 {"time_stamp": 21033.3289082, "action": "move", "x": 594, "y": 735}
 {"time_stamp": 21033.3368274, "action": "move", "x": 598, "y": 751}
 {"time_stamp": 21033.3446464, "action": "move", "x": 601, "y": 761}
 {"time_stamp": 21033.3532343, "action": "move", "x": 604, "y": 773}
 {"time_stamp": 21033.3607161, "action": "move", "x": 606, "y": 783}
 {"time_stamp": 21033.3687129, "action": "move", "x": 608, "y": 794}
 {"time_stamp": 21033.3769088, "action": "move", "x": 611, "y": 804}
 {"time_stamp": 21033.3846615, "action": "move", "x": 614, "y": 816}
 {"time_stamp": 21033.3927661, "action": "move", "x": 617, "y": 826}
 {"time_stamp": 21033.4008999, "action": "move", "x": 619, "y": 837}
 {"time_stamp": 21033.408732, "action": "move", "x": 621, "y": 846}
 {"time_stamp": 21033.4169038, "action": "move", "x": 623, "y": 856}
 {"time_stamp": 21033.4250181, "action": "move", "x": 623, "y": 865}
 {"time_stamp": 21033.4329144, "action": "move", "x": 624, "y": 875}
 {"time_stamp": 21033.4410593, "action": "move", "x": 624, "y": 883}
 {"time_stamp": 21033.448994, "action": "move", "x": 626, "y": 891}
 {"time_stamp": 21033.4570193, "action": "move", "x": 626, "y": 899}
 {"time_stamp": 21033.4648038, "action": "move", "x": 627, "y": 906}
 {"time_stamp": 21033.4730101, "action": "move", "x": 628, "y": 913}
 {"time_stamp": 21033.4815421, "action": "move", "x": 631, "y": 920}
 {"time_stamp": 21033.4891275, "action": "move", "x": 635, "y": 926}
 {"time_stamp": 21033.4970011, "action": "move", "x": 639, "y": 930}
 {"time_stamp": 21033.5047772, "action": "move", "x": 647, "y": 935}
 {"time_stamp": 21033.5132552, "action": "move", "x": 653, "y": 939}
 {"time_stamp": 21033.5211245, "action": "move", "x": 659, "y": 943}
 {"time_stamp": 21033.5292347, "action": "move", "x": 665, "y": 947}
 {"time_stamp": 21033.5373088, "action": "move", "x": 671, "y": 950}
 {"time_stamp": 21033.5447875, "action": "move", "x": 677, "y": 955}
 {"time_stamp": 21033.5529495, "action": "move", "x": 684, "y": 960}
 {"time_stamp": 21033.5609559, "action": "move", "x": 690, "y": 965}
 {"time_stamp": 21033.5689335, "action": "move", "x": 696, "y": 971}
 {"time_stamp": 21033.5768783, "action": "move", "x": 700, "y": 977}
 {"time_stamp": 21033.5846548, "action": "move", "x": 703, "y": 981}
 {"time_stamp": 21033.5931357, "action": "move", "x": 705, "y": 985}
 {"time_stamp": 21033.6009205, "action": "move", "x": 707, "y": 988}
 {"time_stamp": 21033.6088781, "action": "move", "x": 708, "y": 991}
 {"time_stamp": 21033.6169713, "action": "move", "x": 709, "y": 994}
 {"time_stamp": 21033.6249134, "action": "move", "x": 709, "y": 997}
 {"time_stamp": 21033.6328882, "action": "move", "x": 710, "y": 999}
 {"time_stamp": 21033.6412016, "action": "move", "x": 711, "y": 1003}
 {"time_stamp": 21033.648939, "action": "move", "x": 711, "y": 1007}
 {"time_stamp": 21033.6572201, "action": "move", "x": 713, "y": 1010}
 {"time_stamp": 21033.6647348, "action": "move", "x": 715, "y": 1013}
 {"time_stamp": 21033.6730325, "action": "move", "x": 716, "y": 1017}
 {"time_stamp": 21033.6810552, "action": "move", "x": 717, "y": 1021}
 {"time_stamp": 21033.6890871, "action": "move", "x": 719, "y": 1024}
 {"time_stamp": 21033.6969594, "action": "move", "x": 720, "y": 1026}
 {"time_stamp": 21033.7048284, "action": "move", "x": 720, "y": 1028}
 {"time_stamp": 21033.7126425, "action": "move", "x": 720, "y": 1028}
 {"time_stamp": 21033.7610156, "action": "move", "x": 720, "y": 1029}
 {"time_stamp": 21033.7693689, "action": "move", "x": 720, "y": 1029}
 {"time_stamp": 21033.7772628, "action": "move", "x": 720, "y": 1030}
 {"time_stamp": 21033.7847737, "action": "move", "x": 720, "y": 1031}
 {"time_stamp": 21033.7929223, "action": "move", "x": 719, "y": 1031}
 {"time_stamp": 21033.801029, "action": "move", "x": 719, "y": 1032}
 {"time_stamp": 21033.808944, "action": "move", "x": 718, "y": 1033}
 {"time_stamp": 21033.8169394, "action": "move", "x": 717, "y": 1035}
 {"time_stamp": 21033.8248771, "action": "move", "x": 716, "y": 1035}
 {"time_stamp": 21033.8334548, "action": "move", "x": 716, "y": 1036}
 {"time_stamp": 21033.8410779, "action": "move", "x": 715, "y": 1037}
 {"time_stamp": 21033.8486117, "action": "move", "x": 715, "y": 1039}
 {"time_stamp": 21033.8568906, "action": "move", "x": 713, "y": 1039}
 {"time_stamp": 21033.8649249, "action": "move", "x": 712, "y": 1040}
 {"time_stamp": 21033.8729566, "action": "move", "x": 712, "y": 1042}
 {"time_stamp": 21033.8810286, "action": "move", "x": 711, "y": 1043}
 {"time_stamp": 21033.8888454, "action": "move", "x": 711, "y": 1044}
 {"time_stamp": 21033.8970736, "action": "move", "x": 709, "y": 1045}
 {"time_stamp": 21033.9051884, "action": "move", "x": 709, "y": 1046}
 {"time_stamp": 21033.91297, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21033.9210518, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21033.9770341, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21033.9932821, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21033.9933595, "action": "click", "x": 709, "y": 1047, "button": "left", "pressed": true}
 {"time_stamp": 21034.0734669, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21034.0737272, "action": "click", "x": 709, "y": 1047, "button": "left", "pressed": false}
 {"time_stamp": 21034.1450402, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21034.1608305, "action": "move", "x": 709, "y": 1047}
 {"time_stamp": 21034.1690642, "action": "move", "x": 709, "y": 1046}
 {"time_stamp": 21034.1770086, "action": "move", "x": 709, "y": 1045}
 {"time_stamp": 21034.1849649, "action": "move", "x": 709, "y": 1044}
 {"time_stamp": 21034.1927171, "action": "move", "x": 709, "y": 1043}
 {"time_stamp": 21034.2008052, "action": "move", "x": 709, "y": 1040}
 {"time_stamp": 21034.2088854, "action": "move", "x": 709, "y": 1038}
 {"time_stamp": 21034.2167939, "action": "move", "x": 709, "y": 1034}
 {"time_stamp": 21034.224882, "action": "move", "x": 709, "y": 1029}
 {"time_stamp": 21034.2327267, "action": "move", "x": 711, "y": 1023}
 {"time_stamp": 21034.2408131, "action": "move", "x": 711, "y": 1016}
 {"time_stamp": 21034.2502186, "action": "move", "x": 712, "y": 1005}
 {"time_stamp": 21034.256732, "action": "move", "x": 713, "y": 991}
 {"time_stamp": 21034.2646169, "action": "move", "x": 716, "y": 976}
 {"time_stamp": 21034.2729272, "action": "move", "x": 719, "y": 955}
 {"time_stamp": 21034.2813953, "action": "move", "x": 722, "y": 929}
 {"time_stamp": 21034.2889074, "action": "move", "x": 723, "y": 899}
 {"time_stamp": 21034.2971538, "action": "move", "x": 725, "y": 871}
 {"time_stamp": 21034.3049341, "action": "move", "x": 727, "y": 838}
 {"time_stamp": 21034.3130394, "action": "move", "x": 727, "y": 805}
 {"time_stamp": 21034.3208269, "action": "move", "x": 728, "y": 771}
 {"time_stamp": 21034.3289492, "action": "move", "x": 728, "y": 742}
 {"time_stamp": 21034.3367866, "action": "move", "x": 728, "y": 714}
 {"time_stamp": 21034.3446895, "action": "move", "x": 728, "y": 686}
 {"time_stamp": 21034.3528319, "action": "move", "x": 728, "y": 662}
 {"time_stamp": 21034.3606113, "action": "move", "x": 728, "y": 643}
 {"time_stamp": 21034.3686987, "action": "move", "x": 727, "y": 620}
 {"time_stamp": 21034.3766536, "action": "move", "x": 725, "y": 605}
 {"time_stamp": 21034.3847084, "action": "move", "x": 722, "y": 589}
 {"time_stamp": 21034.3930586, "action": "move", "x": 719, "y": 576}
 {"time_stamp": 21034.4009346, "action": "move", "x": 716, "y": 565}
 {"time_stamp": 21034.4090089, "action": "move", "x": 712, "y": 554}
 {"time_stamp": 21034.416996, "action": "move", "x": 710, "y": 544}
 {"time_stamp": 21034.4246653, "action": "move", "x": 708, "y": 536}
 {"time_stamp": 21034.4331124, "action": "move", "x": 707, "y": 527}
 {"time_stamp": 21034.4410156, "action": "move", "x": 706, "y": 519}
 {"time_stamp": 21034.4488925, "action": "move", "x": 705, "y": 509}
 {"time_stamp": 21034.4568042, "action": "move", "x": 705, "y": 500}
 {"time_stamp": 21034.4650783, "action": "move", "x": 704, "y": 492}
 {"time_stamp": 21034.472962, "action": "move", "x": 703, "y": 483}
 {"time_stamp": 21034.4809251, "action": "move", "x": 703, "y": 475}
 {"time_stamp": 21034.4889399, "action": "move", "x": 703, "y": 467}
 {"time_stamp": 21034.4968154, "action": "move", "x": 703, "y": 460}
 {"time_stamp": 21034.505111, "action": "move", "x": 703, "y": 454}
 {"time_stamp": 21034.5128327, "action": "move", "x": 703, "y": 446}
 {"time_stamp": 21034.5211697, "action": "move", "x": 704, "y": 439}
 {"time_stamp": 21034.5291453, "action": "move", "x": 704, "y": 432}
 {"time_stamp": 21034.53683, "action": "move", "x": 704, "y": 428}
 {"time_stamp": 21034.5453754, "action": "move", "x": 705, "y": 423}
 {"time_stamp": 21034.5531997, "action": "move", "x": 705, "y": 419}
 {"time_stamp": 21034.5610828, "action": "move", "x": 705, "y": 417}
 {"time_stamp": 21034.568917, "action": "move", "x": 705, "y": 414}
 {"time_stamp": 21034.5768693, "action": "move", "x": 705, "y": 412}
 {"time_stamp": 21034.5849601, "action": "move", "x": 706, "y": 409}
 {"time_stamp": 21034.5930116, "action": "move", "x": 706, "y": 406}
 {"time_stamp": 21034.6006017, "action": "move", "x": 706, "y": 404}
 {"time_stamp": 21034.6086777, "action": "move", "x": 706, "y": 402}
 {"time_stamp": 21034.6167229, "action": "move", "x": 706, "y": 400}
 {"time_stamp": 21034.6251342, "action": "move", "x": 706, "y": 398}
 {"time_stamp": 21034.6325694, "action": "move", "x": 706, "y": 396}
 {"time_stamp": 21034.6407476, "action": "move", "x": 706, "y": 393}
 {"time_stamp": 21034.6489079, "action": "move", "x": 707, "y": 390}
 {"time_stamp": 21034.6567719, "action": "move", "x": 707, "y": 388}
 {"time_stamp": 21034.6648437, "action": "move", "x": 707, "y": 386}
 {"time_stamp": 21034.6735978, "action": "move", "x": 707, "y": 383}
 {"time_stamp": 21034.6808034, "action": "move", "x": 707, "y": 381}
 {"time_stamp": 21034.6887831, "action": "move", "x": 707, "y": 379}
 {"time_stamp": 21034.6968931, "action": "move", "x": 707, "y": 377}
 {"time_stamp": 21034.7048123, "action": "move", "x": 707, "y": 375}
 {"time_stamp": 21034.7127621, "action": "move", "x": 706, "y": 373}
 {"time_stamp": 21034.7208214, "action": "move", "x": 706, "y": 372}
 {"time_stamp": 21034.7289712, "action": "move", "x": 705, "y": 371}
 {"time_stamp": 21034.7366015, "action": "move", "x": 705, "y": 370}
 {"time_stamp": 21034.7449792, "action": "move", "x": 705, "y": 369}
 {"time_stamp": 21034.7528215, "action": "move", "x": 705, "y": 368}
 {"time_stamp": 21034.7611243, "action": "move", "x": 705, "y": 367}
 {"time_stamp": 21034.7689338, "action": "move", "x": 705, "y": 366}
 {"time_stamp": 21034.7768638, "action": "move", "x": 705, "y": 365}
 {"time_stamp": 21034.7849091, "action": "move", "x": 705, "y": 364}
 {"time_stamp": 21034.792848, "action": "move", "x": 705, "y": 363}
 {"time_stamp": 21034.8010344, "action": "move", "x": 705, "y": 362}
 {"time_stamp": 21034.809155, "action": "move", "x": 704, "y": 362}
 {"time_stamp": 21034.8166183, "action": "move", "x": 704, "y": 359}
 {"time_stamp": 21034.8249556, "action": "move", "x": 704, "y": 358}
 {"time_stamp": 21034.8333238, "action": "move", "x": 704, "y": 356}
 {"time_stamp": 21034.8410045, "action": "move", "x": 703, "y": 354}
 {"time_stamp": 21034.8486685, "action": "move", "x": 703, "y": 352}
 {"time_stamp": 21034.857368, "action": "move", "x": 703, "y": 350}
 {"time_stamp": 21034.8647224, "action": "move", "x": 703, "y": 347}
 {"time_stamp": 21034.8730798, "action": "move", "x": 703, "y": 346}
 {"time_stamp": 21034.8809692, "action": "move", "x": 703, "y": 342}
 {"time_stamp": 21034.8889165, "action": "move", "x": 703, "y": 341}
 {"time_stamp": 21034.8969094, "action": "move", "x": 704, "y": 339}
 {"time_stamp": 21034.9052672, "action": "move", "x": 704, "y": 337}
 {"time_stamp": 21034.9145868, "action": "move", "x": 704, "y": 335}
 {"time_stamp": 21034.9208561, "action": "move", "x": 704, "y": 334}
 {"time_stamp": 21034.928931, "action": "move", "x": 704, "y": 333}
 {"time_stamp": 21034.9374176, "action": "move", "x": 704, "y": 332}
 {"time_stamp": 21034.9451258, "action": "move", "x": 704, "y": 330}
 {"time_stamp": 21034.9528709, "action": "move", "x": 704, "y": 329}
 {"time_stamp": 21034.9611476, "action": "move", "x": 704, "y": 328}
 {"time_stamp": 21034.968991, "action": "move", "x": 704, "y": 327}
 {"time_stamp": 21034.9768394, "action": "move", "x": 705, "y": 325}
 {"time_stamp": 21034.9848553, "action": "move", "x": 705, "y": 324}
 {"time_stamp": 21034.993121, "action": "move", "x": 705, "y": 323}
 {"time_stamp": 21035.0007992, "action": "move", "x": 706, "y": 322}
 {"time_stamp": 21035.0088762, "action": "move", "x": 707, "y": 320}
 {"time_stamp": 21035.0166123, "action": "move", "x": 707, "y": 320}
 {"time_stamp": 21035.0247724, "action": "move", "x": 708, "y": 318}
 {"time_stamp": 21035.0335071, "action": "move", "x": 708, "y": 317}
 {"time_stamp": 21035.0411458, "action": "move", "x": 709, "y": 317}
 {"time_stamp": 21035.0491997, "action": "move", "x": 709, "y": 316}
 {"time_stamp": 21035.0569637, "action": "move", "x": 711, "y": 314}
 {"time_stamp": 21035.06496, "action": "move", "x": 711, "y": 313}
 {"time_stamp": 21035.0726588, "action": "move", "x": 712, "y": 312}
 {"time_stamp": 21035.0807214, "action": "move", "x": 713, "y": 311}
 {"time_stamp": 21035.0888078, "action": "move", "x": 713, "y": 309}
 {"time_stamp": 21035.0972443, "action": "move", "x": 713, "y": 309}
 {"time_stamp": 21035.1048868, "action": "move", "x": 714, "y": 308}
 {"time_stamp": 21035.1127551, "action": "move", "x": 715, "y": 307}
 {"time_stamp": 21035.1208842, "action": "move", "x": 715, "y": 306}
 {"time_stamp": 21035.1285261, "action": "move", "x": 715, "y": 306}
 {"time_stamp": 21035.1366862, "action": "move", "x": 715, "y": 305}
 {"time_stamp": 21035.1446592, "action": "move", "x": 716, "y": 305}
 {"time_stamp": 21035.1528109, "action": "move", "x": 716, "y": 305}
 {"time_stamp": 21035.1848109, "action": "move", "x": 716, "y": 304}
 {"time_stamp": 21035.208994, "action": "move", "x": 717, "y": 304}
 {"time_stamp": 21035.2571327, "action": "move", "x": 717, "y": 304}
 {"time_stamp": 21035.2573543, "action": "click", "x": 717, "y": 304, "button": "left", "pressed": true}
 {"time_stamp": 21035.3377191, "action": "move", "x": 717, "y": 304}
 {"time_stamp": 21035.3379572, "action": "click", "x": 717, "y": 304, "button": "left", "pressed": false}
--- a/utils/image_processing/contour.py
+++ b/utils/image_processing/contour.py
@@ -0,0 +1,34 @@
 import cv2
 from matplotlib import pyplot as plt
 # Load the image (the relative path is resolved against the current working directory)
 image = cv2.imread('../../mm_agents/stackoverflow.png')
 # cv2.imread reports a missing or undecodable file by returning None instead of
 # raising, so fail here with a clear message rather than inside cvtColor below.
 if image is None:
    raise FileNotFoundError("Could not read image '../../mm_agents/stackoverflow.png'")
 # Convert to grayscale
 gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
 # Apply adaptive thresholding to get a binary image
 # (inverted output, 11-pixel neighbourhood, constant 2 subtracted from the weighted mean)
 thresh = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 2
 )
 # Find contours (outermost contours only, with compressed point lists)
 contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
 # Filter out contours that are not of cell size
 # This is done by assuming that cells will have a relatively standard size
 # The size filter is just a placeholder, real values depend on the actual image size
 min_cell_size = 500
 max_cell_size = 5000
 cell_contours = [cnt for cnt in contours if min_cell_size < cv2.contourArea(cnt) < max_cell_size]
 # Draw contours on a copy so the loaded image itself stays untouched
 contour_output = image.copy()
 cv2.drawContours(contour_output, cell_contours, -1, (0, 255, 0), 2)
 # Display the image with cell contours (OpenCV stores BGR, matplotlib expects RGB)
 plt.figure(figsize=(12,6))
 plt.imshow(cv2.cvtColor(contour_output, cv2.COLOR_BGR2RGB))
 plt.title('Spreadsheet with Cell Contours')
 plt.axis('off')
 plt.show()
--- a/utils/image_processing/point_marking.py
+++ b/utils/image_processing/point_marking.py
@@ -0,0 +1,32 @@
 from PIL import Image, ImageDraw
 def mark_point(image_path: str, x: int, y: int, radius: int = 5, color: str = 'red') -> str:
    """
    Mark a point on an image and save the result as a new file.

    A filled circle of the given radius and color is drawn centered on (x, y).
    The result is written to a path formed by inserting '_marked' before the
    file extension of image_path (e.g. 'shot.png' -> 'shot_marked.png').

    Args:
        image_path: Path of the image to load.
        x: Horizontal pixel coordinate of the circle's center.
        y: Vertical pixel coordinate of the circle's center.
        radius: Radius of the circle in pixels.
        color: Fill and outline color passed to ImageDraw.

    Returns:
        The path of the saved, marked image.
    """
    import os  # function-scope import: the module itself only imports PIL
    # Split on the real extension instead of assuming it is exactly four
    # characters long, so '.jpeg', '.webp', '.tiff' etc. are handled correctly.
    stem, extension = os.path.splitext(image_path)
    marked_image_path = stem + '_marked' + extension
    # The context manager closes the underlying file handle when we are done
    with Image.open(image_path) as image:
        # Create a draw object
        draw = ImageDraw.Draw(image)
        # Draw a small circle to mark the point
        draw.ellipse((x - radius, y - radius, x + radius, y + radius), fill=color, outline=color)
        # Save the image with the point marked
        image.save(marked_image_path)
    return marked_image_path
 if __name__ == '__main__':
    # Demo run: mark a large red dot at (100, 200) on a sample screenshot
    image_path, x, y = 'chrome_start.png', 100, 200
    radius, color = 30, 'red'
    marked_image_path = mark_point(image_path, x, y, radius=radius, color=color)
    print(f"Marked image saved to {marked_image_path}")