From 992d8f8fcedfe801a688a72bcaf8977742a5acd0 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Sat, 2 Dec 2023 17:52:00 +0800
Subject: [PATCH] Refactor with pyautogui

---
 SERVER_SETUP.md                     |  21 +--
 desktop_env/controllers/keyboard.py |  56 --------
 desktop_env/controllers/mouse.py    | 162 -----------------------
 desktop_env/controllers/python.py   |  40 +++---
 desktop_env/controllers/xdotool.py  |  11 --
 desktop_env/envs/desktop_env.py     | 194 ++++------------------------
 desktop_env/server/main.py          |  64 +++++++++
 desktop_env/windows_server/main.py  |  27 ----
 gpt_4v_agent_exp.py                 |  19 ++-
 main.py                             |  63 +++------
 mm_agents/gpt_4v_agent.py           |   2 +
 11 files changed, 144 insertions(+), 515 deletions(-)
 delete mode 100644 desktop_env/controllers/keyboard.py
 delete mode 100644 desktop_env/controllers/mouse.py
 delete mode 100644 desktop_env/controllers/xdotool.py
 create mode 100644 desktop_env/server/main.py
 delete mode 100644 desktop_env/windows_server/main.py
diff --git a/SERVER_SETUP.md b/SERVER_SETUP.md
index f67c01e..231bd9e 100644
--- a/SERVER_SETUP.md
+++ b/SERVER_SETUP.md
@@ -1,23 +1,6 @@
 # Server Setup Guide
 
-- [Linux](#linux)
-- [Windows](#windows)
-
-## Linux
-
-<https://averagelinuxuser.com/ssh-into-virtualbox/>
-
-1. `sudo apt install openssh-server`
-2. `sudo systemctl enable ssh --now`
-3. `sudo ufw disable` (disable firewall - safe for local network, otherwise `sudo ufw allow ssh`)
-4. `ip a` - find ip address
-5. ssh username@<ip_address>
-6. On host, run `ssh-copy-id <username>@<ip_address>`
-
-
-## Windows
-
-1. Copy and paste the file `windows_server/main.py` to the windows vm
-2. Make sure `mouse` and `keyboard` are installed
+1. Copy and paste the file `server/main.py` to the windows vm
+2. Install the requirements `pip install -r requirements.txt`
 3. Run the file `python main.py`
 4. `ipconfig /all` and find the ip address
\ No newline at end of file
diff --git a/desktop_env/controllers/keyboard.py b/desktop_env/controllers/keyboard.py
deleted file mode 100644
index 5178491..0000000
--- a/desktop_env/controllers/keyboard.py
+++ /dev/null
@@ -1,56 +0,0 @@
-from abc import ABC, abstractmethod
-from fabric import Connection
-
-from .xdotool import XDoToolController
-from .python import PythonController
-
-class AbstractKeyboardController(ABC):
-    @abstractmethod
-    def type(self, text: str):
-        raise NotImplementedError
-
-    @abstractmethod
-    def key(self, key: str):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def key_down(self, key: str):
-        raise NotImplementedError
-
-    @abstractmethod
-    def key_up(self, key: str):
-        raise NotImplementedError
-
-
-class XDoToolKeyboardController(AbstractKeyboardController, XDoToolController):
-    def __init__(self, ssh_connection: Connection):
-        super().__init__(ssh_connection=ssh_connection)
-
-    def type(self, text: str):
-        self._execute_xdotool_command(f"type {text}")
-
-    def key(self, key: str):
-        self._execute_xdotool_command(f"key {key}")
-
-    def key_down(self, key: str):
-        self._execute_xdotool_command(f"keydown {key}")
-
-    def key_up(self, key: str):
-        self._execute_xdotool_command(f"keyup {key}")
-
-class PythonKeyboardController(AbstractKeyboardController, PythonController):
-    def __init__(self, http_server: str):
-        super().__init__(http_server=http_server)
-        self.command = "python -c \"import keyboard; {command}\""
-
-    def type(self, text: str):
-        self._execute_python_command(self.command.format(command=f"keyboard.write('{text}')"))
-
-    def key(self, key: str):
-        self._execute_python_command(self.command.format(command=f"keyboard.press_and_release('{key}')"))
-
-    def key_down(self, key: str):
-        self._execute_python_command(self.command.format(command=f"keyboard.press('{key}')"))
-
-    def key_up(self, key: str):
-        self._execute_python_command(self.command.format(command=f"keyboard.release('{key}')"))
diff --git a/desktop_env/controllers/mouse.py b/desktop_env/controllers/mouse.py
deleted file mode 100644
index 833f4ee..0000000
--- a/desktop_env/controllers/mouse.py
+++ /dev/null
@@ -1,162 +0,0 @@
-from enum import Enum
-
-from abc import ABC, abstractmethod
-from fabric import Connection
-import re
-
-from .xdotool import XDoToolController
-from .python import PythonController
-class MouseClick(Enum):
-    LEFT = 1
-    MIDDLE = 2
-    RIGHT = 3
-    WHEEL_UP = 4
-    WHEEL_DOWN = 5
-
-class AbstractMouseController(ABC):
-    @abstractmethod
-    def get_mouse(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def mouse_move(self, x: int, y: int):
-        raise NotImplementedError
-
-    @abstractmethod
-    def left_down(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def left_up(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def left_click(self):
-        raise NotImplementedError
-
-    @abstractmethod
-    def middle_down(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def middle_up(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def middle_click(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def right_down(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def right_up(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def right_click(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def scroll_up(self):
-        raise NotImplementedError
-    
-    @abstractmethod
-    def scroll_down(self):
-        raise NotImplementedError
-
-
-class XDoToolMouseController(AbstractMouseController, XDoToolController):
-    def __init__(self, ssh_connection: Connection):
-        super().__init__(ssh_connection=ssh_connection)
-
-    def get_mouse(self):
-        output = self._execute_xdotool_command(f"")
-        parts = output.split(" ")
-        x = int(parts[0].split(":")[1])
-        y = int(parts[1].split(":")[1])
-        return x, y
-
-    def mouse_move(self, x: int, y: int):
-        self._execute_xdotool_command(f"mousemove {x} {y}")
-
-    def left_down(self):
-        self._execute_xdotool_command(f"mousedown 1")
-    
-    def left_up(self):
-        self._execute_xdotool_command(f"mouseup 1")
-
-    def left_click(self):
-        self._execute_xdotool_command(f"click 1")
-
-    def middle_down(self):
-        self._execute_xdotool_command(f"mousedown 2")
-    
-    def middle_up(self):
-        self._execute_xdotool_command(f"mouseup 2")
-
-    def middle_click(self):
-        self._execute_xdotool_command(f"click 2")
-    
-    def right_down(self):
-        self._execute_xdotool_command(f"mousedown 3")
-    
-    def right_up(self):
-        self._execute_xdotool_command(f"mouseup 3")
-    
-    def right_click(self):
-        self._execute_xdotool_command(f"click 3")
-    
-    def scroll_up(self):
-        self._execute_xdotool_command(f"click 4")
-    
-    def scroll_down(self):
-        self._execute_xdotool_command(f"click 5")
-
-class PythonMouseController(AbstractMouseController, PythonController):
-    def __init__(self, http_server: str):
-        super().__init__(http_server=http_server)
-        self.command = "py -c \"import mouse; {command}\""
-
-    def get_mouse(self):
-        response = self._execute_python_command(self.command.format(command=f"print(mouse.get_position())"))
-        numbers = re.findall(r'-?\d+', response["output"])
-        x, y = map(int, numbers)
-        return x, y
-
-    def mouse_move(self, x: int, y: int):
-        self._execute_python_command(self.command.format(command=f"mouse.move({x}, {y})"))
-
-    def left_down(self):
-        self._execute_python_command(self.command.format(command="mouse.press(button='left')"))
-    
-    def left_up(self):
-        self._execute_python_command(self.command.format(command="mouse.release(button='left')"))
-
-    def left_click(self):
-        self._execute_python_command(self.command.format(command="mouse.click(button='left')"))
-
-    def middle_down(self):
-        self._execute_python_command(self.command.format(command="mouse.press(button='middle')"))
-    
-    def middle_up(self):
-        self._execute_python_command(self.command.format(command="mouse.release(button='middle')"))
-
-    def middle_click(self):
-        self._execute_python_command(self.command.format(command="mouse.click(button='middle')"))
-    
-    def right_down(self):
-        self._execute_python_command(self.command.format(command="mouse.press(button='right')"))
-    
-    def right_up(self):
-        self._execute_python_command(self.command.format(command="mouse.release(button='right')"))
-    
-    def right_click(self):
-        self._execute_python_command(self.command.format(command="mouse.click(button='right')"))
-    
-    def scroll_up(self):
-        self._execute_python_command(self.command.format(command="mouse.wheel(10)"))
-    
-    def scroll_down(self):
-        self._execute_python_command(self.command.format(command="mouse.wheel(-10)"))
\ No newline at end of file
diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 1e6a751..2e2c804 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -1,15 +1,31 @@
-import requests
 import json
 
+import requests
+
 
 class PythonController:
-    def __init__(self, http_server: str):
+    def __init__(self, http_server: str, pkgs_prefix: str = "py -c \"import pyautogui; {command}\""):
         self.http_server = http_server
+        self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
 
-    def _execute_python_command(self, command: str) -> None:
-        payload = json.dumps({
-            "command": command
-        })
+    def get_screenshot(self):
+        """
+        Gets a screenshot from the server. With the cursor.
+        """
+        response = requests.get(self.http_server + "/screenshot")
+        if response.status_code == 200:
+            return response.content
+        else:
+            print("Failed to get screenshot. Status code:", response.status_code)
+            return None
+
+    def execute_python_command(self, command: str) -> None:
+        """
+        Executes a python command on the server.
+        It can be used to execute the pyautogui commands, or... any other python command. who knows?
+        """
+        command = self.pkgs_prefix.format(command=command)
+        payload = json.dumps({"command": command})
         headers = {
             'Content-Type': 'application/json'
         }
@@ -23,15 +39,3 @@ class PythonController:
             return response.json()
         except requests.exceptions.RequestException as e:
             print("An error occurred while trying to execute the command:", e)
-
-
-# example usage
-if __name__ == '__main__':
-    # replace with your actual server URL of the vm
-    server_url = "http://192.168.7.129:5000"
-    controller = PythonController(server_url)
-
-    # example commands    
-    python_command = "python -c \"import keyboard; keyboard.write('hello world')\""
-    python_command = "python -c \"import mouse; mouse.move(100,100);mouse.right_click()\""
-    controller._execute_python_command(python_command)
diff --git a/desktop_env/controllers/xdotool.py b/desktop_env/controllers/xdotool.py
deleted file mode 100644
index abb268f..0000000
--- a/desktop_env/controllers/xdotool.py
+++ /dev/null
@@ -1,11 +0,0 @@
-from fabric import Connection
-from typing import List
-
-
-class XDoToolController:
-    def __init__(self, ssh_connection: Connection):
-        self.ssh_connection = ssh_connection
-    
-    def _execute_xdotool_command(self, command: List[str]) -> None:
-        result = self.ssh_connection.run(f"DISPLAY=:0 xdotool {command}", hide=True)
-        return result.stdout.strip()
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index d032f87..5485fe0 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,34 +1,20 @@
+from __future__ import annotations
+
 import os
-from enum import Enum
-from typing import Literal, List, Tuple
 import subprocess
-from fabric import Connection
 import time
+import uuid
+from typing import List
 
 import gymnasium as gym
-from gymnasium import spaces
-import numpy as np
-import uuid
-from PIL import Image
 
-from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
-    PythonMouseController
-from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
-    PythonKeyboardController
+from desktop_env.controllers.python import PythonController
 
 
-class Action(Enum):
-    CLICK = 0
-    MOUSE_DOWN = 1
-    MOUSE_UP = 2
-    MOUSE_MOVE = 3
-    KEY = 4
-    KEY_DOWN = 5
-    KEY_UP = 6
-    TYPE = 7
-
-
-VM_TYPE = Literal['ubuntu', 'windows']
+def _execute_command(command: List[str]) -> None:
+    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
+    if result.returncode != 0:
+        raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
 
 
 class DesktopEnv(gym.Env):
@@ -37,67 +23,20 @@ class DesktopEnv(gym.Env):
     def __init__(
             self,
             path_to_vm: str,
-            username: str,
-            password: str = None,
             host: str = "192.168.7.128:5000",
             snapshot_path: str = "base",
-            vm_os: VM_TYPE = "ubuntu"
     ):
-
-        # The path to the vmx file of your vm
+        # Initialize environment variables
         self.path_to_vm = path_to_vm
-
-        # username and password for your vm
-        self.username = username
-        self.password = password
-
         self.host = host
         self.snapshot_path = snapshot_path  # todo: handling the logic of snapshot directory
 
-        # Initialize emulator
+        # Initialize emulator and controller
         print("Initializing...")
         self._start_emulator()
+        self.controller = PythonController(http_server=self.host)
 
-        # set up controllers
-        self.mouse_controller, self.keyboard_controller = self._create_controllers(vm_os)
-
-        # Get the screen size
-        self.screen_width, self.screen_height = self._get_screensize()
-
-        # Define the action and observation space
-        self.action_space = spaces.Dict({
-            "action_type": spaces.Discrete(len(Action)),
-            "click_type": spaces.Discrete(len(MouseClick)),
-            "x": spaces.Discrete(self.screen_width),
-            "y": spaces.Discrete(self.screen_height),
-            "key": spaces.MultiDiscrete([128] * 10),  # max 10 characters, ASCII
-            "text": spaces.MultiDiscrete([128] * 10)  # max 10 characters, ASCII
-        })
-
-        self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
-                                            dtype=np.uint8)
-
-        # Additional setup
-        self.metadata = {'render.modes': ['rgb_array']}
-
-
-    def _get_screensize(self):
-        screenshot_path = self._get_obs()
-        img = Image.open(screenshot_path)
-        return img.size
-
-    def _create_controllers(self, vm_os: VM_TYPE) -> Tuple[AbstractMouseController, AbstractKeyboardController]:
-        if vm_os == "ubuntu":
-            ssh_connection = Connection(host=self.host, user=self.username, connect_kwargs={"password": self.password})
-            mouse_controller = XDoToolMouseController(ssh_connection)
-            keyboard_controller = XDoToolKeyboardController(ssh_connection)
-        elif vm_os == "windows":
-            mouse_controller = PythonMouseController(http_server=self.host)
-            keyboard_controller = PythonKeyboardController(http_server=self.host)
-        else:
-            raise NotImplementedError(vm_os)
-
-        return mouse_controller, keyboard_controller
+        # todo: define the action space and the observation space as gym did
 
     def _start_emulator(self):
         while True:
@@ -109,52 +48,35 @@ class DesktopEnv(gym.Env):
                     break
                 else:
                     print("Starting VM...")
-                    self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
+                    _execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
                     time.sleep(10)
             except subprocess.CalledProcessError as e:
                 print(f"Error executing command: {e.output.decode().strip()}")
 
-    def _execute_command(self, command: List[str]) -> None:
-        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
-        if result.returncode != 0:
-            raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
-
     def _save_state(self):
-        self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
+        _execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
 
     def _get_screenshot(self):
         random_uuid = str(uuid.uuid4())
         os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
         image_path = os.path.join("tmp", random_uuid, "screenshot.png")
 
-        if self.password:
-            self._execute_command(
-                ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm, image_path])
-        else:
-            self._execute_command(
-                ["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
+        # Get the screenshot and save to the image_path
+        screenshot = self.controller.get_screenshot()
+        with open(image_path, "wb") as f:
+            f.write(screenshot)
 
         return image_path
 
     def _get_obs(self):
         screenshot_image_path = self._get_screenshot()
-        self._add_cursor(screenshot_image_path)
         return screenshot_image_path
-    
-    def _add_cursor(self, img_path: str):
-        x, y = self.mouse_controller.get_mouse()
-        cursor_image = Image.open("./desktop_env/assets/cursor.png")
-        cursor_image = cursor_image.resize((int(cursor_image.width / 2), int(cursor_image.height / 2)))
 
-        screenshot = Image.open(img_path)
-        screenshot.paste(cursor_image, (x, y), cursor_image)
-        screenshot.save(img_path)
-
-    def reset(self):
+    def reset(self, seed=None, options=None):
         print("Resetting environment...")
 
         print("Reverting to snapshot to {}...".format(self.snapshot_path))
-        self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
+        _execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
         time.sleep(5)
 
         print("Starting emulator...")
@@ -165,75 +87,11 @@ class DesktopEnv(gym.Env):
         return observation
 
     def step(self, action):
-        if isinstance(action, list):
-            for a in action:
-                observation, reward, done, info = self.step(a)
-            return observation, reward, done, info
-
-        # todo: handle the case when the action is not a single action
-        try:
-            action_type = Action(action['action_type'])
-        except KeyError:
-            done = True
-            return self._get_obs(), 0, done, {}
-
-        if action_type == Action.CLICK:
-            click = MouseClick(action['click_type'])
-            if click == MouseClick.LEFT:
-                self.mouse_controller.left_click()
-            elif click == MouseClick.MIDDLE:
-                self.mouse_controller.middle_click()
-            elif click == MouseClick.RIGHT:
-                self.mouse_controller.right_click()
-            elif click == MouseClick.WHEEL_UP:
-                self.mouse_controller.scroll_up()
-            elif click == MouseClick.WHEEL_DOWN:
-                self.mouse_controller.scroll_down()
-        elif action_type == Action.MOUSE_DOWN:
-            click = MouseClick(action['click_type'])
-            if click == MouseClick.LEFT:
-                self.mouse_controller.left_down()
-            elif click == MouseClick.MIDDLE:
-                self.mouse_controller.middle_down()
-            elif click == MouseClick.RIGHT:
-                self.mouse_controller.right_down()
-            elif click == MouseClick.WHEEL_UP:
-                self.mouse_controller.scroll_up()
-            elif click == MouseClick.WHEEL_DOWN:
-                self.mouse_controller.scroll_down()
-        elif action_type == Action.MOUSE_UP:
-            click = MouseClick(action['click_type'])
-            if click == MouseClick.LEFT:
-                self.mouse_controller.left_up()
-            elif click == MouseClick.MIDDLE:
-                self.mouse_controller.middle_up()
-            elif click == MouseClick.RIGHT:
-                self.mouse_controller.right_up()
-            elif click == MouseClick.WHEEL_UP:
-                self.mouse_controller.scroll_up()
-            elif click == MouseClick.WHEEL_DOWN:
-                self.mouse_controller.scroll_down()
-        elif action_type == Action.MOUSE_MOVE:
-            self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
-        elif action_type == Action.KEY:
-            self.keyboard_controller.key(action['key'])
-        elif action_type == Action.KEY_DOWN:
-            self.keyboard_controller.key_down(action['key'])
-        elif action_type == Action.KEY_UP:
-            self.keyboard_controller.key_up(action['key'])
-        elif action_type == Action.TYPE:
-            for key in action['text']:
-                if key == "\r" or key == "\n":
-                    self.keyboard_controller.key("enter")
-                else:
-                    self.keyboard_controller.key(key)
-                # sleep for 0.05 seconds with some random noise
-                time.sleep(0.05 + np.random.normal(0, 0.01))
-
-        # Capture new state
+        # Our action space is the set of all possible python commands insides `pyautogui`
+        self.controller.execute_python_command(action)
         observation = self._get_obs()
-        reward = 0  # Define reward calculation
-        done = False  # Define episode termination condition
+        reward = 0  # todo: Define reward calculation for each example
+        done = False  # todo: Define episode termination condition for each example
         info = {}
         return observation, reward, done, info
 
@@ -244,4 +102,4 @@ class DesktopEnv(gym.Env):
             raise ValueError('Unsupported render mode: {}'.format(mode))
 
     def close(self):
-        self._execute_command(["vmrun", "stop", self.path_to_vm])
+        _execute_command(["vmrun", "stop", self.path_to_vm])
diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py
new file mode 100644
index 0000000..228a08d
--- /dev/null
+++ b/desktop_env/server/main.py
@@ -0,0 +1,64 @@
+import os
+import platform
+import subprocess
+
+import Xlib.display
+import pyautogui
+from PIL import ImageGrab
+from flask import Flask, request, jsonify, send_file
+
+app = Flask(__name__)
+
+pyautogui.PAUSE = 0
+pyautogui.DARWIN_CATCH_UP_TIME = 0
+
+
+@app.route('/execute', methods=['POST'])
+def execute_command():
+    data = request.json
+    # The 'command' key in the JSON request should contain the command to be executed.
+    command = data.get('command', '')
+
+    # Execute the command without any safety checks.
+    try:
+        result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
+        return jsonify({
+            'status': 'success',
+            'output': result.stdout,
+            'error': result.stderr
+        })
+    except Exception as e:
+        return jsonify({
+            'status': 'error',
+            'message': str(e)
+        }), 500
+
+
+@app.route('/screenshot', methods=['GET'])
+def capture_screen_with_cursor():
+    file_path = os.path.join("screenshots", "screenshot.png")
+    user_platform = platform.system()
+
+    # Ensure the screenshots directory exists
+    os.makedirs(os.path.dirname(file_path), exist_ok=True)
+
+    if user_platform == "Windows":
+        screenshot = pyautogui.screenshot()
+        screenshot.save(file_path)
+    elif user_platform == "Linux":
+        # Use xlib to prevent scrot dependency for Linux
+        screen = Xlib.display.Display().screen()
+        size = screen.width_in_pixels, screen.height_in_pixels
+        screenshot = ImageGrab.grab(bbox=(0, 0, size[0], size[1]))
+        screenshot.save(file_path)
+    elif user_platform == "Darwin":  # (Mac OS)
+        # Use the screencapture utility to capture the screen with the cursor
+        subprocess.run(["screencapture", "-C", file_path])
+    else:
+        print(f"The platform you're using ({user_platform}) is not currently supported")
+
+    return send_file(file_path, mimetype='image/png')
+
+
+if __name__ == '__main__':
+    app.run(debug=True, host="0.0.0.0")
diff --git a/desktop_env/windows_server/main.py b/desktop_env/windows_server/main.py
deleted file mode 100644
index 098dca5..0000000
--- a/desktop_env/windows_server/main.py
+++ /dev/null
@@ -1,27 +0,0 @@
-from flask import Flask, request, jsonify
-import subprocess
-
-app = Flask(__name__)
-
-@app.route('/execute', methods=['POST'])
-def execute_command():
-    data = request.json
-    # The 'command' key in the JSON request should contain the command to be executed.
-    command = data.get('command', '')
-
-    # Execute the command without any safety checks.
-    try:
-        result = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
-        return jsonify({
-            'status': 'success',
-            'output': result.stdout,
-            'error': result.stderr
-        })
-    except Exception as e:
-        return jsonify({
-            'status': 'error',
-            'message': str(e)
-        }), 500
-
-if __name__ == '__main__':
-    app.run(debug=True, host="0.0.0.0")
diff --git a/gpt_4v_agent_exp.py b/gpt_4v_agent_exp.py
index bae0446..12b5dd0 100644
--- a/gpt_4v_agent_exp.py
+++ b/gpt_4v_agent_exp.py
@@ -8,29 +8,26 @@ import uuid
 def gpt_4v_agent():
     api_key = os.environ.get("OPENAI_API_KEY")
 
-    # meta_info = {
-    #     "instruction": "Open WSJ website to get latest news",
-    #     "task_name": "open_wsj",
-    #     "snapshot_path": "base",
-    # }
-
     meta_info = {
-        "instruction": "Clear the recycle bin",
-        "task_name": "clean_recycle_bin",
+        "instruction": "Open WSJ website to get latest news",
+        "task_name": "open_wsj",
         "snapshot_path": "base",
     }
 
+    # meta_info = {
+    #     "instruction": "Clear the recycle bin",
+    #     "task_name": "clean_recycle_bin",
+    #     "snapshot_path": "base",
+    # }
+
     agent = GPT4v_Agent(api_key=api_key, instruction=meta_info["instruction"])
     env = DesktopEnv(
         path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
         # automitically load the snapshot and start the vm
         #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
         snapshot_path="base",
-        username="tianbaox",
-        password="951753",
         #  host="192.168.7.128",
         host="http://192.168.13.128:5000",
-        vm_os="windows"
     )
 
     # reset the environment to certain snapshot
diff --git a/main.py b/main.py
index 2cbcd18..57c1926 100644
--- a/main.py
+++ b/main.py
@@ -1,55 +1,31 @@
-from pprint import pprint
-from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
-
-def get_human_action():
-    """
-    Prompts the human player for an action and returns a structured action.
-    """
-    print("\nAvailable actions:", [action.name for action in Action])
-    action_type = None
-    while action_type not in [action.value for action in Action]:
-        action_type = Action[input("Enter the type of action: ".strip())].value
-
-    action = {"action_type": action_type}
-
-    if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-        print("\n Available clicks:", [action.name for action in MouseClick])
-        click_type = input("Enter click type: ")
-        action["click_type"] = MouseClick[click_type].value
-
-    if action_type == Action.MOUSE_MOVE.value:
-        x = int(input("Enter x-coordinate for mouse move: "))
-        y = int(input("Enter y-coordinate for mouse move: "))
-        action["x"] = x
-        action["y"] = y
-
-    if action_type == Action.KEY.value:
-        key = input("Enter the key to press: ")
-        action["key"] = [ord(c) for c in key]
-
-    if action_type == Action.TYPE.value:
-        text = input("Enter the text to type: ")
-        action["text"] = [ord(c) for c in text]
-
-    return action
+from desktop_env.envs.desktop_env import DesktopEnv
 
 
 def human_agent():
     """
     Runs the Gym environment with human input.
     """
-    env = DesktopEnv(path_to_vm="/home/yuri/vmware/Windows 10 x64/Windows 10 x64.vmx",
-                    #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
-                 username="user",
-                 password="password",
-                #  host="192.168.7.128",
-                 host="http://192.168.7.129:5000",
-                 vm_os="windows")
-    observation = env.reset()
+    env = DesktopEnv(
+        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
+        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
+        #  host="192.168.7.128",
+        host="http://192.168.13.128:5000",
+    )
+
+    # reset the environment to certain snapshot
+    # observation = env.reset()
     done = False
 
     while not done:
-        action = get_human_action()
+        # action = get_human_action()
+
+        # action = {
+        #     "action_type": 0,
+        #     "click_type": 3,
+        # }
+
+        action = "pyautogui.dragTo(100, 200, button='left')"
+
         observation, reward, done, info = env.step(action)
         print("Observation:", observation)
         print("Reward:", reward)
@@ -64,5 +40,6 @@ def human_agent():
     env.close()
     print("Environment closed.")
 
+
 if __name__ == "__main__":
     human_agent()
diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py
index c52b9c9..52b4dcf 100644
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -88,6 +88,8 @@ class GPT4v_Agent:
         traj_to_show = []
         for i in range(len(self.trajectory)):
             traj_to_show.append(self.trajectory[i]["content"][0]["text"])
+            if len(self.trajectory[i]["content"]) > 1:
+                traj_to_show.append("screenshot_obs")
         print("Trajectory:", traj_to_show)
         payload = {
             "model": self.model,