Improve: fix bugs; add back the cursor in screenshot; add pause in env.step

2023-12-02 22:14:50 +08:00
parent e51ef4b91d
commit 487fb8005b
5 changed files with 36 additions and 6 deletions
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -4,7 +4,7 @@ import requests


 class PythonController:
-    def __init__(self, http_server: str, pkgs_prefix: str = "py -c \"import pyautogui; {command}\""):
+    def __init__(self, http_server: str, pkgs_prefix: str = "python -c \"import pyautogui; {command}\""):
        self.http_server = http_server
        self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages

--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -85,10 +85,13 @@ class DesktopEnv(gym.Env):
        observation = self._get_obs()
        return observation

-    def step(self, action):
+    def step(self, action, pause=0.5):
        # todo: support both the action space of our-designed space and the executable code space in pyautogui
        # Our action space is the set of all possible python commands insides `pyautogui`
        self.controller.execute_python_command(action)
+
+        # todo: ideally, wait until rendering has actually finished instead of sleeping for a fixed pause
+        time.sleep(pause)
        observation = self._get_obs()
        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -1,12 +1,14 @@
 import os
 import platform
 import subprocess
+import requests

 import Xlib.display
 import pyautogui
-from PIL import ImageGrab
+from PIL import ImageGrab, Image
 from flask import Flask, request, jsonify, send_file

+
 app = Flask(__name__)

 pyautogui.PAUSE = 0
@@ -43,7 +45,19 @@ def capture_screen_with_cursor():
    os.makedirs(os.path.dirname(file_path), exist_ok=True)

    if user_platform == "Windows":
+        def _download_image(url, path):
+            response = requests.get(url)
+            with open(path, 'wb') as file:
+                file.write(response.content)
+
+        cursor_path = os.path.join("screenshots", "cursor.png")
+        if not os.path.exists(cursor_path):
+            cursor_url = "https://vip.helloimg.com/images/2023/12/02/oQPzmt.png"
+            _download_image(cursor_url, cursor_path)
        screenshot = pyautogui.screenshot()
+        cursor_x, cursor_y = pyautogui.position()
+        cursor = Image.open(cursor_path)
+        screenshot.paste(cursor, (cursor_x, cursor_y), cursor)
        screenshot.save(file_path)
    elif user_platform == "Linux":
        # Use xlib to prevent scrot dependency for Linux
@@ -60,5 +74,15 @@ def capture_screen_with_cursor():
    return send_file(file_path, mimetype='image/png')


+@app.route('/platform', methods=['GET'])
+def get_platform():
+    return platform.system()  # response body is the OS name string reported by platform.system()
+
+
+@app.route('/cursor_position', methods=['GET'])
+def get_cursor_position():  # GET -> JSON object {"x": ..., "y": ...} with the current cursor coordinates
+    return jsonify(dict(zip(("x", "y"), pyautogui.position())))  # one read; a bare (x, y) tuple would be parsed by Flask as (body, status)
+
+
 if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0")
--- a/desktop_env/server/requirements.txt
+++ b/desktop_env/server/requirements.txt
@@ -2,3 +2,4 @@ python3-xlib==0.15
 PyAutoGUI==0.9.54
 Pillow==10.1.0
 git+https://github.com/moses-palmer/pynput.git@refs/pull/541/head # to make sure that it works on Apple Silicon
+requests
--- a/main.py
+++ b/main.py
@@ -10,13 +10,15 @@ def human_agent():
        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        #  host="192.168.7.128",
        host="http://192.168.13.128:5000",
+        snapshot_path="base3",
    )

    # reset the environment to certain snapshot
-    # observation = env.reset()
+    observation = env.reset()
    done = False

-    while not done:
+
+    for i in range(2):
        # action = get_human_action()

        # action = {
@@ -24,7 +26,7 @@ def human_agent():
        #     "click_type": 3,
        # }

-        action = "pyautogui.dragTo(100, 200, button='left')"
+        action = "pyautogui.moveTo(10, 100)" if i == 0 else "pyautogui.click(button='right')"

        observation, reward, done, info = env.step(action)
        print("Observation:", observation)