From 487fb8005b8af873e844993d2a13024b8e7ef6fc Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Sat, 2 Dec 2023 22:14:50 +0800
Subject: [PATCH] Improve: fix bugs; add back the cursor in screenshot; add
 pause in env.step

---
 desktop_env/controllers/python.py   |  2 +-
 desktop_env/envs/desktop_env.py     |  5 ++++-
 desktop_env/server/main.py          | 26 +++++++++++++++++++++++++-
 desktop_env/server/requirements.txt |  1 +
 main.py                             |  8 +++++---
 5 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 39d1196..f2021c0 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -4,7 +4,7 @@ import requests
 
 
 class PythonController:
-    def __init__(self, http_server: str, pkgs_prefix: str = "py -c \"import pyautogui; {command}\""):
+    def __init__(self, http_server: str, pkgs_prefix: str = "python -c \"import pyautogui; {command}\""):
         self.http_server = http_server
         self.pkgs_prefix = pkgs_prefix  # fixme: this is a hacky way to execute python commands. fix it and combine it with installation of packages
 
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 3fbb92f..16b73cf 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -85,10 +85,13 @@ class DesktopEnv(gym.Env):
         observation = self._get_obs()
         return observation
 
-    def step(self, action):
+    def step(self, action, pause=0.5):
         # todo: support both the action space of our-designed space and the executable code space in pyautogui
         # Our action space is the set of all possible python commands insides `pyautogui`
         self.controller.execute_python_command(action)
+
+        # todo: ideally, wait until the screen has finished rendering instead of sleeping for a fixed interval
+        time.sleep(pause)
         observation = self._get_obs()
         reward = 0  # todo: Define reward calculation for each example
         done = False  # todo: Define episode termination condition for each example
diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py
index 228a08d..f9e8dcd 100644
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -1,12 +1,14 @@
 import os
 import platform
 import subprocess
+import requests
 
 import Xlib.display
 import pyautogui
-from PIL import ImageGrab
+from PIL import ImageGrab, Image
 from flask import Flask, request, jsonify, send_file
 
+
 app = Flask(__name__)
 
 pyautogui.PAUSE = 0
@@ -43,7 +45,26 @@ def capture_screen_with_cursor():
     os.makedirs(os.path.dirname(file_path), exist_ok=True)
 
     if user_platform == "Windows":
+        def _download_image(url, path):
+            """Fetch `url` and write the response body to `path`."""
+            # Bound the wait so a stalled download cannot hang the endpoint.
+            response = requests.get(url, timeout=30)
+            # Fail loudly instead of caching an HTTP error page as the image.
+            response.raise_for_status()
+            with open(path, 'wb') as file:
+                file.write(response.content)
+
+        # Overlay a cursor sprite at the pointer position; the sprite is
+        # downloaded on first use and reused from disk afterwards.
+        cursor_path = os.path.join("screenshots", "cursor.png")
+        if not os.path.exists(cursor_path):
+            cursor_url = "https://vip.helloimg.com/images/2023/12/02/oQPzmt.png"
+            _download_image(cursor_url, cursor_path)
         screenshot = pyautogui.screenshot()
+        cursor_x, cursor_y = pyautogui.position()
+        # Force RGBA so the image is always usable as its own paste mask.
+        cursor = Image.open(cursor_path).convert("RGBA")
+        screenshot.paste(cursor, (cursor_x, cursor_y), cursor)
         screenshot.save(file_path)
     elif user_platform == "Linux":
         # Use xlib to prevent scrot dependency for Linux
@@ -60,5 +74,20 @@ def capture_screen_with_cursor():
     return send_file(file_path, mimetype='image/png')
 
 
+@app.route('/platform', methods=['GET'])
+def get_platform():
+    """Return the server's OS name as reported by platform.system()."""
+    return platform.system()
+
+
+@app.route('/cursor_position', methods=['GET'])
+def get_cursor_position():
+    """Return the current mouse pointer position as JSON: {"x": ..., "y": ...}."""
+    # Flask reads a bare (x, y) tuple as (body, status) and rejects an int
+    # body, so the coordinates are wrapped in a JSON object instead.
+    cursor_x, cursor_y = pyautogui.position()
+    return jsonify(x=cursor_x, y=cursor_y)
+
+
 if __name__ == '__main__':
     app.run(debug=True, host="0.0.0.0")
diff --git a/desktop_env/server/requirements.txt b/desktop_env/server/requirements.txt
index 32c0e96..f4cb1ab 100644
--- a/desktop_env/server/requirements.txt
+++ b/desktop_env/server/requirements.txt
@@ -2,3 +2,4 @@ python3-xlib==0.15
 PyAutoGUI==0.9.54
 Pillow==10.1.0
 git+https://github.com/moses-palmer/pynput.git@refs/pull/541/head # to make sure that it works on Apple Silicon
+requests
diff --git a/main.py b/main.py
index 57c1926..9ef3982 100644
--- a/main.py
+++ b/main.py
@@ -10,13 +10,15 @@ def human_agent():
         #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
         #  host="192.168.7.128",
         host="http://192.168.13.128:5000",
+        snapshot_path="base3",
     )
 
     # reset the environment to certain snapshot
-    # observation = env.reset()
+    observation = env.reset()
     done = False
 
-    while not done:
+
+    for i in range(2):
         # action = get_human_action()
 
         # action = {
@@ -24,7 +26,7 @@ def human_agent():
         #     "click_type": 3,
         # }
 
-        action = "pyautogui.dragTo(100, 200, button='left')"
+        action = "pyautogui.moveTo(10, 100)" if i == 0 else "pyautogui.click(button='right')"
 
         observation, reward, done, info = env.step(action)
         print("Observation:", observation)