From 24169a65d0b1dfb96ea4fa18ef2a15471c205432 Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Mon, 15 Jan 2024 13:49:48 +0800
Subject: [PATCH] Accomplish the exp scripts v1; Add video recording and
 trajectory recording of desktop agent; Fix minor bugs

---
 desktop_env/controllers/python.py     | 26 ++++++++++++
 desktop_env/controllers/setup.py      |  4 --
 desktop_env/evaluators/metrics/vlc.py |  6 +++
 desktop_env/server/README.md          |  5 +++
 desktop_env/server/main.py            | 49 +++++++++++++++++++++-
 experiment.py                         | 58 ++++++++++++++++++++-------
 6 files changed, 127 insertions(+), 21 deletions(-)

diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 228ead9..e466467 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -243,6 +243,32 @@ class PythonController:
         else:
             raise Exception(f"Unknown action type: {action_type}")
 
+    # Record video
+    def start_recording(self):
+        """
+        Starts recording the screen.
+        """
+        response = requests.post(self.http_server + "/start_recording")
+        if response.status_code == 200:
+            logger.info("Recording started successfully")
+        else:
+            logger.error("Failed to start recording. Status code: %d", response.status_code)
+
+    def end_recording(self, dest: str):
+        """
+        Ends recording the screen.
+        """
+        response = requests.post(self.http_server + "/end_recording")
+        if response.status_code == 200:
+            logger.info("Recording stopped successfully")
+            with open(dest, 'wb') as f:
+                for chunk in response.iter_content(chunk_size=8192):
+                    if chunk:
+                        f.write(chunk)
+        else:
+            logger.error("Failed to stop recording. Status code: %d", response.status_code)
+            return None
+
     # Additional info
     def get_vm_platform(self):
         """
diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py
index bc0f391..64b8a34 100644
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -209,10 +209,6 @@ class SetupController:
         if not command:
             raise Exception("Empty command to launch.")
 
-        if isinstance(command, str) and len(command.split()) > 1:
-            logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.")
-            command = command.split()
-
         payload = json.dumps({"command": command})
         headers = {"Content-Type": "application/json"}
 
diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py
index dd62968..ff3bbfa 100644
--- a/desktop_env/evaluators/metrics/vlc.py
+++ b/desktop_env/evaluators/metrics/vlc.py
@@ -86,6 +86,9 @@ def compare_images(image1_path, image2_path):
     # score = compare_images('path_to_image1', 'path_to_image2')
     # print("Similarity score:", score)
 
+    if not image1_path or not image2_path:
+        return 0
+
     # Open the images and convert to grayscale
     image1 = Image.open(image1_path).convert('L')
     image2 = Image.open(image2_path).convert('L')
@@ -119,6 +122,9 @@ def compare_audios(audio_path_1, audio_path_2, max_distance=1000):
     # print(f'Similarity Score: {similarity}')
 
     # Convert to common format if necessary and load audio
+    if not audio_path_1 or not audio_path_2:
+        return 0
+
     y1, sr1 = librosa.load(audio_path_1)
     y2, sr2 = librosa.load(audio_path_2)
 
diff --git a/desktop_env/server/README.md b/desktop_env/server/README.md
index 479ab82..1eacf5f 100644
--- a/desktop_env/server/README.md
+++ b/desktop_env/server/README.md
@@ -78,3 +78,8 @@ Activating the window manager control requires the installation of `wmctrl`:
 ```bash
 sudo apt install wmctrl
 ```
+
+To enable recording in the virtual machine, you need to install `ffmpeg`:
+```bash
+sudo apt install ffmpeg
+```
diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py
index 55ebd74..5bea588 100644
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -1,6 +1,7 @@
 import ctypes
 import os
 import platform
+import shlex
 import subprocess
 from pathlib import Path
 from typing import Any, Optional
@@ -13,7 +14,7 @@ import pyautogui
 import requests
 from PIL import Image
 from Xlib import display, X
-from flask import Flask, request, jsonify, send_file, abort
+from flask import Flask, request, jsonify, send_file, abort, send_from_directory
 from lxml.etree import _Element
 from pyatspi import Accessible, StateType
 from pyatspi import Action as ATAction
@@ -29,7 +30,8 @@ pyautogui.PAUSE = 0
 pyautogui.DARWIN_CATCH_UP_TIME = 0
 
 logger = app.logger
-
+recording_process = None  # fixme: this is a temporary solution for recording, need to be changed to support multiple-process
+recording_path = "/tmp/recording.mp4"
 
 @app.route('/setup/execute', methods=['POST'])
 @app.route('/execute', methods=['POST'])
@@ -39,6 +41,9 @@ def execute_command():
     shell = data.get('shell', False)
     command = data.get('command', "" if shell else [])
 
+    if isinstance(command, str):
+        command = shlex.split(command)
+
     # Execute the command without any safety checks.
     try:
         result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True)
@@ -60,6 +65,9 @@ def launch_app():
     data = request.json
     command: List[str] = data.get("command", [])
 
+    if isinstance(command, str):
+        command = shlex.split(command)
+
     try:
         subprocess.Popen(command)
         return "{:} launched successfully".format(" ".join(command))
@@ -604,5 +612,42 @@ def activate_window():
     return "File opened successfully", 200
 
 
+@app.route('/start_recording', methods=['POST'])
+def start_recording():
+    global recording_process
+    if recording_process:
+        return jsonify({'status': 'error', 'message': 'Recording is already in progress.'}), 400
+
+    d = display.Display()
+    screen_width = d.screen().width_in_pixels
+    screen_height = d.screen().height_in_pixels
+
+    start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}"
+
+    recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+
+    return jsonify({'status': 'success', 'message': 'Started recording.'})
+
+
+@app.route('/end_recording', methods=['POST'])
+def end_recording():
+    global recording_process
+
+    if not recording_process:
+        return jsonify({'status': 'error', 'message': 'No recording in progress to stop.'}), 400
+
+    recording_process.terminate()
+    recording_process.wait()
+    return_code = recording_process.returncode
+    output, error = recording_process.communicate()
+    recording_process = None
+
+    # return recording video file
+    if os.path.exists(recording_path):
+        return send_file(recording_path, as_attachment=True)
+    else:
+        return abort(404, description="Recording failed")
+
+
 if __name__ == '__main__':
     app.run(debug=True, host="0.0.0.0")
diff --git a/experiment.py b/experiment.py
index 1674f02..48bb1dc 100644
--- a/experiment.py
+++ b/experiment.py
@@ -44,7 +44,7 @@ logger = logging.getLogger("desktopenv.experiment")
 PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
 
 
-def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_trajectory"):
+def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True):
     trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
     env = DesktopEnv(
         path_to_vm=PATH_TO_VM,
@@ -57,25 +57,53 @@ def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_tr
     done = False
     step_num = 0
 
-    # todo: save the screenshots and actions to a folder
+    if recording:
+        # send a request to the server to start recording
+        env.controller.start_recording()
+
     while not done and step_num < max_steps:
         actions = agent.predict(observation)
         for action in actions:
+            step_num += 1
+
+            # Capture the timestamp before executing the action
+            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+
             observation, reward, done, info = env.step(action)
             observation['instruction'] = example['instruction']
-        step_num += 1
-        logger.info("Step %d", step_num)
-        logger.info("Action: %s", actions)
-        observation.pop("accessibility_tree")
-        logger.info("Observation: %s", observation)
-        logger.info("Reward: %.2f", reward)
-        logger.info("Info: %s", info)
 
-        logger.info("================================\n")
+            # Logging
+            logger.info("Step %d: %s", step_num, action)
+            logger.info("Reward: %.2f", reward)
+            logger.info("Done: %s", done)
+            logger.info("Info: %s", info)
 
-        if done:
-            logger.info("The episode is done.")
-            break
+
+            # Save screenshot and trajectory information
+            with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
+                with open(observation['screenshot'], "rb") as __f:
+                    screenshot = __f.read()
+                _f.write(screenshot)
+
+            with open(trajectory_recording_path, "a") as f:
+                f.write(json.dumps({
+                    "step_num": step_num,
+                    "action_timestamp": action_timestamp,
+                    "action": action,
+                    "reward": reward,
+                    "done": done,
+                    "info": info,
+                    "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
+                }))
+                f.write("\n")
+
+            if done:
+                logger.info("The episode is done.")
+                break
+
+    if recording:
+        # send a request to the server to stop recording
+        env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
 
     result = env.evaluate()
     logger.info("Result: %.2f", result)
@@ -91,7 +119,7 @@ if __name__ == "__main__":
 
     with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
         example = json.load(f)
-    example["snapshot"] = "chrome_setup"
+    example["snapshot"] = "exp_setup"
 
     api_key = os.environ.get("OPENAI_API_KEY")
     agent = GPT4v_Agent(api_key=api_key, action_space=action_space)
@@ -101,4 +129,4 @@ if __name__ == "__main__":
     example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
     os.makedirs(example_trajectory_dir, exist_ok=True)
 
-    run_one_example(example, agent, 20, example_trajectory_dir)
+    run_one_example(example, agent, 2, example_trajectory_dir)