From 24169a65d0b1dfb96ea4fa18ef2a15471c205432 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Mon, 15 Jan 2024 13:49:48 +0800 Subject: [PATCH] Accomplish the exp scripts v1; Add video recording and trajectory recording of desktop agent; Fix minor bugs --- desktop_env/controllers/python.py | 26 ++++++++++++ desktop_env/controllers/setup.py | 4 -- desktop_env/evaluators/metrics/vlc.py | 6 +++ desktop_env/server/README.md | 5 +++ desktop_env/server/main.py | 49 +++++++++++++++++++++- experiment.py | 58 ++++++++++++++++++++------- 6 files changed, 127 insertions(+), 21 deletions(-) diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 228ead9..e466467 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -243,6 +243,32 @@ class PythonController: else: raise Exception(f"Unknown action type: {action_type}") + # Record video + def start_recording(self): + """ + Starts recording the screen. + """ + response = requests.post(self.http_server + "/start_recording") + if response.status_code == 200: + logger.info("Recording started successfully") + else: + logger.error("Failed to start recording. Status code: %d", response.status_code) + + def end_recording(self, dest: str): + """ + Ends recording the screen. + """ + response = requests.post(self.http_server + "/end_recording") + if response.status_code == 200: + logger.info("Recording stopped successfully") + with open(dest, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + else: + logger.error("Failed to stop recording. Status code: %d", response.status_code) + return None + # Additional info def get_vm_platform(self): """ diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index bc0f391..64b8a34 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -209,10 +209,6 @@ class SetupController: if not command: raise Exception("Empty command to launch.") - if isinstance(command, str) and len(command.split()) > 1: - logger.warning("Command should be a list of strings. Now it is a string. Will split it by space.") - command = command.split() - payload = json.dumps({"command": command}) headers = {"Content-Type": "application/json"} diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py index dd62968..ff3bbfa 100644 --- a/desktop_env/evaluators/metrics/vlc.py +++ b/desktop_env/evaluators/metrics/vlc.py @@ -86,6 +86,9 @@ def compare_images(image1_path, image2_path): # score = compare_images('path_to_image1', 'path_to_image2') # print("Similarity score:", score) + if not image1_path or not image2_path: + return 0 + # Open the images and convert to grayscale image1 = Image.open(image1_path).convert('L') image2 = Image.open(image2_path).convert('L') @@ -119,6 +122,9 @@ def compare_audios(audio_path_1, audio_path_2, max_distance=1000): # print(f'Similarity Score: {similarity}') # Convert to common format if necessary and load audio + if not audio_path_1 or not audio_path_2: + return 0 + y1, sr1 = librosa.load(audio_path_1) y2, sr2 = librosa.load(audio_path_2) diff --git a/desktop_env/server/README.md b/desktop_env/server/README.md index 479ab82..1eacf5f 100644 --- a/desktop_env/server/README.md +++ b/desktop_env/server/README.md @@ -78,3 +78,8 @@ Activating the window manager control requires the installation of `wmctrl`: ```bash sudo apt install wmctrl ``` + +To enable recording in the virtual machine, you need to install `ffmpeg`: +```bash +sudo apt install ffmpeg +``` diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index 55ebd74..5bea588 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -1,6 +1,7 @@ import ctypes import os import platform +import shlex import subprocess from pathlib import Path from typing import Any, Optional @@ -13,7 +14,7 @@ import pyautogui import requests from PIL import Image from Xlib import display, X -from flask import Flask, request, jsonify, send_file, abort +from flask import Flask, request, jsonify, send_file, abort, send_from_directory from lxml.etree import _Element from pyatspi import Accessible, StateType from pyatspi import Action as ATAction @@ -29,7 +30,8 @@ pyautogui.PAUSE = 0 pyautogui.DARWIN_CATCH_UP_TIME = 0 logger = app.logger - +recording_process = None # fixme: this is a temporary solution for recording, need to be changed to support multiple-process +recording_path = "/tmp/recording.mp4" @app.route('/setup/execute', methods=['POST']) @app.route('/execute', methods=['POST']) @@ -39,6 +41,9 @@ def execute_command(): shell = data.get('shell', False) command = data.get('command', "" if shell else []) + if isinstance(command, str): + command = shlex.split(command) + # Execute the command without any safety checks. try: result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True) @@ -60,6 +65,9 @@ def launch_app(): data = request.json command: List[str] = data.get("command", []) + if isinstance(command, str): + command = shlex.split(command) + try: subprocess.Popen(command) return "{:} launched successfully".format(" ".join(command)) @@ -604,5 +612,42 @@ def activate_window(): return "File opened successfully", 200 +@app.route('/start_recording', methods=['POST']) +def start_recording(): + global recording_process + if recording_process: + return jsonify({'status': 'error', 'message': 'Recording is already in progress.'}), 400 + + d = display.Display() + screen_width = d.screen().width_in_pixels + screen_height = d.screen().height_in_pixels + + start_command = f"ffmpeg -y -f x11grab -draw_mouse 1 -s {screen_width}x{screen_height} -i :0.0 -c:v libx264 -r 30 {recording_path}" + + recording_process = subprocess.Popen(shlex.split(start_command), stdout=subprocess.PIPE, stderr=subprocess.PIPE) + + return jsonify({'status': 'success', 'message': 'Started recording.'}) + + +@app.route('/end_recording', methods=['POST']) +def end_recording(): + global recording_process + + if not recording_process: + return jsonify({'status': 'error', 'message': 'No recording in progress to stop.'}), 400 + + recording_process.terminate() + recording_process.wait() + return_code = recording_process.returncode + output, error = recording_process.communicate() + recording_process = None + + # return recording video file + if os.path.exists(recording_path): + return send_file(recording_path, as_attachment=True) + else: + return abort(404, description="Recording failed") + + if __name__ == '__main__': app.run(debug=True, host="0.0.0.0") diff --git a/experiment.py b/experiment.py index 1674f02..48bb1dc 100644 --- a/experiment.py +++ b/experiment.py @@ -44,7 +44,7 @@ logger = logging.getLogger("desktopenv.experiment") PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" -def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_trajectory"): +def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True): trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") env = DesktopEnv( path_to_vm=PATH_TO_VM, @@ -57,25 +57,53 @@ def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_tr done = False step_num = 0 - # todo: save the screenshots and actions to a folder + if recording: + # send a request to the server to start recording + env.controller.start_recording() + while not done and step_num < max_steps: actions = agent.predict(observation) for action in actions: + step_num += 1 + + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + observation, reward, done, info = env.step(action) observation['instruction'] = example['instruction'] - step_num += 1 - logger.info("Step %d", step_num) - logger.info("Action: %s", actions) - observation.pop("accessibility_tree") - logger.info("Observation: %s", observation) - logger.info("Reward: %.2f", reward) - logger.info("Info: %s", info) - logger.info("================================\n") + # Logging + logger.info("Step %d: %s", step_num, action) + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) - if done: - logger.info("The episode is done.") - break + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) result = env.evaluate() logger.info("Result: %.2f", result) @@ -91,7 +119,7 @@ if __name__ == "__main__": with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "chrome_setup" + example["snapshot"] = "exp_setup" api_key = os.environ.get("OPENAI_API_KEY") agent = GPT4v_Agent(api_key=api_key, action_space=action_space) @@ -101,4 +129,4 @@ if __name__ == "__main__": example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 20, example_trajectory_dir) + run_one_example(example, agent, 2, example_trajectory_dir)