diff --git a/README.md b/README.md index 6e87a68..60d8add 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ todo ### For users of the environment todo -## Road map (Proposed) +## Road map of infra (Proposed) - [x] Explore VMWare, and whether it can be connected and control through mouse package - [x] Explore Windows and MacOS, whether it can be installed @@ -20,3 +20,7 @@ todo - [x] Set up a pipeline and build agents implementation (zero-shot) for the task - [x] Start to design on which tasks inside the DesktopENv to focus on, start to wrap up the environment to be public - [x] Start to annotate the examples for ~~training~~ and testing +- [ ] Error handling during file passing and file opening, etc. +- [ ] Add accessibility tree from the OS into the observation space +- [ ] Add pre-process and post-process action support for benchmarking setup and evaluation +- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient \ No newline at end of file diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index a960c92..a0e9adc 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -201,7 +201,7 @@ class SetupController: def _launch_setup(self, command: List[str]): if not command: - raise Exception("Empty comman to launch.") + raise Exception("Empty command to launch.") payload = json.dumps({"command": command}) headers = {"Content-Type": "application/json"} diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index b78e4b5..613dd7b 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -223,7 +223,7 @@ class DesktopEnv(gym.Env): Evaluate whether the task is successfully completed. """ - self.setup_controller.setup(self.evaluator["postconfig"]) + self.setup_controller.setup(self.evaluator["postconfig"]) if "postconfig" in self.evaluator else None result_state = self.result_getter(self, self.evaluator["result"]) expected_state = self.expected_getter(self, self.evaluator["expected"]) if "expected" in self.evaluator \ diff --git a/desktop_env/evaluators/getters/vlc.py b/desktop_env/evaluators/getters/vlc.py index e00ce61..abdb085 100644 --- a/desktop_env/evaluators/getters/vlc.py +++ b/desktop_env/evaluators/getters/vlc.py @@ -1,6 +1,9 @@ +import logging import os from typing import Dict +logger = logging.getLogger("desktopenv.getters.vlc") + def get_vlc_playing_info(env, config: Dict[str, str]): """ @@ -13,7 +16,33 @@ def get_vlc_playing_info(env, config: Dict[str, str]): password = 'password' content = env.controller.get_vlc_status(host, port, password) - print("content: ", content) + with open(_path, "wb") as f: + f.write(content) + + return _path + + +def get_vlc_config(env, config: Dict[str, str]): + """ + Reads the VLC configuration file to check setting. + """ + _path = os.path.join(env.cache_dir, config["dest"]) + + os_type = env.controller.execute_python_command("import platform; print(platform.system())")['output'].strip() + + # fixme: depends on how we config and install the vlc in virtual machine, need to be aligned and double-checked + if os_type == "Linux": + config_path = \ + env.controller.execute_python_command("import os; print(os.path.expanduser('~/snap/vlc/common/vlcrc'))")[ + 'output'].strip() + elif os_type == "Darwin": + config_path = env.controller.execute_python_command( + "import os; print(os.path.expanduser('~/Library/Preferences/org.videolan.vlc/vlcrc'))")['output'].strip() + elif os_type == "Windows": + config_path = env.controller.execute_python_command( + "import os; print(os.path.expanduser('~\\AppData\\Roaming\\vlc\\vlcrc'))")['output'].strip() + + content = env.controller.get_file(config_path) with open(_path, "wb") as f: f.write(content) diff --git a/desktop_env/evaluators/metrics/README.md b/desktop_env/evaluators/metrics/README.md index 0f7b2aa..7ca036c 100644 --- a/desktop_env/evaluators/metrics/README.md +++ b/desktop_env/evaluators/metrics/README.md @@ -130,6 +130,12 @@ To enable and use the HTTP interface in VLC Media Player for remote control and - You will be prompted for a password. Enter the password you set in the Lua HTTP settings. - Once logged in, you will have access to VLC's HTTP interface for remote control. +#### Packages +```bash + +pip install opencv-python-headless Pillow imagehash +``` + #### Troubleshooting - If you cannot access the HTTP interface, check if your firewall or security software is blocking the connection. diff --git a/desktop_env/evaluators/metrics/vlc.py b/desktop_env/evaluators/metrics/vlc.py index ba6ad45..24eddb9 100644 --- a/desktop_env/evaluators/metrics/vlc.py +++ b/desktop_env/evaluators/metrics/vlc.py @@ -1,64 +1,142 @@ -import os -import platform -from xml.etree import ElementTree -import pygetwindow as gw -import pyautogui -from typing import Dict - import logging +import os +import subprocess +from typing import Dict +from xml.etree import ElementTree + +import acoustid +import cv2 +import imagehash +import pyautogui +import pygetwindow as gw # todo: change to the library that supports Linux +from PIL import Image + logger = logging.getLogger("desktopenv.metrics.vlc") -def get_vlc_config(setting_name): - """ - Reads the VLC configuration file to check for a specific setting. - # Example usage - setting_name = 'recordings_folder=' - setting = read_vlc_config(setting_name) - """ - # Common paths for VLC config file on different operating systems - paths = { - 'Windows': os.path.expanduser('~\\AppData\\Roaming\\vlc\\vlcrc'), - 'Darwin': os.path.expanduser('~/Library/Preferences/org.videolan.vlc/vlcrc'), - 'Linux': os.path.expanduser('~/.config/vlc/vlcrc') - } - - os_type = platform.system() - config_path = paths.get(os_type) - - if not config_path or not os.path.exists(config_path): - logger.warning("VLC config file not found for this operating system.") - return None - - try: - with open(config_path, 'r', encoding="utf-8") as file: - for line in file: - if line.startswith(setting_name): - return line.strip() - except IOError as e: - logger.error(f"Error reading config file: {e}") - - return None - - -def is_vlc_playing(actual: str, rule: Dict[str, str]) -> float: +def is_vlc_playing(actual_status_path: str, rule: Dict[str, str]) -> float: """ Checks if VLC is currently playing a file. """ - with open(actual, 'rb') as file: + with open(actual_status_path, 'rb') as file: actual_status = file.read().decode('utf-8') tree = ElementTree.fromstring(actual_status) status = tree.find('state').text if status == 'playing': - file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text - print("file_info: ", file_info) - if file_info: - return 1 if file_info.endswith(rule['expected']) else 0 + if rule['type'] == 'file_name': + file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text + if file_info: + return 1 if file_info.endswith(rule['file_name']) else 0 + elif rule['type'] == 'url': + file_info = tree.find('information/category[@name="meta"]/info[@name="url"]').text + if file_info: + return 1 if file_info.endswith(rule['url']) else 0 + else: + logger.error(f"Unknown type: {rule['type']}") + return 0 else: return 0 +def is_vlc_recordings_folder(actual_config_path: str, rule: Dict[str, str]) -> float: + """ + Checks if VLC's recording folder is set to the expected value. + """ + with open(actual_config_path, 'rb') as file: + config_file = file.read().decode('utf-8') + + expected_recording_file_path = rule['recording_file_path'] + + try: + for line in config_file: + # Skip comments and empty lines + if line.startswith('#') or not line.strip(): + continue + # Check if the line contains the recording path setting + if 'recorded_files_path' in line: + # Extract the value of the recording path and remove surrounding whitespace + current_path = line.split('=')[-1].strip() + # Compare with the Desktop path + if current_path == expected_recording_file_path: + return True + else: + return False + # The configuration key was not found in the file + return False + except FileNotFoundError: + logger.error("VLC configuration file not found.") + return False + except Exception as e: + logger.error(f"An error occurred: {e}") + return False + + +def are_audio_files_similar(mp3_file_path, mp4_file_path): + # Extract audio fingerprint from MP3 file + mp3_fingerprint, mp3_duration = acoustid.fingerprint_file(mp3_file_path) + + # Extract the audio stream from the MP4 file + mp4_audio_path = os.path.splitext(mp4_file_path)[0] + '_extracted.mp3' + try: + subprocess.run(["ffmpeg", "-i", mp4_file_path, "-vn", "-ar", "44100", "-ac", "2", "-ab", "192k", "-f", "mp3", + mp4_audio_path], check=True) + except subprocess.CalledProcessError as e: + print(f"An error occurred during audio extraction from MP4: {e}") + return False + + # Extract audio fingerprint from the extracted audio + mp4_fingerprint, mp4_duration = acoustid.fingerprint_file(mp4_audio_path) + + # Clean up temporary extracted audio file + os.remove(mp4_audio_path) + + # Compare fingerprints (rudimentary comparison) + if mp3_duration >= mp4_duration and mp3_fingerprint == mp4_fingerprint: + return True + + return False + + +def compare_videos(video_path1, video_path2, max_frames_to_check=100, threshold=5): + # Open both video files + cap1 = cv2.VideoCapture(video_path1) + cap2 = cv2.VideoCapture(video_path2) + + frames_checked = 0 + mismatch_count = 0 + + while frames_checked < max_frames_to_check: + # Read frames from both videos + ret1, frame1 = cap1.read() + ret2, frame2 = cap2.read() + + # If a video ends, then check if both ended to confirm they are of the same length + if not ret1 or not ret2: + return ret1 == ret2 + + # Convert frames to PIL Images + frame1 = Image.fromarray(cv2.cvtColor(frame1, cv2.COLOR_BGR2RGB)) + frame2 = Image.fromarray(cv2.cvtColor(frame2, cv2.COLOR_BGR2RGB)) + + # Compute the perceptual hash for each frame + hash1 = imagehash.phash(frame1) + hash2 = imagehash.phash(frame2) + + # Increment the frames checked + frames_checked += 1 + + # Compute the difference in the hashes + if hash1 - hash2 > threshold: + mismatch_count += 1 + # If there's a significant difference, the frames are not the same + if mismatch_count > threshold: + return False + + # If we reach here, the content appears to be the same + return True + + def is_vlc_fullscreen(): """ Checks if the VLC window is in full-screen mode. diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index 26fc2a4..a9bcbc7 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -1,15 +1,15 @@ import os -from pathlib import Path import platform import subprocess -import requests -from .pyxcursor import Xcursor -# import Xlib.display +from pathlib import Path +from typing import List + import pyautogui -# from PIL import ImageGrab, Image +import requests from PIL import Image from flask import Flask, request, jsonify, send_file -from typing import List + +from pyxcursor import Xcursor app = Flask(__name__) @@ -18,6 +18,7 @@ pyautogui.DARWIN_CATCH_UP_TIME = 0 logger = app.logger + @app.route('/setup/execute', methods=['POST']) @app.route('/execute', methods=['POST']) def execute_command(): @@ -40,6 +41,7 @@ def execute_command(): 'message': str(e) }), 500 + @app.route('/setup/launch', methods=["POST"]) def launch_app(): data = request.json @@ -49,11 +51,7 @@ def launch_app(): subprocess.Popen(command) return "{:} launched successfully".format(" ".join(command)) except Exception as e: - return jsonify( { "status": "error" - , "message": str(e) - } - )\ - , 500 + return jsonify({"status": "error", "message": str(e)}), 500 @app.route('/screenshot', methods=['GET']) @@ -116,6 +114,7 @@ def get_file(): # If the file is not found, return a 404 error return jsonify({"error": "File not found"}), 404 + @app.route("/setup/upload", methods=["POST"]) def upload_file(): # Retrieve filename from the POST request @@ -127,6 +126,7 @@ def upload_file(): else: return jsonify({"error": "file_path and file_data are required"}), 400 + @app.route('/platform', methods=['GET']) def get_platform(): return platform.system() diff --git a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json index 3499410..b65ddfb 100644 --- a/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json +++ b/evaluation_examples/examples/vlc/59f21cfb-0120-4326-b255-a5b827b38967.json @@ -31,7 +31,8 @@ "expected": { "type": "rule", "rules": { - "file_path": "Desktop/Rick Astley - Never Gonna Give You Up (Official Music Video).mp4" + "type": "file_name", + "file_name": "Rick Astley - Never Gonna Give You Up (Official Music Video).mp4" } }, "result": { diff --git a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json index 5dd4cb6..d59d628 100644 --- a/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json +++ b/evaluation_examples/examples/vlc/8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89.json @@ -1,12 +1,31 @@ { "id": "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", "snapshot": "base_setup", - "instruction": "Help me modify the folder used to store my recordings to LOCAL_PATH.", + "instruction": "Help me modify the folder used to store my recordings to Desktop", "source": "https://docs.videolan.me/vlc-user/desktop/3.0/en/basic/recording/playing.html#choose-your-recordings-folder", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": "vlc" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vlc" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "is_vlc_recordings_folder", + "expected": { + "type": "rule", + "rules": { + "recording_file_path": "/home/user/Desktop" + } + }, + "result": { + "type": "vlc_config", + "dest": "vlcrc" + } + } } diff --git a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json index fe2facb..db15765 100644 --- a/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json +++ b/evaluation_examples/examples/vlc/8f080098-ddb1-424c-b438-4e96e5e4786e.json @@ -3,10 +3,40 @@ "snapshot": "base_setup", "instruction": "Could you help me extract MP3 Audio to AUDIO_PATH from Video at VIDEO_PATH using VLC Media Player?", "source": "https://medium.com/@jetscribe_ai/how-to-extract-mp3-audio-from-videos-using-vlc-media-player-beeef644ebfb", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=19jBiz8sb0M7KHHATO9qeTPr17aWm4me-&export=download&authuser=0&confirm=t&uuid=7a2261f4-3905-433f-b53f-a52dd0845651&at=APZUnTU1nmXSa1ObrA5NHYt8t1-p:1704710908141", + "path": "Baby Justin Bieber.mp4" + } + ] + } + }, + { + "type": "launch", + "parameters": { + "command": "vlc" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vlc" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "is_vlc_recordings_folder", + "expected": { + "type": "rule", + "rules": { + "recording_file_path": "/home/user/Desktop" + } + }, + "result": { + "type": "vlc_config", + "dest": "vlcrc" + } + } } diff --git a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json index 597be2e..d5c48ec 100644 --- a/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json +++ b/evaluation_examples/examples/vlc/aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6.json @@ -3,7 +3,25 @@ "snapshot": "base_setup", "instruction": "This video is upside down, help me rotate it", "source": "https://www.dedoimedo.com/computers/vlc-rotate-videos.html", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1CLBjjsjGmHlbDg1lDcxfdE0F0C7-A5gZ&export=download&authuser=0&confirm=t&uuid=dde635fc-e223-4cd3-8065-899396e68d0a&at=APZUnTWQHdWYLLxlofuOIuhE2qiS:1704722380621", + "path": "flipped_1984_Apple_Macintosh_Commercial.mp4" + } + ] + } + }, + { + "type": "launch", + "parameters": { + "command": "vlc" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vlc" diff --git a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json index 14f5e94..670aa5a 100644 --- a/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json +++ b/evaluation_examples/examples/vlc/bba3381f-b5eb-4439-bd9e-80c22218d5a7.json @@ -1,12 +1,32 @@ { "id": "bba3381f-b5eb-4439-bd9e-80c22218d5a7", "snapshot": "base_setup", - "instruction": "Could you help me play the online video at URL?(network stream)", + "instruction": "Help me play the online video at https://www.youtube.com/watch?v=pgBsyTKAwLw", "source": "https://www.quora.com/How-do-I-play-online-videos-using-the-VLC-media-player", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": "vlc" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vlc" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "is_vlc_playing", + "expected": { + "type": "rule", + "rules": { + "type": "url", + "url": "https://www.youtube.com/watch?v=pgBsyTKAwLw" + } + }, + "result": { + "type": "vlc_playing_info", + "dest": "status.xml" + } + } } diff --git a/mm_agents/gui_som/data_preparation/scrape_crawler.py b/mm_agents/gui_som/data_preparation/scrape_crawler.py index 4ba93c5..1ea43ea 100644 --- a/mm_agents/gui_som/data_preparation/scrape_crawler.py +++ b/mm_agents/gui_som/data_preparation/scrape_crawler.py @@ -17,7 +17,7 @@ with open('get_tag_elem_dict.js', 'r') as f: def scrape_data(website_url, action_depth=10): # if file exists, skip if os.path.exists(os.path.join('collected_data', website_url.split("//")[1])): - print("Data already exists, skipping...") + # print("Data already exists, skipping...") return def click_random_link(page): @@ -100,6 +100,7 @@ def scrape_data(website_url, action_depth=10): def run_one(url): try: scrape_data("https://" + url, action_depth=5) + scrape_data("http://" + url, action_depth=5) except Exception as e: print("Error scraping data:", e) print("Start next one...") @@ -107,6 +108,7 @@ def run_one(url): def main(): urls = read_csv("majestic_million.csv")[:20000] + random.shuffle(urls) # Number of processes num_processes = 50 # Adjust based on your system's capability, on my i9-13900k, 50 processes can be used