diff --git a/README.md b/README.md index 60d8add..b7d56df 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ todo - [x] Set up a pipeline and build agents implementation (zero-shot) for the task - [x] Start to design on which tasks inside the DesktopENv to focus on, start to wrap up the environment to be public - [x] Start to annotate the examples for ~~training~~ and testing -- [ ] Error handling during file passing and file opening, etc. -- [ ] Add accessibility tree from the OS into the observation space +- [x] Error handling during file passing and file opening, etc. +- [x] Add accessibility tree from the OS into the observation space - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation - [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient \ No newline at end of file diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 081b76d..724fa40 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -197,8 +197,10 @@ class PythonController: if "text" not in parameters: raise Exception(f"Unknown parameters: {parameters}") # deal with special ' and \ characters - text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'") - self.execute_python_command(f"pyautogui.typewrite('{text}')") + # text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'") + # self.execute_python_command(f"pyautogui.typewrite('{text}')") + text = parameters["text"] + self.execute_python_command("pyautogui.typewrite({:})".format(repr(text))) elif action_type == "PRESS": if "key" not in parameters: @@ -237,6 +239,9 @@ class PythonController: keys_para_rep = "', '".join(keys) self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')") + elif action_type in ['WAIT', 'FAIL', 'DONE']: + pass + else: raise Exception(f"Unknown action type: {action_type}") @@ -280,3 +285,31 @@ class PythonController: else: logger.error("Failed to get wallpaper. Status code: %d", response.status_code) return None + + def get_vm_desktop_path(self): + """ + Gets the desktop path of the vm. + """ + response = requests.post(self.http_server + "/desktop_path") + if response.status_code == 200: + logger.info("Desktop path downloaded successfully") + return response.json()["desktop_path"] + else: + logger.error("Failed to get desktop path. Status code: %d", response.status_code) + return None + + def get_vm_directory_tree(self, path): + """ + Gets the directory tree of the vm. + """ + payload = json.dumps({"path": path}) + headers = { + 'Content-Type': 'application/json' + } + response = requests.post(self.http_server + "/list_directory", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Directory tree downloaded successfully") + return response.json()["directory_tree"] + else: + logger.error("Failed to get directory tree. Status code: %d", response.status_code) + return None \ No newline at end of file diff --git a/desktop_env/controllers/setup.py b/desktop_env/controllers/setup.py index cd42003..13d4ee8 100644 --- a/desktop_env/controllers/setup.py +++ b/desktop_env/controllers/setup.py @@ -1,18 +1,18 @@ import json -import time +import logging import os.path +import time import traceback import uuid - -from typing import Dict, List from typing import Any, Union, Optional +from typing import Dict, List import requests from playwright.sync_api import sync_playwright from requests_toolbelt.multipart.encoder import MultipartEncoder + from desktop_env.evaluators.metrics.utils import compare_urls -import logging logger = logging.getLogger("desktopenv.setup") @@ -20,6 +20,7 @@ class SetupController: def __init__(self, vm_ip: str, cache_dir: str): self.vm_ip: str = vm_ip self.http_server: str = f"http://{vm_ip}:5000" + self.http_server_setup_root: str = f"http://{vm_ip}:5000/setup" self.cache_dir: str = cache_dir def reset_cache_dir(self, cache_dir: str): @@ -57,31 +58,31 @@ class SetupController: # can add other setup steps # ZDY_COMMENT: merged with launch - #def _command_setup(self, command: str): - #""" - #Directly send a command into the virtual machine os for setting up. - #""" - #payload = json.dumps({"command": command}) - #headers = { - #'Content-Type': 'application/json' - #} - #timeout = 5 - #timout_whitelist = ["vlc"] -# - #try: -# - #response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout) - #if response.status_code == 200: - #print("Command executed successfully:", response.text) - #else: - #print("Failed to execute command. Status code:", response.status_code) - #except requests.exceptions.Timeout as e: - #if command in timout_whitelist: - #print("Command executed successfully:", command) - #else: - #print("An error occurred while trying to execute the command:", e) - #except requests.exceptions.RequestException as e: - #print("An error occurred while trying to execute the command:", e) + # def _command_setup(self, command: str): + # """ + # Directly send a command into the virtual machine os for setting up. + # """ + # payload = json.dumps({"command": command}) + # headers = { + # 'Content-Type': 'application/json' + # } + # timeout = 5 + # timout_whitelist = ["vlc"] + # + # try: + # + # response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout) + # if response.status_code == 200: + # print("Command executed successfully:", response.text) + # else: + # print("Failed to execute command. Status code:", response.status_code) + # except requests.exceptions.Timeout as e: + # if command in timout_whitelist: + # print("Command executed successfully:", command) + # else: + # print("An error occurred while trying to execute the command:", e) + # except requests.exceptions.RequestException as e: + # print("An error occurred while trying to execute the command:", e) def _download_setup(self, files: List[Dict[str, str]]): """ @@ -224,9 +225,14 @@ class SetupController: except requests.exceptions.RequestException as e: logger.error("An error occurred while trying to send the request: %s", e) - def _execute_setup( self, command: List[str] - , stdout: str = "", stderr: str = "" - , shell: bool = False, until: Optional[Dict[str, Any]] = None): + def _execute_setup( + self, + command: List[str], + stdout: str = "", + stderr: str = "", + shell: bool = False, + until: Optional[Dict[str, Any]] = None + ): if not command: raise Exception("Empty comman to launch.") @@ -248,10 +254,10 @@ class SetupController: if stderr: with open(os.path.join(self.cache_dir, stderr), "w") as f: f.write(results["error"]) - logger.info( "Command executed successfully: %s -> %s" - , " ".join(command) - , response.text - ) + logger.info("Command executed successfully: %s -> %s" + , " ".join(command) + , response.text + ) else: logger.error("Failed to launch application. Status code: %s", response.text) results = None @@ -263,13 +269,13 @@ class SetupController: results = None nb_failings += 1 - if len(until)==0: + if len(until) == 0: terminates = True elif results is not None: - terminates = "returncode" in until and results["returncode"]==until["returncode"]\ - or "stdout" in until and until["stdout"] in results["output"]\ - or "stderr" in until and until["stderr"] in results["error"] - terminates = terminates or nb_failings>=5 + terminates = "returncode" in until and results["returncode"] == until["returncode"] \ + or "stdout" in until and until["stdout"] in results["output"] \ + or "stderr" in until and until["stderr"] in results["error"] + terminates = terminates or nb_failings >= 5 if not terminates: time.sleep(0.3) @@ -292,6 +298,25 @@ class SetupController: # TODO raise NotImplementedError() + def _activate_window_setup(self, window_name: str): + if not window_name: + raise Exception(f"Setup Open - Invalid path ({window_name}).") + + payload = json.dumps({"window_name": window_name}) + headers = { + 'Content-Type': 'application/json' + } + + # send request to server to open file + try: + response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload) + if response.status_code == 200: + logger.info("Command executed successfully: %s", response.text) + else: + logger.error(f"Failed to activate window {window_name}. Status code: %s", response.text) + except requests.exceptions.RequestException as e: + logger.error("An error occurred while trying to send the request: %s", e) + # Chrome setup def _chrome_open_tabs_setup(self, urls_to_open: List[str]): host = self.vm_ip diff --git a/desktop_env/envs/actions.py b/desktop_env/envs/actions.py index e03ccf0..5e286c5 100644 --- a/desktop_env/envs/actions.py +++ b/desktop_env/envs/actions.py @@ -186,5 +186,18 @@ ACTION_SPACE = [ "optional": False, } } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", } ] diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index c7a2fee..e2ef08b 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -1,28 +1,30 @@ from __future__ import annotations +import logging import os import subprocess +import tempfile import time +from typing import Callable, Any, Optional # import uuid # import platform from typing import List, Dict -from typing import Callable, Any, Optional -import tempfile import gymnasium as gym -# import requests from desktop_env.controllers.python import PythonController from desktop_env.controllers.setup import SetupController # from desktop_env.evaluators import eval_funcs from desktop_env.evaluators import metrics, getters -import logging +# import requests + logger = logging.getLogger("desktopenv.env") Metric = Callable[[Any, Any], float] Getter = Callable[[gym.Env, Dict[str, Any]], Any] + def _execute_command(command: List[str]) -> None: if command[:4] == ["vmrun", "-T", "ws", "start"]: p = subprocess.Popen(command) @@ -84,8 +86,8 @@ class DesktopEnv(gym.Env): self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir) # Meta info of the VM, move to the reset() function - self.vm_platform: str = "" # self.controller.get_vm_platform() - self.vm_screen_size = None # self.controller.get_vm_screen_size() + self.vm_platform: str = "" # self.controller.get_vm_platform() + self.vm_screen_size = None # self.controller.get_vm_screen_size() # mode: human or machine assert action_space in ["computer_13", "pyautogui"] @@ -164,7 +166,7 @@ class DesktopEnv(gym.Env): self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None self.metric_options: Dict[str, Any] = self.evaluator.get("options", {}) - def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None): + def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]: logger.info("Resetting environment...") logger.info("Switching task...") @@ -202,11 +204,27 @@ class DesktopEnv(gym.Env): time.sleep(5) logger.info("Environment setup complete.") - observation = self._get_obs() + observation = {"screenshot": self._get_obs()} return observation def step(self, action, pause=0.5): self._step_no += 1 + self.action_history.append(action) + + reward = 0 # todo: Define reward calculation for each example + done = False # todo: Define episode termination condition for each example + info = {} + + # handle the special actions + if action in ['WAIT', 'FAIL', 'DONE']: + if action == 'WAIT': + time.sleep(pause) + elif action == 'FAIL': + done = True + info = {"fail": True} + elif action == 'DONE': + done = True + info = {"done": True} # fixme: add reminding logic here, decide if the action is valid for the current action_space if self.action_space == "computer_13": @@ -215,18 +233,14 @@ class DesktopEnv(gym.Env): elif self.action_space == "pyautogui": # the set of all possible python commands insides `pyautogui` self.controller.execute_python_command(action) - self.action_history.append(action) - # todo: maybe for the better here we need to add a logic to wait until the rendering is done - time.sleep(pause) observation = { "screenshot": self._get_obs(), + "accessibility_tree": self.controller.get_accessibility_tree(), "terminal": self.controller.get_terminal_output(), "instruction": self.instruction } - reward = 0 # todo: Define reward calculation for each example - done = False # todo: Define episode termination condition for each example - info = {} + return observation, reward, done, info def evaluate(self): diff --git a/desktop_env/evaluators/getters/__init__.py b/desktop_env/evaluators/getters/__init__.py index 40b1726..7e472f4 100644 --- a/desktop_env/evaluators/getters/__init__.py +++ b/desktop_env/evaluators/getters/__init__.py @@ -1,5 +1,9 @@ +from .chrome import get_default_search_engine, get_cookie_data, get_bookmarks, get_open_tabs_info, get_pdf_from_url, \ + get_shortcuts_on_desktop from .file import get_cloud_file, get_vm_file, get_cache_file +from .general import get_vm_command_line from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper from .misc import get_rule, get_accessibility_tree +from .replay import get_replay from .vlc import get_vlc_playing_info, get_vlc_config -from .chrome import get_default_search_engine, get_bookmarks, get_open_tabs_info +from .vscode import get_vscode_config diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index 62838d7..1b77016 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -46,6 +46,10 @@ def get_default_search_engine(env, config: Dict[str, str]): def get_cookie_data(env, config: Dict[str, str]): + """ + Get the cookies from the Chrome browser. + Assume the cookies are stored in the default location, not encrypted and not large in size. + """ os_type = env.vm_platform if os_type == 'Windows': chrome_cookie_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'), @@ -61,21 +65,23 @@ def get_cookie_data(env, config: Dict[str, str]): else: raise Exception('Unsupported operating system') - # todo: add a new controller function to connect the cookie database - ############# try: - conn = sqlite3.connect(chrome_cookie_file_path) + content = env.controller.get_file(chrome_cookie_file_path) + _path = os.path.join(env.cache_dir, config["dest"]) + + with open(_path, "wb") as f: + f.write(content) + + conn = sqlite3.connect(_path) cursor = conn.cursor() # Query to check for OpenAI cookies cursor.execute("SELECT * FROM cookies") cookies = cursor.fetchall() - return cookies except Exception as e: logger.error(f"Error: {e}") return None - ############# def get_bookmarks(env, config: Dict[str, str]): @@ -94,17 +100,12 @@ def get_bookmarks(env, config: Dict[str, str]): else: raise Exception('Unsupported operating system') - try: - content = env.controller.get_file(preference_file_path) - # make content json variable - data = json.load(content) - - bookmarks = data.get('roots', {}) - return bookmarks - - except Exception as e: - logger.error(f"Error: {e}") - return None + content = env.controller.get_file(preference_file_path) + if not content: + return [] + data = json.loads(content) + bookmarks = data.get('roots', {}) + return bookmarks # todo: move this to the main.py @@ -190,3 +191,83 @@ def get_active_tab_info(env, config: Dict[str, str]): browser.close() return active_tab_info + + +def get_pdf_from_url(env, config: Dict[str, str]) -> str: + """ + Download a PDF from a URL. + """ + _url = config["path"] + _path = os.path.join(env.cache_dir, config["dest"]) + + host = env.vm_ip + port = 9222 # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + + with sync_playwright() as p: + browser = p.chromium.connect_over_cdp(remote_debugging_url) + page = browser.new_page() + page.goto(_url) + page.pdf(path=_path) + browser.close() + + return _path + + +# fixme: needs to be changed (maybe through post-processing) since it's not working +def get_chrome_saved_address(env, config: Dict[str, str]): + # host = env.vm_ip + host = "192.168.13.130" + port = 9222 # fixme: this port is hard-coded, need to be changed from config file + + remote_debugging_url = f"http://{host}:{port}" + with sync_playwright() as p: + # connect to remote Chrome instance + browser = p.chromium.connect_over_cdp(remote_debugging_url) + + page = browser.new_page() + + # Navigate to Chrome's settings page for autofill + page.goto("chrome://settings/addresses") + + # Get the HTML content of the page + content = page.content() + + browser.close() + + return content + + +def get_shortcuts_on_desktop(env, config: Dict[str, str]): + # Find out the operating system + os_name = env.vm_platform + + # Depending on the OS, define the shortcut file extension + if os_name == 'Windows': + # Windows shortcuts are typically .url or .lnk files + shortcut_extension = '.lnk' + elif os_name == 'Darwin': + # macOS's shortcuts are .webloc files + shortcut_extension = '.webloc' + elif os_name == 'Linux': + # Linux (Ubuntu, etc.) shortcuts are typically .desktop files + shortcut_extension = '.desktop' + else: + logger.error(f"Unsupported operating system: {os_name}") + return [] + + # Get the path to the desktop folder + desktop_path = env.controller.get_vm_desktop_path() + desktop_directory_tree = env.controller.get_vm_directory_tree(desktop_path) + + shortcuts_paths = [file['name'] for file in desktop_directory_tree['children'] if + file['name'].endswith(shortcut_extension)] + + short_cuts = {} + + for shortcut_path in shortcuts_paths: + short_cuts[shortcut_path] = env.controller.get_file(env.controller.execute_python_command( + f"import os; print(os.path.join(os.path.expanduser('~'), 'Desktop', '{shortcut_path}'))")['output'].strip()).decode('utf-8') + + return short_cuts diff --git a/desktop_env/evaluators/getters/file.py b/desktop_env/evaluators/getters/file.py index 606fead..6714b0e 100644 --- a/desktop_env/evaluators/getters/file.py +++ b/desktop_env/evaluators/getters/file.py @@ -40,7 +40,7 @@ def get_vm_file(env, config: Dict[str, str]) -> Optional[str]: file = env.controller.get_file(config["path"]) if file is None: return None - #raise FileNotFoundError("File not found on VM: {:}".format(config["path"])) + # raise FileNotFoundError("File not found on VM: {:}".format(config["path"])) with open(_path, "wb") as f: f.write(file) diff --git a/desktop_env/evaluators/getters/general.py b/desktop_env/evaluators/getters/general.py index 22a20c7..e4e4c99 100644 --- a/desktop_env/evaluators/getters/general.py +++ b/desktop_env/evaluators/getters/general.py @@ -1,23 +1,19 @@ +import logging from typing import Dict - -import os import requests +logger = logging.getLogger("desktopenv.getters.general") -def get_string(env, config: Dict[str, str]) -> str: - """ - Config: - string (str) - """ - return config["string"] +def get_vm_command_line(env, config: Dict[str, str]): + vm_ip = env.vm_ip + port = 5000 + command = config["command"] -def get_command_line(env, config: Dict[str, str]) -> str: - """ - Config: - string (str) - """ - - f = os.popen(config["command"]) - - return f.read() \ No newline at end of file + response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command}) + + if response.status_code == 200: + return response.json()["output"] + else: + logger.error("Failed to get vm command line. Status code: %d", response.status_code) + return None diff --git a/desktop_env/evaluators/getters/misc.py b/desktop_env/evaluators/getters/misc.py index f4c7bf2..b6b933a 100644 --- a/desktop_env/evaluators/getters/misc.py +++ b/desktop_env/evaluators/getters/misc.py @@ -1,6 +1,5 @@ import logging from typing import TypeVar -#from typing import Dict, List logger = logging.getLogger("desktopenv.getters.misc") @@ -13,6 +12,7 @@ def get_rule(env, config: R) -> R: """ return config["rules"] + def get_accessibility_tree(env, *args) -> str: accessibility_tree: str = env.controller.get_accessibility_tree() logger.debug("AT@eval: %s", accessibility_tree) diff --git a/desktop_env/evaluators/getters/replay.py b/desktop_env/evaluators/getters/replay.py index e69de29..c850986 100644 --- a/desktop_env/evaluators/getters/replay.py +++ b/desktop_env/evaluators/getters/replay.py @@ -0,0 +1,20 @@ +from typing import List, Dict, Any + + +def get_replay(env, trajectory: List[Dict[str, Any]]) -> None: + # fixme: need to be combined with the accessibility tree to activate the selection of the target window + def parse(action): + if action["type"] == "hotkey": + keys = "', '".join(action["param"]) + return f"pyautogui.hotkey('{keys}')" + + if action["type"] == "typewrite": + text = action["param"] + return f"pyautogui.typewrite('{text}')" + + if action["type"] == "press": + key = action["param"] + return f"pyautogui.press('{key}')" + + for action in trajectory: + env.controller.execute_python_command(parse(action)) diff --git a/desktop_env/evaluators/getters/vscode.py b/desktop_env/evaluators/getters/vscode.py new file mode 100644 index 0000000..8a725ef --- /dev/null +++ b/desktop_env/evaluators/getters/vscode.py @@ -0,0 +1,34 @@ +import logging +from typing import Any, Dict + +from .file import get_vm_file +from .replay import get_replay + +logger = logging.getLogger("desktopenv.getters.vscode") + + +def get_vscode_config(env, config: Dict[str, Any]) -> str: + os_type = env.vm_platform + vscode_extension_command = config["vscode_extension_command"] + + # fixme: depends on how we config and install the vscode in virtual machine, need to be aligned and double-checked + + if os_type == "MacOS": + trajectory = [ + {"type": "hotkey", "param": ["command", "shift", "p"]}, + {"type": "typewrite", "param": vscode_extension_command}, + {"type": "press", "param": "enter"} + ] + else: + trajectory = [ + {"type": "hotkey", "param": ["ctrl", "shift", "p"]}, + {"type": "typewrite", "param": vscode_extension_command}, + {"type": "press", "param": "enter"} + ] + + get_replay(env, trajectory) + + return get_vm_file(env, { + "path": config["path"], + "dest": config["dest"] + }) diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 05ff6c2..1494872 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -1,4 +1,4 @@ -from .chrome import is_expected_tabs, is_expected_bookmarks +from .chrome import is_expected_tabs, is_expected_bookmarks, compare_pdfs, is_cookie_deleted, is_shortcut_on_desktop from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, \ compare_insert_equation @@ -13,4 +13,5 @@ from .vlc import is_vlc_playing, is_vlc_recordings_folder, is_vlc_fullscreen, co from .gimp import increase_saturation, decrease_brightness, check_file_exists, compare_triangle_positions from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3 from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter - +from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed +from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines diff --git a/desktop_env/evaluators/metrics/chrome.py b/desktop_env/evaluators/metrics/chrome.py index dc2bcdc..78afac9 100644 --- a/desktop_env/evaluators/metrics/chrome.py +++ b/desktop_env/evaluators/metrics/chrome.py @@ -1,5 +1,9 @@ import logging from typing import Any, Dict, List + +import fitz # PyMuPDF +import rapidfuzz.fuzz as fuzz + from desktop_env.evaluators.metrics.utils import are_lists_equal, compare_urls logger = logging.getLogger("desktopenv.metrics.chrome") @@ -22,18 +26,72 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f return 0 -def is_expected_bookmarks(bookmarks: List[Dict[str, Any]], rule: Dict[str, Any]) -> float: +def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float: """ Checks if the expected bookmarks are in Chrome. """ - - # todo - match_type = rule['type'] - - if match_type == "url": - expected_urls = rule['urls'] - actual_urls = [bookmark['url'] for bookmark in bookmarks] - return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0 + if not bookmarks: + return 0. + elif rule['type'] == "bookmark_bar_folders_names": + bookmark_bar_folders_names = [bookmark['name'] for bookmark in bookmarks['bookmark_bar']['children'] if + bookmark['type'] == 'folder'] + return 1. if set(bookmark_bar_folders_names) == set(rule['names']) else 0. + elif rule['type'] == "bookmark_bar_websites_urls": + bookmark_bar_websites_urls = [bookmark['url'] for bookmark in bookmarks['bookmark_bar']['children'] if + bookmark['type'] == 'url'] + return 1. if set(bookmark_bar_websites_urls) == set(rule['urls']) else 0. else: - logger.error(f"Unknown type: {match_type}") - return 0 + raise TypeError(f"{rule['type']} not support yet!") + + +def compare_pdfs(pdf1_path, pdf2_path): + """ + Compare two PDF files. + """ + + def extract_text_from_pdf(pdf_path): + """Extract text from each page of the PDF.""" + text = "" + with fitz.open(pdf_path) as pdf: + for page in pdf: + text += page.get_text() + return text.strip() + + text1 = extract_text_from_pdf(pdf1_path) + text2 = extract_text_from_pdf(pdf2_path) + + return fuzz.ratio(text1, text2) / 100 + + +def is_cookie_deleted(cookie_data, rule): + """ + Check if the cookie is deleted. + """ + + if rule['type'] == 'domains': + cookies_domains = [cookie[1] for cookie in cookie_data] + for domain in rule['domains']: + for cookies_domain in cookies_domains: + if compare_urls(domain, cookies_domain): + return 0. + return 1. + else: + raise TypeError(f"{rule['type']} not support yet!") + + +def is_shortcut_on_desktop(shortcuts: Dict[str, str], rule): + """ + Check if the shortcut is on the desktop. + """ + # fixme: if the name of the website changed in the future, this will not work; can be replaced with url + if rule['type'] == 'name': + for shortcut_path, shortcut_content in shortcuts.items(): + if "Name=" + rule['name'] + "\n" in shortcut_content: + return 1. + return 0. + elif rule['type'] == 'url': + raise TypeError(f"{rule['type']} not support yet!") + elif rule['type'] == 'id': + raise TypeError(f"{rule['type']} not support yet!") + else: + raise TypeError(f"{rule['type']} not support yet!") diff --git a/desktop_env/evaluators/metrics/docs.py b/desktop_env/evaluators/metrics/docs.py index 48d2c03..9f2d940 100644 --- a/desktop_env/evaluators/metrics/docs.py +++ b/desktop_env/evaluators/metrics/docs.py @@ -1,12 +1,14 @@ -import xml.etree.ElementTree as ET +import logging import os +import xml.etree.ElementTree as ET from typing import List, Dict, Any + from docx import Document from docx.enum.text import WD_PARAGRAPH_ALIGNMENT -import logging logger = logging.getLogger("desktopenv.metric.docs") + def find_default_font(config_file_path, rules): """Find the default font in LibreOffice Writer.""" default_font = None diff --git a/desktop_env/evaluators/metrics/impress.py b/desktop_env/evaluators/metrics/impress.py index 641e3e4..7268aef 100644 --- a/desktop_env/evaluators/metrics/impress.py +++ b/desktop_env/evaluators/metrics/impress.py @@ -1,4 +1,75 @@ from pptx import Presentation +import os + +def is_red_color(color): + #judge if the color is red + print(color.rgb) + return color and color.rgb == (255, 0, 0) + +def get_master_placeholder_color(prs): + # get the color of the placeholder + masters = prs.slide_masters + for idx, master in enumerate(masters): + for placeholder in master.placeholders: + if placeholder.has_text_frame and placeholder.text == "": + text_frame = placeholder.text_frame + + if text_frame.paragraphs: + first_paragraph = text_frame.paragraphs[0] + return first_paragraph.font.color + return None + + +def check_slide_numbers_color(pptx_file_path): + presentation = Presentation(pptx_file_path) + + for i, slide in enumerate(presentation.slides): + for shape in slide.shapes: + # check if the shape is a text box + if hasattr(shape, "text"): + if shape.text.isdigit(): + # "SlidePlaceholder" is the name of the placeholder in the master slide + page_number_text = shape.text + font_color = get_master_placeholder_color(presentation) + print(font_color) + return 1 if font_color is not None and is_red_color(font_color) else 0 + +def compare_pptx_files(file1_path, file2_path): + prs1 = Presentation(file1_path) + prs2 = Presentation(file2_path) + + # compare the number of slides + if len(prs1.slides) != len(prs2.slides): + return 0 + + # compare the content of each slide + for slide1, slide2 in zip(prs1.slides, prs2.slides): + # check if the shapes are the same + for shape1, shape2 in zip(slide1.shapes, slide2.shapes): + if hasattr(shape1, "text") and hasattr(shape2, "text"): + if shape1.text != shape2.text: + return 0 + return 1 + +def has_two_lines_on_page(slide): + line_count = 0 + for shape in slide.shapes: + if shape.shape_type == 1: # 1 表示 Line 形状 + line_count += 1 + if line_count >= 2: + return True + return False + +def check_for_two_lines(prs): + prs = Presentation(prs) + for i, slide in enumerate(prs.slides): + if has_two_lines_on_page(slide): + return 1 + return 0 + +def check_file_exists(directory, filename): + file_path = os.path.join(directory, filename) + return 1 if os.path.isfile(file_path) else 0 if __name__ == "__main__": path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx" diff --git a/desktop_env/evaluators/metrics/libreoffice.py b/desktop_env/evaluators/metrics/libreoffice.py index 4ca07de..441d932 100644 --- a/desktop_env/evaluators/metrics/libreoffice.py +++ b/desktop_env/evaluators/metrics/libreoffice.py @@ -1,37 +1,38 @@ -import lxml.cssselect -from lxml.etree import _Element as Element -import lxml.etree import fnmatch - from typing import Dict, List +import lxml.cssselect +import lxml.etree +from lxml.etree import _Element as Element + _libconf_namespaces = [("oor", "http://openoffice.org/2001/registry")] _libconf_ns_mapping = dict(_libconf_namespaces) -_setup_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value' - , namespaces=_libconf_ns_mapping - ) -_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooLocale]>value' - , namespaces=_libconf_ns_mapping - ) +_setup_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value', + namespaces=_libconf_ns_mapping) +_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooLocale]>value', + namespaces=_libconf_ns_mapping) + + def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float: config: Element = lxml.etree.parse(config_file).getroot() setup_locale_setting: List[Element] = _setup_locale_selector(config) locale_setting: List[Element] = _locale_selector(config) - setup_locale_setting: str = setup_locale_setting[0].text\ - if len(setup_locale_setting)>0\ - else locale_setting[0].text + setup_locale_setting: str = setup_locale_setting[0].text \ + if len(setup_locale_setting) > 0 \ + else locale_setting[0].text - return float( any( fnmatch.fnmatchcase(setup_locale_setting, ptn)\ - for ptn in rules["locale_set"] + return float(any(fnmatch.fnmatchcase(setup_locale_setting, ptn) \ + for ptn in rules["locale_set"] ) - ) + ) + if __name__ == "__main__": path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu" - print( check_libre_locale( path1, { "locale_set": [ "ru-*", "de-*", "fr-*" - , "pt-*", "es-*", "it-*" - ] - } + print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*" + , "pt-*", "es-*", "it-*" + ] + } ) - ) + ) diff --git a/desktop_env/evaluators/metrics/pdf.py b/desktop_env/evaluators/metrics/pdf.py index 51c79f3..d607733 100644 --- a/desktop_env/evaluators/metrics/pdf.py +++ b/desktop_env/evaluators/metrics/pdf.py @@ -1,13 +1,11 @@ -from pypdf import PdfReader import operator - -from typing import Dict from typing import Any +from typing import Dict + +from pypdf import PdfReader + def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float: reader = PdfReader(pdf_file) nb_pages: int = len(reader.pages) - return float( getattr(operator, rules["relation"])( nb_pages - , rules["ref_value"] - ) - ) + return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"])) diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index 25b55f3..03daef5 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -1,18 +1,19 @@ -import pandas as pd +import logging +import operator +from numbers import Number +from typing import Any, Union +from typing import Dict, List + import openpyxl +import pandas as pd from openpyxl import Workbook from openpyxl.worksheet.worksheet import Worksheet from .utils import load_charts, load_sparklines -import operator -from typing import Dict, List -from typing import Any, Union -from numbers import Number - -import logging logger = logging.getLogger("desktopenv.metric.table") + def compare_table(actual: str, expected: str, **options) -> float: """ Args: @@ -44,28 +45,28 @@ def compare_table(actual: str, expected: str, **options) -> float: workbook1: Workbook = openpyxl.load_workbook(actual) workbook2: Workbook = openpyxl.load_workbook(expected) - if ftr=="sparkline": + if ftr == "sparkline": sp1 = load_sparklines(actual) sp2 = load_sparklines(expected) new_metric: bool = sp1 == sp2 logger.debug("Sparkline Metric: {:}".format(new_metric)) - elif ftr=="chart": + elif ftr == "chart": charts1 = load_charts(workbook1, **options) charts2 = load_charts(workbook2, **options) new_metric: bool = charts1 == charts2 logger.debug("Chart Metric: {:}".format(new_metric)) - elif ftr=="number_format": - number_formats1: List[str] = [ c.number_format.lower()\ - for col in workbook1.active.iter_cols()\ - for c in col\ - if c.data_type=="n" - ] - number_formats2: List[str] = [ c.number_format.lower()\ - for col in workbook2.active.iter_cols()\ - for c in col\ - if c.data_type=="n" - ] - new_metric: bool = number_formats1==number_formats2 + elif ftr == "number_format": + number_formats1: List[str] = [c.number_format.lower() \ + for col in workbook1.active.iter_cols() \ + for c in col \ + if c.data_type == "n" + ] + number_formats2: List[str] = [c.number_format.lower() \ + for col in workbook2.active.iter_cols() \ + for c in col \ + if c.data_type == "n" + ] + new_metric: bool = number_formats1 == number_formats2 logger.debug("Number Format Metric: {:}".format(new_metric)) else: raise NotImplementedError("Unsupported xlsx feature: {:}".format(ftr)) @@ -73,6 +74,7 @@ def compare_table(actual: str, expected: str, **options) -> float: return float(metric) + def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float: if result is None: return 0. @@ -114,6 +116,7 @@ def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float: return float(passes) + def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float: if result is None: return 0. @@ -121,16 +124,18 @@ def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float: worksheet: Worksheet = openpyxl.load_workbook(filename=result).active return float(worksheet.freeze_panes == rules["position"]) + def check_xlsx_zoom(result: str, rules: Dict[str, Union[str, Number]]) -> float: if result is None: return 0. worksheet = openpyxl.load_workbook(filename=result).active zoom_scale: Number = worksheet.sheet_view.zoomScale or 100. - return float( getattr(operator, rules["relation"])( zoom_scale + return float(getattr(operator, rules["relation"])(zoom_scale , rules["ref_value"] ) - ) + ) + if __name__ == '__main__': # path1 = "" @@ -168,51 +173,51 @@ if __name__ == '__main__': # ] # print(check_sheet_list(path1, rule)) - #path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx" - #path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx" - #print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"])) + # path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx" + # path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx" + # print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"])) - #path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx" - #path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx" - #path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx" - #path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx" - #workbook1: Workbook = openpyxl.load_workbook(filename=path1) - #worksheet1: Worksheet = workbook1.active - #import itertools - #for col, r in itertools.product( ['A', 'B'] - #, range(1, 20) - #): - #position: str = "{:}{:d}".format(col, r) - #print(worksheet1[position]) - #print(worksheet1[position].value) - #print(worksheet1[position].number_format) - #workbook2: Workbook = openpyxl.load_workbook(filename=path2) - #worksheet2: Worksheet = workbook2.active - #for col, r in itertools.product( ['A', 'B'] - #, range(1, 20) - #): - #position: str = "{:}{:d}".format(col, r) - #print(worksheet2[position]) - #print(worksheet2[position].value) - #print(worksheet2[position].number_format) - #print(compare_table(path1, path2, features=["number_format"])) + # path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx" + # path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx" + # path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx" + # path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx" + # workbook1: Workbook = openpyxl.load_workbook(filename=path1) + # worksheet1: Worksheet = workbook1.active + # import itertools + # for col, r in itertools.product( ['A', 'B'] + # , range(1, 20) + # ): + # position: str = "{:}{:d}".format(col, r) + # print(worksheet1[position]) + # print(worksheet1[position].value) + # print(worksheet1[position].number_format) + # workbook2: Workbook = openpyxl.load_workbook(filename=path2) + # worksheet2: Worksheet = workbook2.active + # for col, r in itertools.product( ['A', 'B'] + # , range(1, 20) + # ): + # position: str = "{:}{:d}".format(col, r) + # print(worksheet2[position]) + # print(worksheet2[position].value) + # print(worksheet2[position].number_format) + # print(compare_table(path1, path2, features=["number_format"])) - #path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx" - #path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx" - #workbook1: Workbook = openpyxl.load_workbook(filename=path1) - #worksheet1: Worksheet = workbook1.active - #print(worksheet1.sheet_view.zoomScale) - #print(type(worksheet1.sheet_view.zoomScale)) -# - #import os - #import os.path - #for wb in filter( lambda f: f.endswith(".xlsx") - #, os.listdir("../../任务数据/LibreOffice Calc/") - #): - #path = os.path.join("../../任务数据/LibreOffice Calc/", wb) - #print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale) - #print(check_zoom(path1, {"relation": "lt", "ref_value": 100})) - #print(check_zoom(path2, {"relation": "lt", "ref_value": 100})) + # path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx" + # path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx" + # workbook1: Workbook = openpyxl.load_workbook(filename=path1) + # worksheet1: Worksheet = workbook1.active + # print(worksheet1.sheet_view.zoomScale) + # print(type(worksheet1.sheet_view.zoomScale)) + # + # import os + # import os.path + # for wb in filter( lambda f: f.endswith(".xlsx") + # , os.listdir("../../任务数据/LibreOffice Calc/") + # ): + # path = os.path.join("../../任务数据/LibreOffice Calc/", wb) + # print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale) + # print(check_zoom(path1, {"relation": "lt", "ref_value": 100})) + # print(check_zoom(path2, {"relation": "lt", "ref_value": 100})) path1 = "../../任务数据/LibreOffice Calc/Padding_Decimals_In_Formular_gold.xlsx" data_frame: pd.DataFrame = pd.read_excel(path1) diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py index 9efef3e..ac98d72 100644 --- a/desktop_env/evaluators/metrics/vscode.py +++ b/desktop_env/evaluators/metrics/vscode.py @@ -1,16 +1,18 @@ +from typing import Dict + + def compare_text_file(actual: str, expected: str, **options) -> float: """ Args: - actual (str): path to result xlsx - expected (str): path to gold xlsx - options (Dict[str, List[str]]): dict like - { - } + actual (str): path to result text file + expected (str): path to gold text file Return: float: the score """ - + if not actual: + return 0. + with open(actual) as f1: actual_text = f1.read() with open(expected) as f2: @@ -20,13 +22,46 @@ def compare_text_file(actual: str, expected: str, **options) -> float: return 1.0 return 0.0 -def compare_answer(actual: str, expected: str, **options) -> float: - if actual == expected: +def compare_config(actual: str, rules: Dict, **options) -> float: + if not actual: + return 0. + + with open(actual) as f1: + actual_text = f1.read() + + if actual_text == rules['expect']: return 1.0 - + return 0.0 + + +def compare_answer(actual: str, rules: Dict, **options) -> float: + """ + Args: + actual (str): result string + expected (str): gold string + + Return: + float: the score + """ + if not actual: + return 0. + + if actual == rules['expect']: + return 1.0 + # TODO: can use text embedding to get non-zero return return 0.0 -if __name__ == '__main__': - print(compare_text_file("README.md", "README.md")) \ No newline at end of file + +def is_extension_installed(actual: str, rules: Dict, **options): + if rules['type'] == 'contain': + if rules['expected'] in actual: + return 1.0 + return 0.0 + elif rules['type'] == 'not_contain': + if rules['expected'] not in actual: + return 1.0 + return 0.0 + else: + raise NotImplementedError diff --git a/desktop_env/server/README.md b/desktop_env/server/README.md index 571081a..479ab82 100644 --- a/desktop_env/server/README.md +++ b/desktop_env/server/README.md @@ -71,3 +71,10 @@ You can use accerciser to check the accessibility tree on GNOME VM. ```sh sudo apt install accerciser ``` + + +### Additional Installation +Activating the window manager control requires the installation of `wmctrl`: +```bash +sudo apt install wmctrl +``` diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index 9a93e65..67491c1 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -3,29 +3,26 @@ import os import platform import subprocess from pathlib import Path +from typing import Any, Optional +from typing import List, Dict +import Xlib import lxml.etree -from lxml.etree import _Element import pyatspi +import pyautogui +import requests +from PIL import Image +from Xlib import display, X +from flask import Flask, request, jsonify, send_file, abort +from lxml.etree import _Element from pyatspi import Accessible, StateType +from pyatspi import Action as ATAction from pyatspi import Component, Document from pyatspi import Text as ATText from pyatspi import Value as ATValue -from pyatspi import Action as ATAction -from typing import List, Dict -from typing import Any, Optional - -import Xlib -import pyautogui -from PIL import Image -from Xlib import display, X from pyxcursor import Xcursor -import requests -from flask import Flask, request, jsonify, send_file, abort -from werkzeug.utils import secure_filename - app = Flask(__name__) pyautogui.PAUSE = 0 @@ -141,22 +138,24 @@ def get_terminal_output(): xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]' terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map) output = terminals[0].text.rstrip() if len(terminals) == 1 else None - else: # windows and macos platform is not implemented currently + else: # windows and macos platform is not implemented currently raise NotImplementedError return jsonify({"output": output, "status": "success"}) except: return jsonify({"output": None, "status": "error"}) -_accessibility_ns_map = { "st": "uri:deskat:state.at-spi.gnome.org" - , "attr": "uri:deskat:attributes.at-spi.gnome.org" - , "cp": "uri:deskat:component.at-spi.gnome.org" - , "doc": "uri:deskat:document.at-spi.gnome.org" - , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org" - , "txt": "uri:deskat:text.at-spi.gnome.org" - , "val": "uri:deskat:value.at-spi.gnome.org" - , "act": "uri:deskat:action.at-spi.gnome.org" - } +_accessibility_ns_map = {"st": "uri:deskat:state.at-spi.gnome.org" + , "attr": "uri:deskat:attributes.at-spi.gnome.org" + , "cp": "uri:deskat:component.at-spi.gnome.org" + , "doc": "uri:deskat:document.at-spi.gnome.org" + , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org" + , "txt": "uri:deskat:text.at-spi.gnome.org" + , "val": "uri:deskat:value.at-spi.gnome.org" + , "act": "uri:deskat:action.at-spi.gnome.org" + } + + def _create_node(node: Accessible) -> _Element: attribute_dict: Dict[str, Any] = {"name": node.name} @@ -164,11 +163,11 @@ def _create_node(node: Accessible) -> _Element: states: List[StateType] = node.getState().get_states() for st in states: state_name: str = StateType._enum_lookup[st] - attribute_dict[ "{{{:}}}{:}"\ - .format( _accessibility_ns_map["st"] - , state_name.split("_", maxsplit=1)[1].lower() - ) - ] = "true" + attribute_dict["{{{:}}}{:}" \ + .format(_accessibility_ns_map["st"] + , state_name.split("_", maxsplit=1)[1].lower() + ) + ] = "true" # }}} States # # Attributes {{{ # @@ -177,11 +176,11 @@ def _create_node(node: Accessible) -> _Element: attribute_name: str attribute_value: str attribute_name, attribute_value = attrbt.split(":", maxsplit=1) - attribute_dict[ "{{{:}}}{:}"\ - .format( _accessibility_ns_map["attr"] - , attribute_name - ) - ] = attribute_value + attribute_dict["{{{:}}}{:}" \ + .format(_accessibility_ns_map["attr"] + , attribute_name + ) + ] = attribute_value # }}} Attributes # # Component {{{ # @@ -190,9 +189,12 @@ def _create_node(node: Accessible) -> _Element: except NotImplementedError: pass else: - attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_SCREEN)) - attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_WINDOW)) - attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_PARENT)) + attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str( + component.getPosition(pyatspi.XY_SCREEN)) + attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str( + component.getPosition(pyatspi.XY_WINDOW)) + attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str( + component.getPosition(pyatspi.XY_PARENT)) attribute_dict["{{{:}}}size".format(_accessibility_ns_map["cp"])] = str(component.getSize()) # }}} Component # @@ -209,11 +211,11 @@ def _create_node(node: Accessible) -> _Element: attribute_name: str attribute_value: str attribute_name, attribute_value = attrbt.split(":", maxsplit=1) - attribute_dict[ "{{{:}}}{:}"\ - .format( _accessibility_ns_map["docattr"] - , attribute_name - ) - ] = attribute_value + attribute_dict["{{{:}}}{:}" \ + .format(_accessibility_ns_map["docattr"] + , attribute_name + ) + ] = attribute_value # }}} Document # # Text {{{ # @@ -223,13 +225,13 @@ def _create_node(node: Accessible) -> _Element: pass else: # only text shown on current screen is available - #attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount) + # attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount) text: str = text_obj.getText(0, text_obj.characterCount) # }}} Text # # Selection {{{ # try: - node.querySelection() + node.querySelection() except NotImplementedError: pass else: @@ -256,34 +258,36 @@ def _create_node(node: Accessible) -> _Element: else: for i in range(action.nActions): action_name: str = action.getName(i).replace(" ", "-") - attribute_dict[ "{{{:}}}{:}_desc"\ - .format( _accessibility_ns_map["act"] - , action_name - ) - ] = action.getDescription(i) - attribute_dict[ "{{{:}}}{:}_kb"\ - .format( _accessibility_ns_map["act"] - , action_name - ) - ] = action.getKeyBinding(i) + attribute_dict["{{{:}}}{:}_desc" \ + .format(_accessibility_ns_map["act"] + , action_name + ) + ] = action.getDescription(i) + attribute_dict["{{{:}}}{:}_kb" \ + .format(_accessibility_ns_map["act"] + , action_name + ) + ] = action.getKeyBinding(i) # }}} Action # - xml_node = lxml.etree.Element( node.getRoleName().replace(" ", "-") - , attrib=attribute_dict - , nsmap=_accessibility_ns_map - ) - if "text" in locals() and len(text)>0: + xml_node = lxml.etree.Element(node.getRoleName().replace(" ", "-") + , attrib=attribute_dict + , nsmap=_accessibility_ns_map + ) + if "text" in locals() and len(text) > 0: xml_node.text = text for ch in node: xml_node.append(_create_node(ch)) return xml_node + @app.route("/accessibility", methods=["GET"]) def get_accessibility_tree(): desktop: Accessible = pyatspi.Registry.getDesktop(0) desktop_xml: _Element = _create_node(desktop) return jsonify({"AT": lxml.etree.tostring(desktop_xml, encoding="unicode")}) + @app.route('/screen_size', methods=['POST']) def get_screen_size(): d = display.Display() @@ -563,5 +567,43 @@ def open_file(): return f"Failed to open {path}. Error: {e}", 500 +@app.route("/setup/activate_window", methods=['POST']) +def activate_window(): + data = request.json + window_name = data.get('window_name', None) + + os_name = platform.system() + + if os_name == 'Windows': + import pygetwindow as gw + try: + # Find the VS Code window + vscode_window = gw.getWindowsWithTitle(window_name)[0] + # Activate the window, bringing it to the front + vscode_window.activate() + except IndexError: + return "VS Code window not found.", 404 + + elif os_name == 'Darwin': + import pygetwindow as gw + try: + # Find the VS Code window + vscode_window = gw.getWindowsWithTitle(window_name)[0] + # Un-minimize the window and then bring it to the front + vscode_window.unminimize() + vscode_window.activate() + except IndexError: + return "VS Code window not found.", 404 + + elif os_name == 'Linux': + # Attempt to activate VS Code window using wmctrl + subprocess.Popen(["wmctrl", "-a", window_name]) + + else: + return f"Operating system {os_name} not supported.", 400 + + return "File opened successfully", 200 + + if __name__ == '__main__': app.run(debug=True, host="0.0.0.0") diff --git a/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json b/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json index 21c9654..ef3fe12 100644 --- a/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json +++ b/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json @@ -36,7 +36,8 @@ "expected": { "type": "rule", "rules": { - + "type": "bookmark_bar_folders_names", + "names": ["Favorites"] } } } diff --git a/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json b/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json index 4d064b5..71542bf 100644 --- a/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json +++ b/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json @@ -3,16 +3,50 @@ "snapshot": "chrome", "instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?", "source": "https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://www.mathsisfun.com/games/2048.html" + ] + } + } + ], "trajectory": "trajectories/", "related_apps": [ "chrome" ], "evaluator": { - "func": "", + "func": "is_shortcut_on_desktop", "result": { + "type": "shortcuts_on_desktop" }, "expected": { + "type": "rule", + "rules": { + "type": "name", + "name": "Play Puzzle Game 2048" + } } } } diff --git a/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json b/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json index 2f1d7f4..f5ef7f2 100644 --- a/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json +++ b/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json @@ -1,18 +1,53 @@ { "id": "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", "snapshot": "chrome", - "instruction": "Can you save this webpage I'm looking at to my bookmarks so I can come back to it later?", + "instruction": "Can you save this webpage I'm looking at to bookmarks bar so I can come back to it later?", "source": "https://www.youtube.com/watch?v=ZaZ8GcTxjXA", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://blog.eleuther.ai/rotary-embeddings/", + "https://jalammar.github.io/illustrated-transformer/" + ] + } + } + ], "trajectory": "trajectories/", "related_apps": [ "chrome" ], "evaluator": { - "func": "", + "func": "is_expected_bookmarks", "result": { + "type": "bookmarks" }, "expected": { + "type": "rule", + "rules": { + "type": "bookmark_bar_websites_urls", + "urls": ["https://jalammar.github.io/illustrated-transformer/"] + } } } } diff --git a/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json b/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json index b914773..94ed902 100644 --- a/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json +++ b/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json @@ -1,18 +1,54 @@ { "id": "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", "snapshot": "chrome", - "instruction": "Can you help me clean up my computer by getting rid of all the tracking things that websites like Amazon or eBay might have saved? I want to make sure my browsing is private and those sites don't remember me.", + "instruction": "Can you help me clean up my computer by getting rid of all the tracking things that Amazon might have saved? I want to make sure my browsing is private and those sites don't remember me.", "source": "https://support.google.com/chrome/answer/95647?hl=en&ref_topic=7438325&sjid=16867045591165135686-AP#zippy=%2Cdelete-cookies-from-a-site", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://www.amazon.com", + "https://www.amazon.com/s?k=huggingface+transformers+book" + ] + } + } + ], "trajectory": "trajectories/", "related_apps": [ "chrome" ], "evaluator": { - "func": "", + "func": "is_cookie_deleted", "result": { + "type": "cookie_data", + "dest": "Cookies" }, "expected": { + "type": "rule", + "rules": { + "type": "domains", + "domains": [".amazon.com"] + } } } } diff --git a/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json b/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json index 2484394..2b2fd37 100644 --- a/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json +++ b/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json @@ -3,16 +3,50 @@ "snapshot": "chrome", "instruction": "Computer, can you turn the webpage I'm looking at into a PDF file and put it on my main screen, you know, the Desktop?", "source": "https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] + } + }, + { + "type": "launch", + "parameters": { + "command": [ + "socat", + "tcp-listen:9222,fork", + "tcp:localhost:1337" + ] + } + }, + { + "type": "chrome_open_tabs", + "parameters": { + "urls_to_open": [ + "https://lilianweng.github.io/posts/2023-06-23-agent/" + ] + } + } + ], "trajectory": "trajectories/", "related_apps": [ "chrome" ], "evaluator": { - "func": "", + "func": "compare_pdfs", "result": { + "type": "vm_file", + "path": "Desktop/LLM Powered Autonomous Agents _ Lil'Log.pdf", + "dest": "LLM Powered Autonomous Agents _ Lil'Log.pdf" }, "expected": { + "type": "pdf_from_url", + "path": "https://lilianweng.github.io/posts/2023-06-23-agent/", + "dest": "LLM Powered Autonomous Agents _ Lil'Log_gold.pdf" } } } diff --git a/evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4 b/evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4.json similarity index 100% rename from evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4 rename to evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4.json diff --git a/evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce b/evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce.json similarity index 100% rename from evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce rename to evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce.json diff --git a/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json b/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json index bb933e3..0a5ebb9 100644 --- a/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json +++ b/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json @@ -1,12 +1,34 @@ { "id": "455d3c66-7dc6-4537-a39a-36d3e9119df7", "snapshot": "libreoffice_impress", - "instruction": "Could you help me export impress file to image jpg file?", + "instruction": "Could you help me export an Impress file to a .jpg image file and save it as res.jpg on the Desktop? ", "source": "https://stackoverflow.com/questions/75626383/how-export-libreoffice-impress-to-image", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=12MxMjw28_t1nTLihlDpToKebjsSDsjwx&export=download&authuser=0&confirm=t&uuid=1ccc1da0-d7c7-494f-a0e3-59eb55f54e3f&at=APZUnTXvNIRMsF2cjZuFxmQzByhC:1705253210291", + "path": "Desktop/wssf-project-plan-on-a-page.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/wssf-project-plan-on-a-page.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_file_exists", + "file_name": "res.png", + "directory": "/home/user/Desktop/" + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json index 12be0ca..9897ead 100644 --- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json +++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json @@ -1,12 +1,37 @@ { "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54", "snapshot": "libreoffice_impress", - "instruction": "Could you help me add a strike-through on this text", + "instruction": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?", "source": "https://superuser.com/questions/1211035/libreoffice-impress-animations-how-to-strikethrough-on-click?rq=1", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1fw0baVZ15s0r1WGEBftgED2H0ljZgYtu&export=download&authuser=0&confirm=t&uuid=df03788a-81ef-4e55-b33a-2fba7ab28cb8&at=APZUnTXPb-sm88KNwmNeugbhPrzx:17052529805399", + "path": "Desktop/New_Club_Spring_2018_Training.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/New_Club_Spring_2018_Training.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_for_two_lines", + "result": { + "type": "vm_file", + "path": "Desktop/New_Club_Spring_2018_Training.pptx", + "dest": "New_Club_Spring_2018_Training.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json index afc59bc..6c1f0f0 100644 --- a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json +++ b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json @@ -1,12 +1,42 @@ { "id": "5d901039-a89c-4bfb-967b-bf66f4df075e", "snapshot": "libreoffice_impress", - "instruction": "Help me stretch the image to fill the entire page, keeping its proportion and centering the image", + "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.", "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=945b6f33-53d2-4e87-ada9-efa8b938a499&at=APZUnTVw4fKyJPW0vAAJURruAJIP:1705250184439", + "path": "Desktop/CPD_Background_Investigation_Process.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/CPD_Background_Investigation_Process.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1rsvFPyHYiIPh1c8Nj8say0NJCG2VIDr7&export=download&authuser=0&confirm=t&uuid=aac08a92-6595-47d8-84dc-8f1ab1df987f&at=APZUnTXIWCn5B0CpLttvG2bsr_a7:1705250423565", + "dest": "CPD_Background_Investigation_Process_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/CPD_Background_Investigation_Process.pptx", + "dest": "CPD_Background_Investigation_Process.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json b/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json index c61b8bd..2c7a72f 100644 --- a/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json +++ b/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json @@ -1,7 +1,7 @@ { "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", "snapshot": "vscode", - "instruction": "Could you help me find and replace \"text\" with \"test\" in this file?", + "instruction": "Please change all the places that say \"text\" to \"test\" in this document for me.", "source": "https://www.quora.com/How-do-you-find-and-replace-text-in-Visual-Studio-Code", "config": [ { @@ -16,9 +16,15 @@ } }, { - "type": "open", + "type": "launch", "parameters": { - "path": "Desktop/vscode_replace_text.txt" + "command": ["code", "Desktop/vscode_replace_text.txt"] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" } } ], diff --git a/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json b/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json index 893eada..11fb7e5 100644 --- a/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json +++ b/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json @@ -1,13 +1,50 @@ { "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb", "snapshot": "vscode", - "instruction": "Could you help me open the project at /home/user/project?", + "instruction": "I'd like the \"project\" in the \"user\" folder under \"home\" to be opened with VS Code, please.", "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ", "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, { "type": "command", "parameters": { - "command": ["mkdir", "-p", "/home/user/project"] + "command": [ + "mkdir", + "-p", + "/home/user/project/.vscode" + ] + } + }, + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1akdsiRVdq6CUtT-FX8Dpf8ruPTq6DcFn&export=download&authuser=0&confirm=t&uuid=ce2fa96a-454e-43d9-bbe3-98553b7eed0d&at=APZUnTVw_YQ1URTvP34vrmKcw0b4:1705222451052", + "path": "/home/user/project/main.py" + }, + { + "url": "https://drive.usercontent.google.com/download?id=1BkwtqtAzv_K2CrTbJZ0HbMHBffzdD9vc&export=download&authuser=0&confirm=t&uuid=28f77090-deef-49a1-b156-91317881e75e&at=APZUnTXuaR6i_3t3Prslk535GaO5:1705222457290", + "path": "/home/user/project/README.md" + }, + { + "url": "https://drive.usercontent.google.com/download?id=1ea_zF2tbcXOB8w9neBV-U5xI2nnPzIw_&export=download&authuser=0&confirm=t&uuid=9cf8c5bb-a880-475c-b80b-967a0c4fbea4&at=APZUnTUdjIj80F3Mbgi72eZDTZLO:1705222462443", + "path": "/home/user/project/.vscode/settings.json" + } + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" } } ], @@ -15,5 +52,27 @@ "related_apps": [ "vscode" ], - "evaluator": "evaluation_dir" + "evaluator": { + "postconfig": [ + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], + "func": "compare_config", + "expected": { + "type": "rule", + "rules": { + "expect": "project" + } + }, + "result": { + "type": "vscode_config", + "vscode_extension_command": "OpenProject", + "path": "OpenProject.txt", + "dest": "OpenProject.txt" + } + } } diff --git a/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json b/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json index fc05cce..956138d 100644 --- a/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json +++ b/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json @@ -1,12 +1,50 @@ { "id": "59ed65c7-e9a6-43db-833f-76d6730c0004", "snapshot": "vscode", - "instruction": "Could you help me start debugging with the breakpoint at line 15?", + "instruction": "Could you help me start debugging with the breakpoint at line 100?", "source": "https://www.youtube.com/watch?v=7qZBwhSlfOo", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1eLlB7UqRjh55vm0SIxb96aU1WbbK3H3T&export=download&authuser=0&confirm=t&uuid=379d1cbf-cca1-454a-a5a6-c5389024f728&at=APZUnTWn4vJZhfvrdfYZ6byVfaSj:1705159150342", + "path": "Desktop/main.py" + } + ] + } + }, + { + "type": "launch", + "parameters": { + "command": ["code", "Desktop/main.py"] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], "trajectory": "trajectories/", "related_apps": [ "vscode" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "compare_config", + "expected": { + "type": "rule", + "rules": { + "expect": "100" + } + }, + "result": { + "type": "vscode_config", + "vscode_extension_command": "GetBreakPoint", + "path": "GetBreakPoint.txt", + "dest": "GetBreakPoint.txt" + } + } } diff --git a/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json b/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json index 50fc5e7..8c0748d 100644 --- a/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json +++ b/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json @@ -3,10 +3,39 @@ "snapshot": "vscode", "instruction": "Could you help me change the color theme to Dark?", "source": "https://www.youtube.com/watch?v=ORrELERGIHs", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], "trajectory": "trajectories/982d12a5-beab-424f-8d38-d2a48429e511", "related_apps": [ "vscode" ], - "evaluator": "evaluation_dir" + "evaluator": { + "func": "compare_config", + "expected": { + "type": "rule", + "rules": { + "expect": "2" + } + }, + "result": { + "type": "vscode_config", + "vscode_extension_command": "GetColorTheme", + "path": "GetColorTheme.txt", + "dest": "GetColorTheme.txt" + } + } } diff --git a/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json b/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json index bf63054..7fb741d 100644 --- a/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json +++ b/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json @@ -3,20 +3,44 @@ "snapshot": "vscode", "instruction": "Help me install the extension Python.", "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ", - "config": [], + "config": [ + { + "type": "launch", + "parameters": { + "command": [ + "code" + ] + } + }, + { + "type": "activate_window", + "parameters": { + "window_name": "Visual Studio Code" + } + } + ], "trajectory": "trajectories/eabc805a-bfcf-4460-b250-ac92135819f6", "related_apps": [ "vscode" ], "evaluator": { - "func": "compare_answer", - "expected": { - "type": "string", - "string": "ms-python.python\n" - }, + "func": "is_extension_installed", "result": { - "type": "command_line", - "command": "code --list-extensions | grep ms-python.python" + "type": "vm_command_line", + "command": [ + "code", + "--list-extensions", + "|", + "grep", + "ms-python.python" + ] + }, + "expected": { + "type": "rule", + "rules": { + "type": "contain", + "expected": "ms-python.python" + } } } } diff --git a/experiment.py b/experiment.py new file mode 100644 index 0000000..1674f02 --- /dev/null +++ b/experiment.py @@ -0,0 +1,104 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4v_agent import GPT4v_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_trajectory"): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + observation['instruction'] = example['instruction'] + done = False + step_num = 0 + + # todo: save the screenshots and actions to a folder + while not done and step_num < max_steps: + actions = agent.predict(observation) + for action in actions: + observation, reward, done, info = env.step(action) + observation['instruction'] = example['instruction'] + step_num += 1 + logger.info("Step %d", step_num) + logger.info("Action: %s", actions) + observation.pop("accessibility_tree") + logger.info("Observation: %s", observation) + logger.info("Reward: %.2f", reward) + logger.info("Info: %s", info) + + logger.info("================================\n") + + if done: + logger.info("The episode is done.") + break + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "vlc" + example_id = "8f080098-ddb1-424c-b438-4e96e5e4786e" + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "chrome_setup" + + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4v_Agent(api_key=api_key, action_space=action_space) + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 20, example_trajectory_dir) diff --git a/main.py b/main.py index 6ef728c..17155de 100644 --- a/main.py +++ b/main.py @@ -1,10 +1,10 @@ +import datetime import json -from desktop_env.envs.desktop_env import DesktopEnv - import logging import os import sys -import datetime + +from desktop_env.envs.desktop_env import DesktopEnv # Logger Configs {{{ # logger = logging.getLogger() @@ -12,17 +12,18 @@ logger.setLevel(logging.DEBUG) datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str))) -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str))) +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str))) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") file_handler.setLevel(logging.INFO) debug_handler.setLevel(logging.DEBUG) stdout_handler.setLevel(logging.INFO) sdebug_handler.setLevel(logging.DEBUG) -formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") file_handler.setFormatter(formatter) debug_handler.setFormatter(formatter) stdout_handler.setFormatter(formatter) @@ -39,6 +40,7 @@ logger.addHandler(sdebug_handler) logger = logging.getLogger("desktopenv.main") + def human_agent(): """ Runs the Gym environment with human input. @@ -76,7 +78,8 @@ def human_agent(): # } logger.info(trajectory[i]) - observation, reward, done, info = env.step(trajectory[i], pause=5) + observation, reward, done, info = env.step(trajectory[i]) + observation.pop("accessibility_tree") logger.info("Observation: %s", observation) logger.info("Reward: %.2f", reward) logger.info("Info: %s", info) @@ -87,12 +90,14 @@ def human_agent(): logger.info("The episode is done.") break + #input("PAUSING") + result = env.evaluate() logger.info("Result: %.2f", result) #input("PAUSING") - #env.close() + # env.close() logger.info("Environment closed.") diff --git a/mm_agents/gemini_agent.py b/mm_agents/gemini_agent.py new file mode 100644 index 0000000..37e22f2 --- /dev/null +++ b/mm_agents/gemini_agent.py @@ -0,0 +1,84 @@ +from typing import Dict + +import PIL.Image +import google.generativeai as genai + +from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string +from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE + + +class GeminiPro_Agent: + def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"): + genai.configure(api_key) + self.model = genai.GenerativeModel(model) + self.max_tokens = max_tokens + self.action_space = action_space + + self.trajectory = [ + { + "role": "system", + "parts": [ + { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + ] + } + ] + + def predict(self, obs: Dict): + """ + Predict the next action(s) based on the current observation. + """ + img = PIL.Image.open(obs["screenshot"]) + self.trajectory.append({ + "role": "user", + "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format( + obs["instruction"]), img] + }) + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["parts"][0]) + if len(self.trajectory[i]["parts"]) > 1: + traj_to_show.append("screenshot_obs") + + print("Trajectory:", traj_to_show) + + response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens) + + try: + # fixme: change to fit the new response format from gemini pro + actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + except: + # todo: add error handling + print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) + actions = None + + return actions + + def parse_actions(self, response: str): + # response example + """ + ```json + { + "action_type": "CLICK", + "click_type": "RIGHT" + } + ``` + """ + + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "parts": [response] + }) + + return actions diff --git a/mm_agents/gemini_test.py b/mm_agents/gemini_test.py deleted file mode 100644 index 28ecc8a..0000000 --- a/mm_agents/gemini_test.py +++ /dev/null @@ -1,19 +0,0 @@ -import PIL.Image -import google.generativeai as genai - -genai.configure(api_key="AIzaSyANsETKHVo-D8jZu1SnTSaQgLOJEDgnj9Q") - -# for m in genai.list_models(): -# if 'generateContent' in m.supported_generation_methods: -# print(m.name) - -model = genai.GenerativeModel('gemini-pro-vision') - -img = PIL.Image.open('image.jpg') - -messages = [ - {'role':'user', - 'parts': ["Explain this image.", img]} -] - -response = model.generate_content(messages) \ No newline at end of file diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index d0288e1..203b40c 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,12 +1,12 @@ -# fixme: Need to be rewrite on new action space - -import os -import re import base64 -from desktop_env.envs.desktop_env import Action, MouseClick import json +import re +from typing import Dict + import requests -from mm_agents.gpt_4v_prompt import SYS_PROMPT + +from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE # Function to encode the image @@ -47,11 +47,26 @@ def parse_actions_from_string(input_string): raise ValueError("Invalid response format: " + input_string) +def parse_code_from_string(input_string): + # This regular expression will match both ```code``` and ```python code``` + # and capture the `code` part. It uses a non-greedy match for the content inside. + pattern = r"```(?:\w+\s+)?(.*?)```" + # Find all non-overlapping matches in the string + matches = re.findall(pattern, input_string, re.DOTALL) + + # The regex above captures the content inside the triple backticks. + # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, + # so the code inside backticks can span multiple lines. + + # matches now contains all the captured code snippets + return matches + + class GPT4v_Agent: - def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300): - self.instruction = instruction + def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): self.model = model self.max_tokens = max_tokens + self.action_space = action_space self.headers = { "Content-Type": "application/json", @@ -64,20 +79,27 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": SYS_PROMPT + "text": { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] }, ] } ] - def predict(self, obs): - base64_image = encode_image(obs) + def predict(self, obs: Dict): + """ + Predict the next action(s) based on the current observation. + """ + base64_image = encode_image(obs["screenshot"]) self.trajectory.append({ "role": "user", "content": [ { "type": "text", - "text": "What's the next step for instruction '{}'?".format(self.instruction) + "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format( + obs["instruction"]) }, { "type": "image_url", @@ -87,12 +109,15 @@ class GPT4v_Agent: } ] }) + traj_to_show = [] for i in range(len(self.trajectory)): traj_to_show.append(self.trajectory[i]["content"][0]["text"]) if len(self.trajectory[i]["content"]) > 1: traj_to_show.append("screenshot_obs") + print("Trajectory:", traj_to_show) + payload = { "model": self.model, "messages": self.trajectory, @@ -103,6 +128,7 @@ class GPT4v_Agent: try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) except: + # todo: add error handling print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) actions = None @@ -120,7 +146,10 @@ class GPT4v_Agent: """ # parse from the response - actions = parse_actions_from_string(response) + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) # add action into the trajectory self.trajectory.append({ @@ -133,34 +162,4 @@ class GPT4v_Agent: ] }) - # parse action - parsed_actions = [] - for action in actions: - parsed_action = {} - action_type = Action[action['action_type']].value - parsed_action["action_type"] = action_type - - if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value: - parsed_action["click_type"] = MouseClick[action['click_type']].value - - if action_type == Action.MOUSE_MOVE.value: - parsed_action["x"] = action["x"] - parsed_action["y"] = action["y"] - - if action_type == Action.KEY.value: - parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys - - if action_type == Action.TYPE.value: - parsed_action["text"] = action["text"] - - parsed_actions.append(parsed_action) - - return parsed_actions - - -if __name__ == '__main__': - # OpenAI API Key - api_key = os.environ.get("OPENAI_API_KEY") - - agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet") - print(agent.predict(obs="stackoverflow.png")) + return actions diff --git a/mm_agents/gpt_4v_prompt.txt b/mm_agents/gpt_4v_prompt.txt deleted file mode 100644 index 5fe9c7c..0000000 --- a/mm_agents/gpt_4v_prompt.txt +++ /dev/null @@ -1,52 +0,0 @@ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. -Here is the description of the action space: - -Firstly you need to predict the class of your action, select from one below: -- **MOUSE_MOVE**: move the mouse to a specific position -- **CLICK**: click on the screen -- **MOUSE_DOWN**: press the mouse button -- **MOUSE_UP**: release the mouse button -- **KEY**: press a key on the keyboard -- **KEY_DOWN**: press a key on the keyboard -- **KEY_UP**: release a key on the keyboard -- **TYPE**: type a string on the keyboard - -Then you need to predict the parameters of your action: -- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor -for example, format as: -``` -{ - "action_type": "MOUSE_MOVE", - "x": 1319.11, - "y": 65.06 -} -``` -- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: -for example, format as: -``` -{ - "action_type": "CLICK", - "click_type": "LEFT" -} -``` -- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]: -for example, format as: -``` -{ - "action_type": "TYPE", - "text": [ - "w", - "i", - "k", - "i", - "p", - "e", - "d", - "i", - "a" - ] -} -``` - -For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things. \ No newline at end of file diff --git a/mm_agents/gpt_4v_prompt_action.py b/mm_agents/gpt_4v_prompt_action.py index 11705e3..650b136 100644 --- a/mm_agents/gpt_4v_prompt_action.py +++ b/mm_agents/gpt_4v_prompt_action.py @@ -1,19 +1,207 @@ SYS_PROMPT = """ You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. -Here is the description of the action space: -Firstly you need to predict the class of your action, select from one below: -- **MOUSE_MOVE**: move the mouse to a specific position -- **CLICK**: click on the screen -- **MOUSE_DOWN**: press the mouse button -- **MOUSE_UP**: release the mouse button -- **KEY**: press a key on the keyboard -- **KEY_DOWN**: press a key on the keyboard -- **KEY_UP**: release a key on the keyboard -- **TYPE**: type a string on the keyboard - -Then you need to predict the parameters of your action: +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + "range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: - For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) for example, format as: ``` @@ -48,7 +236,9 @@ for example, format as: } ``` -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`). -You can predict multiple actions at one step, but you should only return one action for each step. +REMEMBER: +For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. +You MUST wrap the dict with backticks (\`). You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. """ \ No newline at end of file diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py index f04602c..17e8c9d 100644 --- a/mm_agents/gpt_4v_prompt_code.py +++ b/mm_agents/gpt_4v_prompt_code.py @@ -4,5 +4,8 @@ For each step, you will get an observation of an image, which is the screenshot You are required to use `pyautogui` to perform the action. Return one line or multiple lines of python code to perform the action each time, be time efficient. -Return `None` if you cannot perform the action. + +When you think you have to wait for some time, return `WAIT`. +When you think the task can not be done, return `FAIL`. +When you think the task is done, return `DONE`. """ \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 588ec1e..a13f733 100644 --- a/requirements.txt +++ b/requirements.txt @@ -29,3 +29,4 @@ opencv-python ImageHash scikit-image librosa +pymupdf