Merge branch 'main' into zdy

2024-01-15 12:12:05 +08:00
parent b9d8e6c631 7ffb5de551
commit fc289a3427
46 changed files with 1585 additions and 457 deletions
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ todo
 - [x] Set up a pipeline and build agents implementation (zero-shot) for the task
 - [x] Start to design which tasks inside the DesktopEnv to focus on, and start to wrap up the environment to make it public
 - [x] Start to annotate the examples for ~~training~~ and testing
- [ ] Error handling during file passing and file opening, etc.
+- [x] Error handling during file passing and file opening, etc.
- [ ] Add accessibility tree from the OS into the observation space
+- [x] Add accessibility tree from the OS into the observation space
 - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
 - [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -197,8 +197,10 @@ class PythonController:
            if "text" not in parameters:
                raise Exception(f"Unknown parameters: {parameters}")
            # deal with special ' and \ characters
-            text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'")
+            # text = parameters["text"].replace("\\", "\\\\").replace("'", "\\'")
-            self.execute_python_command(f"pyautogui.typewrite('{text}')")
+            # self.execute_python_command(f"pyautogui.typewrite('{text}')")
            text = parameters["text"]
            self.execute_python_command("pyautogui.typewrite({:})".format(repr(text)))
        elif action_type == "PRESS":
            if "key" not in parameters:
@@ -237,6 +239,9 @@ class PythonController:
            keys_para_rep = "', '".join(keys)
            self.execute_python_command(f"pyautogui.hotkey('{keys_para_rep}')")
        elif action_type in ['WAIT', 'FAIL', 'DONE']:
            pass
        else:
            raise Exception(f"Unknown action type: {action_type}")
@@ -280,3 +285,31 @@ class PythonController:
        else:
            logger.error("Failed to get wallpaper. Status code: %d", response.status_code)
            return None
    def get_vm_desktop_path(self):
        """
        Ask the VM-side HTTP server for the desktop path.

        Sends a POST request to ``<http_server>/desktop_path``.

        Returns:
            The ``"desktop_path"`` field of the JSON reply when the server
            answers with HTTP 200; ``None`` (after logging the status code)
            for any other status.
        """
        endpoint = self.http_server + "/desktop_path"
        reply = requests.post(endpoint)

        # Guard clause: anything other than 200 is reported and yields None.
        if reply.status_code != 200:
            logger.error("Failed to get desktop path. Status code: %d", reply.status_code)
            return None

        logger.info("Desktop path downloaded successfully")
        return reply.json()["desktop_path"]
    def get_vm_directory_tree(self, path):
        """
        Ask the VM-side HTTP server for the directory tree rooted at `path`.

        Sends a POST request to ``<http_server>/list_directory`` with the JSON
        body ``{"path": path}``.

        Returns:
            The ``"directory_tree"`` field of the JSON reply when the server
            answers with HTTP 200; ``None`` (after logging the status code)
            for any other status.
        """
        body = json.dumps({"path": path})
        reply = requests.post(
            self.http_server + "/list_directory",
            headers={'Content-Type': 'application/json'},
            data=body,
        )

        # Guard clause: anything other than 200 is reported and yields None.
        if reply.status_code != 200:
            logger.error("Failed to get directory tree. Status code: %d", reply.status_code)
            return None

        logger.info("Directory tree downloaded successfully")
        return reply.json()["directory_tree"]
--- a/desktop_env/controllers/setup.py
+++ b/desktop_env/controllers/setup.py
@@ -1,18 +1,18 @@
 import json
-import time
+import logging
 import os.path
 import time
 import traceback
 import uuid
 from typing import Dict, List
 from typing import Any, Union, Optional
 from typing import Dict, List
 import requests
 from playwright.sync_api import sync_playwright
 from requests_toolbelt.multipart.encoder import MultipartEncoder
 from desktop_env.evaluators.metrics.utils import compare_urls
 import logging
 logger = logging.getLogger("desktopenv.setup")
@@ -20,6 +20,7 @@ class SetupController:
    def __init__(self, vm_ip: str, cache_dir: str):
        self.vm_ip: str = vm_ip
        self.http_server: str = f"http://{vm_ip}:5000"
        self.http_server_setup_root: str = f"http://{vm_ip}:5000/setup"
        self.cache_dir: str = cache_dir
    def reset_cache_dir(self, cache_dir: str):
@@ -57,31 +58,31 @@ class SetupController:
        # can add other setup steps
    # ZDY_COMMENT: merged with launch
-    #def _command_setup(self, command: str):
+    # def _command_setup(self, command: str):
-        #"""
+    # """
-        #Directly send a command into the virtual machine os for setting up.
+    # Directly send a command into the virtual machine os for setting up.
-        #"""
+    # """
-        #payload = json.dumps({"command": command})
+    # payload = json.dumps({"command": command})
-        #headers = {
+    # headers = {
-            #'Content-Type': 'application/json'
+    # 'Content-Type': 'application/json'
-        #}
+    # }
-        #timeout = 5
+    # timeout = 5
-        #timout_whitelist = ["vlc"]
+    # timout_whitelist = ["vlc"]
-#
+    #
-        #try:
+    # try:
-#
+    #
-            #response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout)
+    # response = requests.post(self.http_server + "/execute", headers=headers, data=payload, timeout=timeout)
-            #if response.status_code == 200:
+    # if response.status_code == 200:
-                #print("Command executed successfully:", response.text)
+    # print("Command executed successfully:", response.text)
-            #else:
+    # else:
-                #print("Failed to execute command. Status code:", response.status_code)
+    # print("Failed to execute command. Status code:", response.status_code)
-        #except requests.exceptions.Timeout as e:
+    # except requests.exceptions.Timeout as e:
-            #if command in timout_whitelist:
+    # if command in timout_whitelist:
-                #print("Command executed successfully:", command)
+    # print("Command executed successfully:", command)
-            #else:
+    # else:
-                #print("An error occurred while trying to execute the command:", e)
+    # print("An error occurred while trying to execute the command:", e)
-        #except requests.exceptions.RequestException as e:
+    # except requests.exceptions.RequestException as e:
-            #print("An error occurred while trying to execute the command:", e)
+    # print("An error occurred while trying to execute the command:", e)
    def _download_setup(self, files: List[Dict[str, str]]):
        """
@@ -224,9 +225,14 @@ class SetupController:
        except requests.exceptions.RequestException as e:
            logger.error("An error occurred while trying to send the request: %s", e)
-    def _execute_setup( self, command: List[str]
+    def _execute_setup(
-                      , stdout: str = "", stderr: str = ""
+            self,
-                      , shell: bool = False, until: Optional[Dict[str, Any]] = None):
+            command: List[str],
            stdout: str = "",
            stderr: str = "",
            shell: bool = False,
            until: Optional[Dict[str, Any]] = None
    ):
        if not command:
            raise Exception("Empty comman to launch.")
@@ -248,10 +254,10 @@ class SetupController:
                    if stderr:
                        with open(os.path.join(self.cache_dir, stderr), "w") as f:
                            f.write(results["error"])
-                    logger.info( "Command executed successfully: %s -> %s"
+                    logger.info("Command executed successfully: %s -> %s"
-                               , " ".join(command)
+                                , " ".join(command)
-                               , response.text
+                                , response.text
-                               )
+                                )
                else:
                    logger.error("Failed to launch application. Status code: %s", response.text)
                    results = None
@@ -263,13 +269,13 @@ class SetupController:
                results = None
                nb_failings += 1
-            if len(until)==0:
+            if len(until) == 0:
                terminates = True
            elif results is not None:
-                terminates = "returncode" in until and results["returncode"]==until["returncode"]\
+                terminates = "returncode" in until and results["returncode"] == until["returncode"] \
-                          or "stdout" in until and until["stdout"] in results["output"]\
+                             or "stdout" in until and until["stdout"] in results["output"] \
-                          or "stderr" in until and until["stderr"] in results["error"]
+                             or "stderr" in until and until["stderr"] in results["error"]
-            terminates = terminates or nb_failings>=5
+            terminates = terminates or nb_failings >= 5
            if not terminates:
                time.sleep(0.3)
@@ -292,6 +298,25 @@ class SetupController:
        # TODO
        raise NotImplementedError()
    def _activate_window_setup(self, window_name: str):
        """
        Ask the VM-side server to activate the window named `window_name`.

        Sends a POST request to ``<http_server>/setup/activate_window`` with
        the JSON body ``{"window_name": window_name}``. The outcome is only
        logged; nothing is returned.

        Args:
            window_name: name of the window to activate; must be non-empty.

        Raises:
            Exception: if `window_name` is empty.
        """
        if not window_name:
            raise Exception(f"Setup Activate Window - Invalid window name ({window_name}).")

        payload = json.dumps({"window_name": window_name})
        headers = {
            'Content-Type': 'application/json'
        }

        # send request to server to activate the window
        try:
            response = requests.post(self.http_server + "/setup" + "/activate_window", headers=headers, data=payload)
            if response.status_code == 200:
                logger.info("Command executed successfully: %s", response.text)
            else:
                # Pass the window name as a lazy %-argument: interpolating it
                # into the format string would break on names containing '%'.
                logger.error(
                    "Failed to activate window %s. Status code: %s, response: %s",
                    window_name, response.status_code, response.text
                )
        except requests.exceptions.RequestException as e:
            logger.error("An error occurred while trying to send the request: %s", e)
    # Chrome setup
    def _chrome_open_tabs_setup(self, urls_to_open: List[str]):
        host = self.vm_ip
--- a/desktop_env/envs/actions.py
+++ b/desktop_env/envs/actions.py
@@ -186,5 +186,18 @@ ACTION_SPACE = [
                "optional": False,
            }
        }
    },
    ############################################################################################################
    {
        "action_type": "WAIT",
        "note": "wait until the next action",
    },
    {
        "action_type": "FAIL",
        "note": "decide the task can not be performed",
    },
    {
        "action_type": "DONE",
        "note": "decide the task is done",
    }
 ]
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,28 +1,30 @@
 from __future__ import annotations
 import logging
 import os
 import subprocess
 import tempfile
 import time
 from typing import Callable, Any, Optional
 # import uuid
 # import platform
 from typing import List, Dict
 from typing import Callable, Any, Optional
 import tempfile
 import gymnasium as gym
 # import requests
 from desktop_env.controllers.python import PythonController
 from desktop_env.controllers.setup import SetupController
 # from desktop_env.evaluators import eval_funcs
 from desktop_env.evaluators import metrics, getters
-import logging
+# import requests
 logger = logging.getLogger("desktopenv.env")
 Metric = Callable[[Any, Any], float]
 Getter = Callable[[gym.Env, Dict[str, Any]], Any]
 def _execute_command(command: List[str]) -> None:
    if command[:4] == ["vmrun", "-T", "ws", "start"]:
        p = subprocess.Popen(command)
@@ -84,8 +86,8 @@ class DesktopEnv(gym.Env):
        self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir)
        # Meta info of the VM, move to the reset() function
-        self.vm_platform: str = "" # self.controller.get_vm_platform()
+        self.vm_platform: str = ""  # self.controller.get_vm_platform()
-        self.vm_screen_size = None # self.controller.get_vm_screen_size()
+        self.vm_screen_size = None  # self.controller.get_vm_screen_size()
        # mode: human or machine
        assert action_space in ["computer_13", "pyautogui"]
@@ -164,7 +166,7 @@ class DesktopEnv(gym.Env):
            self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
        self.metric_options: Dict[str, Any] = self.evaluator.get("options", {})
-    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None):
+    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
        logger.info("Resetting environment...")
        logger.info("Switching task...")
@@ -202,11 +204,27 @@ class DesktopEnv(gym.Env):
        time.sleep(5)
        logger.info("Environment setup complete.")
-        observation = self._get_obs()
+        observation = {"screenshot": self._get_obs()}
        return observation
    def step(self, action, pause=0.5):
        self._step_no += 1
        self.action_history.append(action)
        reward = 0  # todo: Define reward calculation for each example
        done = False  # todo: Define episode termination condition for each example
        info = {}
        # handle the special actions
        if action in ['WAIT', 'FAIL', 'DONE']:
            if action == 'WAIT':
                time.sleep(pause)
            elif action == 'FAIL':
                done = True
                info = {"fail": True}
            elif action == 'DONE':
                done = True
                info = {"done": True}
        # fixme: add reminding logic here, decide if the action is valid for the current action_space
        if self.action_space == "computer_13":
@@ -215,18 +233,14 @@ class DesktopEnv(gym.Env):
        elif self.action_space == "pyautogui":
            # the set of all possible python commands insides `pyautogui`
            self.controller.execute_python_command(action)
        self.action_history.append(action)
        # todo: maybe for the better here we need to add a logic to wait until the rendering is done
        time.sleep(pause)
        observation = {
            "screenshot": self._get_obs(),
            "accessibility_tree": self.controller.get_accessibility_tree(),
            "terminal": self.controller.get_terminal_output(),
            "instruction": self.instruction
        }
-        reward = 0  # todo: Define reward calculation for each example
+
        done = False  # todo: Define episode termination condition for each example
        info = {}
        return observation, reward, done, info
    def evaluate(self):
--- a/desktop_env/evaluators/getters/init.py
+++ b/desktop_env/evaluators/getters/init.py
@@ -1,5 +1,9 @@
 from .chrome import get_default_search_engine, get_cookie_data, get_bookmarks, get_open_tabs_info, get_pdf_from_url, \
    get_shortcuts_on_desktop
 from .file import get_cloud_file, get_vm_file, get_cache_file
 from .general import get_vm_command_line
 from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper
 from .misc import get_rule, get_accessibility_tree
 from .replay import get_replay
 from .vlc import get_vlc_playing_info, get_vlc_config
-from .chrome import get_default_search_engine, get_bookmarks, get_open_tabs_info
+from .vscode import get_vscode_config
--- a/desktop_env/evaluators/getters/chrome.py
+++ b/desktop_env/evaluators/getters/chrome.py
@@ -46,6 +46,10 @@ def get_default_search_engine(env, config: Dict[str, str]):
 def get_cookie_data(env, config: Dict[str, str]):
    """
    Get the cookies from the Chrome browser.
    Assume the cookies are stored in the default location, not encrypted and not large in size.
    """
    os_type = env.vm_platform
    if os_type == 'Windows':
        chrome_cookie_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
@@ -61,21 +65,23 @@ def get_cookie_data(env, config: Dict[str, str]):
    else:
        raise Exception('Unsupported operating system')
    # todo: add a new controller function to connect the cookie database
    #############
    try:
-        conn = sqlite3.connect(chrome_cookie_file_path)
+        content = env.controller.get_file(chrome_cookie_file_path)
        _path = os.path.join(env.cache_dir, config["dest"])
        with open(_path, "wb") as f:
            f.write(content)
        conn = sqlite3.connect(_path)
        cursor = conn.cursor()
        # Query to check for OpenAI cookies
        cursor.execute("SELECT * FROM cookies")
        cookies = cursor.fetchall()
        return cookies
    except Exception as e:
        logger.error(f"Error: {e}")
        return None
    #############
 def get_bookmarks(env, config: Dict[str, str]):
@@ -94,17 +100,12 @@ def get_bookmarks(env, config: Dict[str, str]):
    else:
        raise Exception('Unsupported operating system')
-    try:
+    content = env.controller.get_file(preference_file_path)
-        content = env.controller.get_file(preference_file_path)
+    if not content:
-        # make content json variable
+        return []
-        data = json.load(content)
+    data = json.loads(content)
-
+    bookmarks = data.get('roots', {})
-        bookmarks = data.get('roots', {})
+    return bookmarks
        return bookmarks
    except Exception as e:
        logger.error(f"Error: {e}")
        return None
 # todo: move this to the main.py
@@ -190,3 +191,83 @@ def get_active_tab_info(env, config: Dict[str, str]):
        browser.close()
        return active_tab_info
 def get_pdf_from_url(env, config: Dict[str, str]) -> str:
    """
    Render a URL to a PDF file through the Chrome instance running on the VM.

    Config:
        path (str): the URL to open
        dest (str): file name, relative to `env.cache_dir`, to write the PDF to

    Returns:
        The path of the written PDF file.
    """
    target_url = config["path"]
    pdf_path = os.path.join(env.cache_dir, config["dest"])

    debug_port = 9222  # fixme: this port is hard-coded, need to be changed from config file
    cdp_endpoint = f"http://{env.vm_ip}:{debug_port}"

    with sync_playwright() as playwright:
        # Attach to the already-running browser over the DevTools protocol.
        browser = playwright.chromium.connect_over_cdp(cdp_endpoint)
        tab = browser.new_page()
        tab.goto(target_url)
        tab.pdf(path=pdf_path)
        browser.close()

    return pdf_path
 # fixme: needs to be changed (maybe through post-processing) since it's not working
 def get_chrome_saved_address(env, config: Dict[str, str]):
    """
    Return the HTML of Chrome's saved-addresses settings page on the VM.

    Connects over the Chrome DevTools protocol to the browser running on
    `env.vm_ip`, opens ``chrome://settings/addresses`` in a new page and
    returns that page's HTML content.

    Args:
        env: environment object; only `env.vm_ip` is read.
        config: unused.

    Returns:
        The HTML content of the settings page as a string.
    """
    # Use the VM address carried by the environment rather than a fixed,
    # machine-specific IP, so the getter works for any configured VM.
    host = env.vm_ip
    port = 9222  # fixme: this port is hard-coded, need to be changed from config file

    remote_debugging_url = f"http://{host}:{port}"
    with sync_playwright() as p:
        # connect to remote Chrome instance
        browser = p.chromium.connect_over_cdp(remote_debugging_url)
        try:
            page = browser.new_page()

            # Navigate to Chrome's settings page for autofill
            page.goto("chrome://settings/addresses")

            # Get the HTML content of the page
            content = page.content()
        finally:
            # Release the connection even if navigation or reading fails.
            browser.close()

    return content
 def get_shortcuts_on_desktop(env, config: Dict[str, str]):
    """
    Collect the shortcut files found on the VM's desktop.

    The shortcut extension is chosen from `env.vm_platform`
    ('Windows' -> .lnk, 'Darwin' -> .webloc, 'Linux' -> .desktop). The
    desktop directory tree is requested from the controller, every entry
    whose name ends with that extension is downloaded, and its bytes are
    decoded as UTF-8.

    Args:
        env: environment object; `env.vm_platform` and `env.controller` are used.
        config: unused.

    Returns:
        A dict mapping shortcut file name to the file's text content. An
        empty dict is returned when the directory tree cannot be obtained.
        For an unsupported platform an empty list is returned.
        NOTE(review): the empty list is kept from the previous behaviour;
        a dict would match the normal return shape - confirm with callers.
    """
    # Find out the operating system
    os_name = env.vm_platform

    # Depending on the OS, define the shortcut file extension
    if os_name == 'Windows':
        # Windows shortcuts are typically .url or .lnk files
        shortcut_extension = '.lnk'
    elif os_name == 'Darwin':
        # macOS's shortcuts are .webloc files
        shortcut_extension = '.webloc'
    elif os_name == 'Linux':
        # Linux (Ubuntu, etc.) shortcuts are typically .desktop files
        shortcut_extension = '.desktop'
    else:
        logger.error(f"Unsupported operating system: {os_name}")
        return []

    # Get the path to the desktop folder
    desktop_path = env.controller.get_vm_desktop_path()
    desktop_directory_tree = env.controller.get_vm_directory_tree(desktop_path)

    # The controller returns None when the request fails; there is nothing
    # to inspect in that case.
    if not desktop_directory_tree:
        logger.error("Failed to list the desktop directory: %s", desktop_path)
        return {}

    shortcuts_paths = [file['name'] for file in desktop_directory_tree.get('children') or []
                       if file['name'].endswith(shortcut_extension)]

    short_cuts = {}

    for shortcut_path in shortcuts_paths:
        # Let the VM build the absolute path. `!r` emits a properly quoted
        # Python literal, so names containing quotes or backslashes do not
        # break the generated command.
        remote_path = env.controller.execute_python_command(
            f"import os; print(os.path.join(os.path.expanduser('~'), 'Desktop', {shortcut_path!r}))"
        )['output'].strip()

        content = env.controller.get_file(remote_path)
        if content is None:
            logger.error("Failed to download shortcut file: %s", remote_path)
            continue
        short_cuts[shortcut_path] = content.decode('utf-8')

    return short_cuts
--- a/desktop_env/evaluators/getters/file.py
+++ b/desktop_env/evaluators/getters/file.py
@@ -40,7 +40,7 @@ def get_vm_file(env, config: Dict[str, str]) -> Optional[str]:
    file = env.controller.get_file(config["path"])
    if file is None:
        return None
-        #raise FileNotFoundError("File not found on VM: {:}".format(config["path"]))
+        # raise FileNotFoundError("File not found on VM: {:}".format(config["path"]))
    with open(_path, "wb") as f:
        f.write(file)
--- a/desktop_env/evaluators/getters/general.py
+++ b/desktop_env/evaluators/getters/general.py
@@ -1,23 +1,19 @@
 import logging
 from typing import Dict
 import os
 import requests
 logger = logging.getLogger("desktopenv.getters.general")
 def get_string(env, config: Dict[str, str]) -> str:
    """
    Config:
        string (str)
    """
-    return config["string"]
+def get_vm_command_line(env, config: Dict[str, str]):
    vm_ip = env.vm_ip
    port = 5000
    command = config["command"]
-def get_command_line(env, config: Dict[str, str]) -> str:
+    response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command})
-    """
+
-    Config:
+    if response.status_code == 200:
-        string (str)
+        return response.json()["output"]
-    """
+    else:
-    
+        logger.error("Failed to get vm command line. Status code: %d", response.status_code)
-    f = os.popen(config["command"])
+        return None
    return f.read()
--- a/desktop_env/evaluators/getters/misc.py
+++ b/desktop_env/evaluators/getters/misc.py
@@ -1,6 +1,5 @@
 import logging
 from typing import TypeVar
 #from typing import Dict, List
 logger = logging.getLogger("desktopenv.getters.misc")
@@ -13,6 +12,7 @@ def get_rule(env, config: R) -> R:
    """
    return config["rules"]
 def get_accessibility_tree(env, *args) -> str:
    accessibility_tree: str = env.controller.get_accessibility_tree()
    logger.debug("AT@eval: %s", accessibility_tree)
--- a/desktop_env/evaluators/getters/replay.py
+++ b/desktop_env/evaluators/getters/replay.py
@@ -0,0 +1,20 @@
 from typing import List, Dict, Any
 def get_replay(env, trajectory: List[Dict[str, Any]]) -> None:
    """
    Replay a list of keyboard actions on the VM through pyautogui.

    Each action is a dict with a "type" and a "param":
        - "hotkey":    param is a list of key names pressed together
        - "typewrite": param is the text to type
        - "press":     param is a single key name

    Every action is translated into one pyautogui statement and sent to
    `env.controller.execute_python_command`. The whole trajectory is
    translated before anything is executed, so an invalid action does not
    leave the replay half-done.

    Raises:
        ValueError: if an action has an unsupported "type".
    """

    # fixme: need to be combined with the accessibility tree to activate the selection of the target window
    def parse(action):
        # repr() produces a correctly quoted and escaped Python literal, so
        # parameters containing quotes or backslashes stay valid source code.
        if action["type"] == "hotkey":
            keys = ", ".join(repr(key) for key in action["param"])
            return f"pyautogui.hotkey({keys})"

        if action["type"] == "typewrite":
            return f"pyautogui.typewrite({action['param']!r})"

        if action["type"] == "press":
            return f"pyautogui.press({action['param']!r})"

        raise ValueError(f"Unknown replay action type: {action['type']}")

    commands = [parse(action) for action in trajectory]
    for command in commands:
        env.controller.execute_python_command(command)
--- a/desktop_env/evaluators/getters/vscode.py
+++ b/desktop_env/evaluators/getters/vscode.py
@@ -0,0 +1,34 @@
 import logging
 from typing import Any, Dict
 from .file import get_vm_file
 from .replay import get_replay
 logger = logging.getLogger("desktopenv.getters.vscode")
 def get_vscode_config(env, config: Dict[str, Any]) -> str:
    """
    Run a VS Code command-palette command on the VM, then fetch a file from it.

    The command palette is opened with <modifier>+shift+p, the configured
    command is typed and confirmed with enter, and afterwards the file named
    in the config is downloaded through `get_vm_file`.

    Config:
        vscode_extension_command (str): text typed into the command palette
        path (str): forwarded to `get_vm_file` as "path"
        dest (str): forwarded to `get_vm_file` as "dest"

    Returns:
        Whatever `get_vm_file` returns for the given path/dest.
    """
    os_type = env.vm_platform
    vscode_extension_command = config["vscode_extension_command"]

    # fixme: depends on how we config and install the vscode in virtual machine, need to be aligned and double-checked

    # "Darwin" is accepted alongside "MacOS".
    # NOTE(review): the platform string is presumably what platform.system()
    # reports on the VM, which is "Darwin" on macOS - confirm.
    palette_modifier = "command" if os_type in ("MacOS", "Darwin") else "ctrl"

    trajectory = [
        {"type": "hotkey", "param": [palette_modifier, "shift", "p"]},
        {"type": "typewrite", "param": vscode_extension_command},
        {"type": "press", "param": "enter"}
    ]

    get_replay(env, trajectory)

    return get_vm_file(env, {
        "path": config["path"],
        "dest": config["dest"]
    })
--- a/desktop_env/evaluators/metrics/init.py
+++ b/desktop_env/evaluators/metrics/init.py
@@ -1,4 +1,4 @@
-from .chrome import is_expected_tabs, is_expected_bookmarks
+from .chrome import is_expected_tabs, is_expected_bookmarks, compare_pdfs, is_cookie_deleted, is_shortcut_on_desktop
 from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers
 from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, \
    compare_insert_equation
@@ -13,4 +13,5 @@ from .vlc import is_vlc_playing, is_vlc_recordings_folder, is_vlc_fullscreen, co
 from .gimp import increase_saturation, decrease_brightness, check_file_exists, compare_triangle_positions
 from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3
 from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter
-
+from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed
 from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines
--- a/desktop_env/evaluators/metrics/chrome.py
+++ b/desktop_env/evaluators/metrics/chrome.py
@@ -1,5 +1,9 @@
 import logging
 from typing import Any, Dict, List
 import fitz  # PyMuPDF
 import rapidfuzz.fuzz as fuzz
 from desktop_env.evaluators.metrics.utils import are_lists_equal, compare_urls
 logger = logging.getLogger("desktopenv.metrics.chrome")
@@ -22,18 +26,72 @@ def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> f
        return 0
-def is_expected_bookmarks(bookmarks: List[Dict[str, Any]], rule: Dict[str, Any]) -> float:
+def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
    """
    Checks if the expected bookmarks are in Chrome.
    """
-
+    if not bookmarks:
-    # todo
+        return 0.
-    match_type = rule['type']
+    elif rule['type'] == "bookmark_bar_folders_names":
-
+        bookmark_bar_folders_names = [bookmark['name'] for bookmark in bookmarks['bookmark_bar']['children'] if
-    if match_type == "url":
+                                      bookmark['type'] == 'folder']
-        expected_urls = rule['urls']
+        return 1. if set(bookmark_bar_folders_names) == set(rule['names']) else 0.
-        actual_urls = [bookmark['url'] for bookmark in bookmarks]
+    elif rule['type'] == "bookmark_bar_websites_urls":
-        return 1 if are_lists_equal(expected_urls, actual_urls, compare_urls) else 0
+        bookmark_bar_websites_urls = [bookmark['url'] for bookmark in bookmarks['bookmark_bar']['children'] if
                                      bookmark['type'] == 'url']
        return 1. if set(bookmark_bar_websites_urls) == set(rule['urls']) else 0.
    else:
-        logger.error(f"Unknown type: {match_type}")
+        raise TypeError(f"{rule['type']} not support yet!")
-        return 0
+
 def compare_pdfs(pdf1_path, pdf2_path):
    """
    Score the textual similarity of two PDF files.

    The text of every page of each file is concatenated and stripped, and
    the two resulting strings are compared with `fuzz.ratio`.

    Returns:
        The `fuzz.ratio` score divided by 100.
    """

    def read_pdf_text(pdf_path):
        """Return the concatenated, stripped text of all pages of the PDF."""
        with fitz.open(pdf_path) as document:
            page_texts = [page.get_text() for page in document]
        return "".join(page_texts).strip()

    first_text = read_pdf_text(pdf1_path)
    second_text = read_pdf_text(pdf2_path)

    similarity = fuzz.ratio(first_text, second_text)
    return similarity / 100
 def is_cookie_deleted(cookie_data, rule):
    """
    Check that no stored cookie belongs to any of the given domains.

    Args:
        cookie_data: iterable of cookie rows; element 1 of each row is used
            as the cookie's domain.
        rule: dict with "type" == "domains" and a "domains" list.

    Returns:
        0. if `compare_urls` matches any rule domain against any cookie
        domain, otherwise 1.

    Raises:
        TypeError: if `rule["type"]` is anything other than "domains".
    """
    if rule['type'] != 'domains':
        raise TypeError(f"{rule['type']} not support yet!")

    stored_domains = [row[1] for row in cookie_data]

    # Short-circuits on the first match, pairing every rule domain with
    # every stored cookie domain.
    still_present = any(
        compare_urls(wanted, stored)
        for wanted in rule['domains']
        for stored in stored_domains
    )
    return 0. if still_present else 1.
 def is_shortcut_on_desktop(shortcuts: Dict[str, str], rule):
    """
    Check whether a shortcut matching the rule exists among `shortcuts`.

    Args:
        shortcuts: mapping of shortcut file name to the file's text content.
        rule: dict with "type" == "name" and the expected "name".

    Returns:
        1. if any shortcut content contains the line ``Name=<name>``
        (terminated by a newline), otherwise 0.

    Raises:
        TypeError: for every rule type other than "name"
            (including "url" and "id", which are not supported yet).
    """
    # fixme: if the name of the website changed in the future, this will not work; can be replaced with url
    if rule['type'] != 'name':
        raise TypeError(f"{rule['type']} not support yet!")

    found = any(
        "Name=" + rule['name'] + "\n" in content
        for content in shortcuts.values()
    )
    return 1. if found else 0.
--- a/desktop_env/evaluators/metrics/docs.py
+++ b/desktop_env/evaluators/metrics/docs.py
@@ -1,12 +1,14 @@
-import xml.etree.ElementTree as ET
+import logging
 import os
 import xml.etree.ElementTree as ET
 from typing import List, Dict, Any
 from docx import Document
 from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
 import logging
 logger = logging.getLogger("desktopenv.metric.docs")
 def find_default_font(config_file_path, rules):
    """Find the default font in LibreOffice Writer."""
    default_font = None
--- a/desktop_env/evaluators/metrics/impress.py
+++ b/desktop_env/evaluators/metrics/impress.py
@@ -1,4 +1,75 @@
 from pptx import Presentation
 import os
 def is_red_color(color):
    """
    Tell whether `color` is pure red, i.e. its `rgb` equals (255, 0, 0).

    Args:
        color: an object exposing an `rgb` attribute, or None.

    Returns:
        True if `color.rgb == (255, 0, 0)`; False if `color` is falsy, has
        no readable `rgb` attribute, or holds a different value.
    """
    if not color:
        return False
    # getattr with a default also covers colour objects whose `rgb` access
    # raises AttributeError (presumably colours that are not RGB-typed -
    # confirm against the colour class in use).
    return getattr(color, "rgb", None) == (255, 0, 0)
 def get_master_placeholder_color(prs):
    """
    Find the font colour of the "<number>" placeholder in the slide masters.

    Walks every placeholder of every slide master and looks for one that
    has a text frame and whose text is exactly "<number>".

    Returns:
        The `font.color` of the first paragraph of the first such
        placeholder that has at least one paragraph, or None if there is none.
    """
    for master in prs.slide_masters:
        for placeholder in master.placeholders:
            is_number_placeholder = placeholder.has_text_frame and placeholder.text == "<number>"
            if not is_number_placeholder:
                continue

            paragraphs = placeholder.text_frame.paragraphs
            if paragraphs:
                return paragraphs[0].font.color
    return None
 def check_slide_numbers_color(pptx_file_path):
    """
    Check whether the slide numbers of a presentation are coloured red.

    A slide number is recognised as a shape with a `text` attribute whose
    text consists only of digits. When such a shape exists, the colour of
    the "<number>" placeholder in the slide masters is inspected.

    Args:
        pptx_file_path: path of the .pptx file to open.

    Returns:
        1 if a numeric shape exists and the master placeholder colour is
        red; 0 otherwise, including when no numeric shape is found at all.
    """
    presentation = Presentation(pptx_file_path)

    has_slide_number = any(
        hasattr(shape, "text") and shape.text.isdigit()
        for slide in presentation.slides
        for shape in slide.shapes
    )
    if not has_slide_number:
        # Report a failed check as a number rather than falling off the
        # end of the function and returning None.
        return 0

    # "SlidePlaceholder" is the name of the placeholder in the master slide
    font_color = get_master_placeholder_color(presentation)
    return 1 if font_color is not None and is_red_color(font_color) else 0
 def compare_pptx_files(file1_path, file2_path):
    """
    Compare two presentations by slide count and by the text of their shapes.

    Slides are paired in order, and so are the shapes of each slide pair
    (pairing stops at the shorter of the two shape lists). Only pairs in
    which both shapes have a `text` attribute are compared.

    Returns:
        0 if the slide counts differ or any compared pair of shapes has
        different text; 1 otherwise.
    """
    first = Presentation(file1_path)
    second = Presentation(file2_path)

    # A different number of slides is an immediate mismatch.
    if len(first.slides) != len(second.slides):
        return 0

    for left_slide, right_slide in zip(first.slides, second.slides):
        for left_shape, right_shape in zip(left_slide.shapes, right_slide.shapes):
            both_have_text = hasattr(left_shape, "text") and hasattr(right_shape, "text")
            if not both_have_text:
                continue
            if left_shape.text != right_shape.text:
                return 0

    return 1
 def has_two_lines_on_page(slide):
    """
    Tell whether a slide holds at least two shapes whose `shape_type` is 1.

    Returns:
        True as soon as the second such shape is seen, False otherwise.
    """
    seen = 0
    for shape in slide.shapes:
        # 1 is treated as the Line shape type here.
        # NOTE(review): in python-pptx's MSO_SHAPE_TYPE, 1 appears to be
        # AUTO_SHAPE and LINE is 9 - confirm which value is intended.
        if shape.shape_type != 1:
            continue
        seen += 1
        if seen >= 2:
            return True
    return False
 def check_for_two_lines(prs):
    """
    Check whether any slide of a presentation passes `has_two_lines_on_page`.

    Args:
        prs: path of the .pptx file to open with `Presentation`.

    Returns:
        1 if at least one slide satisfies `has_two_lines_on_page`, else 0.
    """
    presentation = Presentation(prs)
    found = any(has_two_lines_on_page(slide) for slide in presentation.slides)
    return 1 if found else 0
 def check_file_exists(directory, filename):
    """
    Check that `filename` exists inside `directory` as a regular file.

    Returns:
        1 if the joined path is an existing file, 0 otherwise.
    """
    candidate = os.path.join(directory, filename)
    if os.path.isfile(candidate):
        return 1
    return 0
 if __name__ == "__main__":
    path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx"
--- a/desktop_env/evaluators/metrics/libreoffice.py
+++ b/desktop_env/evaluators/metrics/libreoffice.py
@@ -1,37 +1,38 @@
 import lxml.cssselect
 from lxml.etree import _Element as Element
 import lxml.etree
 import fnmatch
 from typing import Dict, List
 import lxml.cssselect
 import lxml.etree
 from lxml.etree import _Element as Element
 _libconf_namespaces = [("oor", "http://openoffice.org/2001/registry")]
 _libconf_ns_mapping = dict(_libconf_namespaces)
-_setup_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value'
+_setup_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooSetupSystemLocale]>value',
-                                                   , namespaces=_libconf_ns_mapping
+                                                    namespaces=_libconf_ns_mapping)
-                                                   )
+_locale_selector = lxml.cssselect.CSSSelector('item[oor|path$=L10N]>prop[oor|name=ooLocale]>value',
-_locale_selector = lxml.cssselect.CSSSelector( 'item[oor|path$=L10N]>prop[oor|name=ooLocale]>value'
+                                              namespaces=_libconf_ns_mapping)
-                                             , namespaces=_libconf_ns_mapping
+
-                                             )
+
 def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float:
    config: Element = lxml.etree.parse(config_file).getroot()
    setup_locale_setting: List[Element] = _setup_locale_selector(config)
    locale_setting: List[Element] = _locale_selector(config)
-    setup_locale_setting: str = setup_locale_setting[0].text\
+    setup_locale_setting: str = setup_locale_setting[0].text \
-                             if len(setup_locale_setting)>0\
+        if len(setup_locale_setting) > 0 \
-                           else locale_setting[0].text
+        else locale_setting[0].text
-    return float( any( fnmatch.fnmatchcase(setup_locale_setting, ptn)\
+    return float(any(fnmatch.fnmatchcase(setup_locale_setting, ptn) \
-                   for ptn in rules["locale_set"]
+                     for ptn in rules["locale_set"]
                     )
-                )
+                 )
 if __name__ == "__main__":
    path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu"
-    print( check_libre_locale( path1, { "locale_set": [ "ru-*", "de-*", "fr-*"
+    print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*"
-		     		                                  , "pt-*", "es-*", "it-*"
+        , "pt-*", "es-*", "it-*"
-                                                      ]
+                                                    ]
-                                      }
+                                     }
                             )
-         )
+          )
--- a/desktop_env/evaluators/metrics/pdf.py
+++ b/desktop_env/evaluators/metrics/pdf.py
@@ -1,13 +1,11 @@
 from pypdf import PdfReader
 import operator
 from typing import Dict
 from typing import Any
 from typing import Dict
 from pypdf import PdfReader
 def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float:
    reader = PdfReader(pdf_file)
    nb_pages: int = len(reader.pages)
-    return float( getattr(operator, rules["relation"])( nb_pages
+    return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"]))
                                                      , rules["ref_value"]
                                                      )
                )
--- a/desktop_env/evaluators/metrics/table.py
+++ b/desktop_env/evaluators/metrics/table.py
@@ -1,18 +1,19 @@
-import pandas as pd
+import logging
 import operator
 from numbers import Number
 from typing import Any, Union
 from typing import Dict, List
 import openpyxl
 import pandas as pd
 from openpyxl import Workbook
 from openpyxl.worksheet.worksheet import Worksheet
 from .utils import load_charts, load_sparklines
 import operator
 from typing import Dict, List
 from typing import Any, Union
 from numbers import Number
 import logging
 logger = logging.getLogger("desktopenv.metric.table")
 def compare_table(actual: str, expected: str, **options) -> float:
    """
    Args:
@@ -44,28 +45,28 @@ def compare_table(actual: str, expected: str, **options) -> float:
        workbook1: Workbook = openpyxl.load_workbook(actual)
        workbook2: Workbook = openpyxl.load_workbook(expected)
-        if ftr=="sparkline":
+        if ftr == "sparkline":
            sp1 = load_sparklines(actual)
            sp2 = load_sparklines(expected)
            new_metric: bool = sp1 == sp2
            logger.debug("Sparkline Metric: {:}".format(new_metric))
-        elif ftr=="chart":
+        elif ftr == "chart":
            charts1 = load_charts(workbook1, **options)
            charts2 = load_charts(workbook2, **options)
            new_metric: bool = charts1 == charts2
            logger.debug("Chart Metric: {:}".format(new_metric))
-        elif ftr=="number_format":
+        elif ftr == "number_format":
-            number_formats1: List[str] = [ c.number_format.lower()\
+            number_formats1: List[str] = [c.number_format.lower() \
-                                           for col in workbook1.active.iter_cols()\
+                                          for col in workbook1.active.iter_cols() \
-                                            for c in col\
+                                          for c in col \
-                                            if c.data_type=="n"
+                                          if c.data_type == "n"
-                                         ]
+                                          ]
-            number_formats2: List[str] = [ c.number_format.lower()\
+            number_formats2: List[str] = [c.number_format.lower() \
-                                           for col in workbook2.active.iter_cols()\
+                                          for col in workbook2.active.iter_cols() \
-                                            for c in col\
+                                          for c in col \
-                                            if c.data_type=="n"
+                                          if c.data_type == "n"
-                                         ]
+                                          ]
-            new_metric: bool = number_formats1==number_formats2
+            new_metric: bool = number_formats1 == number_formats2
            logger.debug("Number Format Metric: {:}".format(new_metric))
        else:
            raise NotImplementedError("Unsupported xlsx feature: {:}".format(ftr))
@@ -73,6 +74,7 @@ def compare_table(actual: str, expected: str, **options) -> float:
    return float(metric)
 def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
    if result is None:
        return 0.
@@ -114,6 +116,7 @@ def check_sheet_list(result: str, rules: List[Dict[str, Any]]) -> float:
    return float(passes)
 def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
    if result is None:
        return 0.
@@ -121,16 +124,18 @@ def check_xlsx_freeze(result: str, rules: Dict[str, str]) -> float:
    worksheet: Worksheet = openpyxl.load_workbook(filename=result).active
    return float(worksheet.freeze_panes == rules["position"])
 def check_xlsx_zoom(result: str, rules: Dict[str, Union[str, Number]]) -> float:
    if result is None:
        return 0.
    worksheet = openpyxl.load_workbook(filename=result).active
    zoom_scale: Number = worksheet.sheet_view.zoomScale or 100.
-    return float( getattr(operator, rules["relation"])( zoom_scale
+    return float(getattr(operator, rules["relation"])(zoom_scale
                                                      , rules["ref_value"]
                                                      )
-                )
+                 )
 if __name__ == '__main__':
    # path1 = ""
@@ -168,51 +173,51 @@ if __name__ == '__main__':
    # ]
    # print(check_sheet_list(path1, rule))
-    #path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx"
+    # path1 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold.xlsx"
-    #path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx"
+    # path2 = "../../任务数据/LibreOffice Calc/Create_column_charts_using_statistics_gold2.xlsx"
-    #print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"]))
+    # print(compare_table(path1, path2, features=["chart"], chart_props=["type", "direction"]))
-    #path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx"
+    # path1 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold.xlsx"
-    #path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx"
+    # path2 = "../../任务数据/LibreOffice Calc/Represent_in_millions_billions_gold3.xlsx"
-    #path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx"
+    # path1 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot.xlsx"
-    #path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx"
+    # path2 = "../../任务数据/LibreOffice Calc/Set_Decimal_Separator_Dot_gold.xlsx"
-    #workbook1: Workbook = openpyxl.load_workbook(filename=path1)
+    # workbook1: Workbook = openpyxl.load_workbook(filename=path1)
-    #worksheet1: Worksheet = workbook1.active
+    # worksheet1: Worksheet = workbook1.active
-    #import itertools
+    # import itertools
-    #for col, r in itertools.product( ['A', 'B']
+    # for col, r in itertools.product( ['A', 'B']
-                                   #, range(1, 20)
+    # , range(1, 20)
-                                   #):
+    # ):
-        #position: str = "{:}{:d}".format(col, r)
+    # position: str = "{:}{:d}".format(col, r)
-        #print(worksheet1[position])
+    # print(worksheet1[position])
-        #print(worksheet1[position].value)
+    # print(worksheet1[position].value)
-        #print(worksheet1[position].number_format)
+    # print(worksheet1[position].number_format)
-    #workbook2: Workbook = openpyxl.load_workbook(filename=path2)
+    # workbook2: Workbook = openpyxl.load_workbook(filename=path2)
-    #worksheet2: Worksheet = workbook2.active
+    # worksheet2: Worksheet = workbook2.active
-    #for col, r in itertools.product( ['A', 'B']
+    # for col, r in itertools.product( ['A', 'B']
-                                   #, range(1, 20)
+    # , range(1, 20)
-                                   #):
+    # ):
-        #position: str = "{:}{:d}".format(col, r)
+    # position: str = "{:}{:d}".format(col, r)
-        #print(worksheet2[position])
+    # print(worksheet2[position])
-        #print(worksheet2[position].value)
+    # print(worksheet2[position].value)
-        #print(worksheet2[position].number_format)
+    # print(worksheet2[position].number_format)
-    #print(compare_table(path1, path2, features=["number_format"]))
+    # print(compare_table(path1, path2, features=["number_format"]))
-    #path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx"
+    # path1 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells_gold.xlsx"
-    #path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx"
+    # path2 = "../../任务数据/LibreOffice Calc/Zoom_Out_Oversized_Cells.xlsx"
-    #workbook1: Workbook = openpyxl.load_workbook(filename=path1)
+    # workbook1: Workbook = openpyxl.load_workbook(filename=path1)
-    #worksheet1: Worksheet = workbook1.active
+    # worksheet1: Worksheet = workbook1.active
-    #print(worksheet1.sheet_view.zoomScale)
+    # print(worksheet1.sheet_view.zoomScale)
-    #print(type(worksheet1.sheet_view.zoomScale))
+    # print(type(worksheet1.sheet_view.zoomScale))
-#
+    #
-    #import os
+    # import os
-    #import os.path
+    # import os.path
-    #for wb in filter( lambda f: f.endswith(".xlsx")
+    # for wb in filter( lambda f: f.endswith(".xlsx")
-                    #, os.listdir("../../任务数据/LibreOffice Calc/")
+    # , os.listdir("../../任务数据/LibreOffice Calc/")
-                    #):
+    # ):
-        #path = os.path.join("../../任务数据/LibreOffice Calc/", wb)
+    # path = os.path.join("../../任务数据/LibreOffice Calc/", wb)
-        #print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale)
+    # print(wb, openpyxl.load_workbook(filename=path).active.sheet_view.zoomScale)
-    #print(check_zoom(path1, {"relation": "lt", "ref_value": 100}))
+    # print(check_zoom(path1, {"relation": "lt", "ref_value": 100}))
-    #print(check_zoom(path2, {"relation": "lt", "ref_value": 100}))
+    # print(check_zoom(path2, {"relation": "lt", "ref_value": 100}))
    path1 = "../../任务数据/LibreOffice Calc/Padding_Decimals_In_Formular_gold.xlsx"
    data_frame: pd.DataFrame = pd.read_excel(path1)
--- a/desktop_env/evaluators/metrics/vscode.py
+++ b/desktop_env/evaluators/metrics/vscode.py
@@ -1,16 +1,18 @@
 from typing import Dict
 def compare_text_file(actual: str, expected: str, **options) -> float:
    """
    Args:
-        actual (str): path to result xlsx
+        actual (str): path to result text file
-        expected (str): path to gold xlsx
+        expected (str): path to gold text file
        options (Dict[str, List[str]]): dict like
          {
          }
    Return:
        float: the score
    """
-    
+    if not actual:
        return 0.
    with open(actual) as f1:
        actual_text = f1.read()
    with open(expected) as f2:
@@ -20,13 +22,46 @@ def compare_text_file(actual: str, expected: str, **options) -> float:
        return 1.0
    return 0.0
 def compare_answer(actual: str, expected: str, **options) -> float:
-    if actual == expected:
+def compare_config(actual: str, rules: Dict, **options) -> float:
    if not actual:
        return 0.
    with open(actual) as f1:
        actual_text = f1.read()
    if actual_text == rules['expect']:
        return 1.0
-    
+    return 0.0
 def compare_answer(actual: str, rules: Dict, **options) -> float:
    """
    Score an answer string by exact match against the expected value.

    Args:
        actual (str): result string
        rules (Dict): dict holding the gold string under the key 'expect'
        options: accepted but not used by this function

    Return:
        float: 1.0 if actual is non-empty and equals rules['expect'],
          otherwise 0.0
    """
    # TODO: can use text embedding to get non-zero return
    # A falsy `actual` short-circuits, so `rules` is not read in that case.
    matched = bool(actual) and actual == rules['expect']
    return 1.0 if matched else 0.0
-if __name__ == '__main__':
+
-    print(compare_text_file("README.md", "README.md"))
+def is_extension_installed(actual: str, rules: Dict, **options):
    if rules['type'] == 'contain':
        if rules['expected'] in actual:
            return 1.0
        return 0.0
    elif rules['type'] == 'not_contain':
        if rules['expected'] not in actual:
            return 1.0
        return 0.0
    else:
        raise NotImplementedError
--- a/desktop_env/server/README.md
+++ b/desktop_env/server/README.md
@@ -71,3 +71,10 @@ You can use accerciser to check the accessibility tree on GNOME VM.
 ```sh
 sudo apt install accerciser
 ```
 ### Additional Installation
 Activating the window manager control requires the installation of `wmctrl`:
 ```bash
 sudo apt install wmctrl
 ```
--- a/desktop_env/server/main.py
+++ b/desktop_env/server/main.py
@@ -3,29 +3,26 @@ import os
 import platform
 import subprocess
 from pathlib import Path
 from typing import Any, Optional
 from typing import List, Dict
 import Xlib
 import lxml.etree
 from lxml.etree import _Element
 import pyatspi
 import pyautogui
 import requests
 from PIL import Image
 from Xlib import display, X
 from flask import Flask, request, jsonify, send_file, abort
 from lxml.etree import _Element
 from pyatspi import Accessible, StateType
 from pyatspi import Action as ATAction
 from pyatspi import Component, Document
 from pyatspi import Text as ATText
 from pyatspi import Value as ATValue
 from pyatspi import Action as ATAction
 from typing import List, Dict
 from typing import Any, Optional
 import Xlib
 import pyautogui
 from PIL import Image
 from Xlib import display, X
 from pyxcursor import Xcursor
 import requests
 from flask import Flask, request, jsonify, send_file, abort
 from werkzeug.utils import secure_filename
 app = Flask(__name__)
 pyautogui.PAUSE = 0
@@ -141,22 +138,24 @@ def get_terminal_output():
                xpath = '//application[@name="gnome-terminal-server"]/frame[@st:active="true"]//terminal[@st:focused="true"]'
                terminals: List[_Element] = desktop_xml.xpath(xpath, namespaces=_accessibility_ns_map)
                output = terminals[0].text.rstrip() if len(terminals) == 1 else None
-        else: # windows and macos platform is not implemented currently
+        else:  # windows and macos platform is not implemented currently
            raise NotImplementedError
        return jsonify({"output": output, "status": "success"})
    except:
        return jsonify({"output": None, "status": "error"})
-_accessibility_ns_map = { "st": "uri:deskat:state.at-spi.gnome.org"
+_accessibility_ns_map = {"st": "uri:deskat:state.at-spi.gnome.org"
-                        , "attr": "uri:deskat:attributes.at-spi.gnome.org"
+    , "attr": "uri:deskat:attributes.at-spi.gnome.org"
-                        , "cp": "uri:deskat:component.at-spi.gnome.org"
+    , "cp": "uri:deskat:component.at-spi.gnome.org"
-                        , "doc": "uri:deskat:document.at-spi.gnome.org"
+    , "doc": "uri:deskat:document.at-spi.gnome.org"
-                        , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org"
+    , "docattr": "uri:deskat:attributes.document.at-spi.gnome.org"
-                        , "txt": "uri:deskat:text.at-spi.gnome.org"
+    , "txt": "uri:deskat:text.at-spi.gnome.org"
-                        , "val": "uri:deskat:value.at-spi.gnome.org"
+    , "val": "uri:deskat:value.at-spi.gnome.org"
-                        , "act": "uri:deskat:action.at-spi.gnome.org"
+    , "act": "uri:deskat:action.at-spi.gnome.org"
-                        }
+                         }
 def _create_node(node: Accessible) -> _Element:
    attribute_dict: Dict[str, Any] = {"name": node.name}
@@ -164,11 +163,11 @@ def _create_node(node: Accessible) -> _Element:
    states: List[StateType] = node.getState().get_states()
    for st in states:
        state_name: str = StateType._enum_lookup[st]
-        attribute_dict[ "{{{:}}}{:}"\
+        attribute_dict["{{{:}}}{:}" \
-                            .format( _accessibility_ns_map["st"]
+            .format(_accessibility_ns_map["st"]
-                                   , state_name.split("_", maxsplit=1)[1].lower()
+                    , state_name.split("_", maxsplit=1)[1].lower()
-                                   )
+                    )
-                      ] = "true"
+        ] = "true"
    #  }}} States # 
    #  Attributes {{{ # 
@@ -177,11 +176,11 @@ def _create_node(node: Accessible) -> _Element:
        attribute_name: str
        attribute_value: str
        attribute_name, attribute_value = attrbt.split(":", maxsplit=1)
-        attribute_dict[ "{{{:}}}{:}"\
+        attribute_dict["{{{:}}}{:}" \
-                            .format( _accessibility_ns_map["attr"]
+            .format(_accessibility_ns_map["attr"]
-                                   , attribute_name
+                    , attribute_name
-                                   )
+                    )
-                      ] = attribute_value
+        ] = attribute_value
    #  }}} Attributes # 
    #  Component {{{ # 
@@ -190,9 +189,12 @@ def _create_node(node: Accessible) -> _Element:
    except NotImplementedError:
        pass
    else:
-        attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_SCREEN))
+        attribute_dict["{{{:}}}screencoord".format(_accessibility_ns_map["cp"])] = str(
-        attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_WINDOW))
+            component.getPosition(pyatspi.XY_SCREEN))
-        attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str(component.getPosition(pyatspi.XY_PARENT))
+        attribute_dict["{{{:}}}windowcoord".format(_accessibility_ns_map["cp"])] = str(
            component.getPosition(pyatspi.XY_WINDOW))
        attribute_dict["{{{:}}}parentcoord".format(_accessibility_ns_map["cp"])] = str(
            component.getPosition(pyatspi.XY_PARENT))
        attribute_dict["{{{:}}}size".format(_accessibility_ns_map["cp"])] = str(component.getSize())
    #  }}} Component # 
@@ -209,11 +211,11 @@ def _create_node(node: Accessible) -> _Element:
            attribute_name: str
            attribute_value: str
            attribute_name, attribute_value = attrbt.split(":", maxsplit=1)
-            attribute_dict[ "{{{:}}}{:}"\
+            attribute_dict["{{{:}}}{:}" \
-                                .format( _accessibility_ns_map["docattr"]
+                .format(_accessibility_ns_map["docattr"]
-                                       , attribute_name
+                        , attribute_name
-                                       )
+                        )
-                          ] = attribute_value
+            ] = attribute_value
    #  }}} Document # 
    #  Text {{{ # 
@@ -223,13 +225,13 @@ def _create_node(node: Accessible) -> _Element:
        pass
    else:
        # only text shown on current screen is available
-        #attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount)
+        # attribute_dict["txt:text"] = text_obj.getText(0, text_obj.characterCount)
        text: str = text_obj.getText(0, text_obj.characterCount)
    #  }}} Text # 
    #  Selection {{{ # 
    try:
-       node.querySelection()
+        node.querySelection()
    except NotImplementedError:
        pass
    else:
@@ -256,34 +258,36 @@ def _create_node(node: Accessible) -> _Element:
    else:
        for i in range(action.nActions):
            action_name: str = action.getName(i).replace(" ", "-")
-            attribute_dict[ "{{{:}}}{:}_desc"\
+            attribute_dict["{{{:}}}{:}_desc" \
-                                .format( _accessibility_ns_map["act"]
+                .format(_accessibility_ns_map["act"]
-                                       , action_name
+                        , action_name
-                                       )
+                        )
-                          ] = action.getDescription(i)
+            ] = action.getDescription(i)
-            attribute_dict[ "{{{:}}}{:}_kb"\
+            attribute_dict["{{{:}}}{:}_kb" \
-                                .format( _accessibility_ns_map["act"]
+                .format(_accessibility_ns_map["act"]
-                                       , action_name
+                        , action_name
-                                       )
+                        )
-                          ] = action.getKeyBinding(i)
+            ] = action.getKeyBinding(i)
    #  }}} Action # 
-    xml_node = lxml.etree.Element( node.getRoleName().replace(" ", "-")
+    xml_node = lxml.etree.Element(node.getRoleName().replace(" ", "-")
-                                 , attrib=attribute_dict
+                                  , attrib=attribute_dict
-                                 , nsmap=_accessibility_ns_map
+                                  , nsmap=_accessibility_ns_map
-                                 )
+                                  )
-    if "text" in locals() and len(text)>0:
+    if "text" in locals() and len(text) > 0:
        xml_node.text = text
    for ch in node:
        xml_node.append(_create_node(ch))
    return xml_node
@app.route("/accessibility", methods=["GET"])
def get_accessibility_tree():
    """
    Serve the accessibility tree of desktop 0 as an XML string.

    The desktop object from pyatspi is converted with _create_node() and
    serialized by lxml; the result is returned as JSON under the key "AT".
    """
    root_xml: _Element = _create_node(pyatspi.Registry.getDesktop(0))
    serialized = lxml.etree.tostring(root_xml, encoding="unicode")
    return jsonify({"AT": serialized})
@app.route('/screen_size', methods=['POST'])
 def get_screen_size():
    d = display.Display()
@@ -563,5 +567,43 @@ def open_file():
        return f"Failed to open {path}. Error: {e}", 500
@app.route("/setup/activate_window", methods=['POST'])
def activate_window():
    """
    Bring the window whose title matches `window_name` to the foreground.

    Expects a JSON body containing the key "window_name".

    Return:
        (message, status) tuple:
          200 - the window was activated
          400 - "window_name" is missing/empty, or the OS is not supported
          404 - no matching window could be activated
          500 - on Linux, the `wmctrl` executable is not available
    """
    data = request.json or {}
    window_name = data.get('window_name', None)
    if not window_name:
        # Without a name there is nothing to look up; fail early instead of
        # passing None on to pygetwindow / wmctrl.
        return "window_name required", 400

    os_name = platform.system()

    if os_name in ('Windows', 'Darwin'):
        import pygetwindow as gw
        try:
            # Take the first window whose title matches
            target_window = gw.getWindowsWithTitle(window_name)[0]
        except IndexError:
            return f"Window {window_name} not found.", 404
        if os_name == 'Darwin':
            # Un-minimize the window before bringing it to the front
            target_window.unminimize()
        # Activate the window, bringing it to the front
        target_window.activate()
    elif os_name == 'Linux':
        # Wait for wmctrl to finish so that success is only reported after
        # the activation attempt has actually completed.
        try:
            completed = subprocess.run(["wmctrl", "-a", window_name])
        except FileNotFoundError:
            return "wmctrl is not installed.", 500
        if completed.returncode != 0:
            # NOTE(review): presumably a non-zero exit status means no window
            # matched the name -- confirm against the wmctrl man page.
            return f"Window {window_name} not found or could not be activated.", 404
    else:
        return f"Operating system {os_name} not supported.", 400

    return "Window activated successfully", 200
 if __name__ == '__main__':
    app.run(debug=True, host="0.0.0.0")
--- a/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json
+++ b/evaluation_examples/examples/chrome/2ad9387a-65d8-4e33-ad5b-7580065a27ca.json
@@ -36,7 +36,8 @@
    "expected": {
      "type": "rule",
      "rules": {
-
+        "type": "bookmark_bar_folders_names",
        "names": ["Favorites"]
      }
    }
  }
--- a/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json
+++ b/evaluation_examples/examples/chrome/35253b65-1c19-4304-8aa4-6884b8218fc0.json
@@ -3,16 +3,50 @@
  "snapshot": "chrome",
  "instruction": "Hey, I need a quick way back to this site. Could you whip up a shortcut on my desktop for me?",
  "source": "https://www.laptopmag.com/articles/how-to-create-desktop-shortcuts-for-web-pages-using-chrome",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "google-chrome",
          "--remote-debugging-port=1337"
        ]
      }
    },
    {
      "type": "launch",
      "parameters": {
        "command": [
          "socat",
          "tcp-listen:9222,fork",
          "tcp:localhost:1337"
        ]
      }
    },
    {
      "type": "chrome_open_tabs",
      "parameters": {
        "urls_to_open": [
          "https://www.mathsisfun.com/games/2048.html"
        ]
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
    "chrome"
  ],
  "evaluator": {
-    "func": "",
+    "func": "is_shortcut_on_desktop",
    "result": {
      "type": "shortcuts_on_desktop"
    },
    "expected": {
      "type": "rule",
      "rules": {
        "type": "name",
        "name": "Play Puzzle Game 2048"
      }
    }
  }
 }
--- a/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json
+++ b/evaluation_examples/examples/chrome/7a5a7856-f1b6-42a4-ade9-1ca81ca0f263.json
@@ -1,18 +1,53 @@
 {
  "id": "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
  "snapshot": "chrome",
-  "instruction": "Can you save this webpage I'm looking at to my bookmarks so I can come back to it later?",
+  "instruction": "Can you save this webpage I'm looking at to bookmarks bar so I can come back to it later?",
  "source": "https://www.youtube.com/watch?v=ZaZ8GcTxjXA",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "google-chrome",
          "--remote-debugging-port=1337"
        ]
      }
    },
    {
      "type": "launch",
      "parameters": {
        "command": [
          "socat",
          "tcp-listen:9222,fork",
          "tcp:localhost:1337"
        ]
      }
    },
    {
      "type": "chrome_open_tabs",
      "parameters": {
        "urls_to_open": [
          "https://blog.eleuther.ai/rotary-embeddings/",
          "https://jalammar.github.io/illustrated-transformer/"
        ]
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
    "chrome"
  ],
  "evaluator": {
-    "func": "",
+    "func": "is_expected_bookmarks",
    "result": {
      "type": "bookmarks"
    },
    "expected": {
      "type": "rule",
      "rules": {
        "type": "bookmark_bar_websites_urls",
        "urls": ["https://jalammar.github.io/illustrated-transformer/"]
      }
    }
  }
 }
--- a/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json
+++ b/evaluation_examples/examples/chrome/7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3.json
@@ -1,18 +1,54 @@
 {
  "id": "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
  "snapshot": "chrome",
-  "instruction": "Can you help me clean up my computer by getting rid of all the tracking things that websites like Amazon or eBay might have saved? I want to make sure my browsing is private and those sites don't remember me.",
+  "instruction": "Can you help me clean up my computer by getting rid of all the tracking things that Amazon might have saved? I want to make sure my browsing is private and those sites don't remember me.",
  "source": "https://support.google.com/chrome/answer/95647?hl=en&ref_topic=7438325&sjid=16867045591165135686-AP#zippy=%2Cdelete-cookies-from-a-site",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "google-chrome",
          "--remote-debugging-port=1337"
        ]
      }
    },
    {
      "type": "launch",
      "parameters": {
        "command": [
          "socat",
          "tcp-listen:9222,fork",
          "tcp:localhost:1337"
        ]
      }
    },
    {
      "type": "chrome_open_tabs",
      "parameters": {
        "urls_to_open": [
          "https://www.amazon.com",
          "https://www.amazon.com/s?k=huggingface+transformers+book"
        ]
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
    "chrome"
  ],
  "evaluator": {
-    "func": "",
+    "func": "is_cookie_deleted",
    "result": {
      "type": "cookie_data",
      "dest": "Cookies"
    },
    "expected": {
      "type": "rule",
      "rules": {
        "type": "domains",
        "domains": [".amazon.com"]
      }
    }
  }
 }
--- a/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json
+++ b/evaluation_examples/examples/chrome/e1e75309-3ddb-4d09-92ec-de869c928143.json
@@ -3,16 +3,50 @@
  "snapshot": "chrome",
  "instruction": "Computer, can you turn the webpage I'm looking at into a PDF file and put it on my main screen, you know, the Desktop?",
  "source": "https://in5stepstutorials.com/google-chrome/save-web-page-as-pdf-in-chrome.php",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "google-chrome",
          "--remote-debugging-port=1337"
        ]
      }
    },
    {
      "type": "launch",
      "parameters": {
        "command": [
          "socat",
          "tcp-listen:9222,fork",
          "tcp:localhost:1337"
        ]
      }
    },
    {
      "type": "chrome_open_tabs",
      "parameters": {
        "urls_to_open": [
          "https://lilianweng.github.io/posts/2023-06-23-agent/"
        ]
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
    "chrome"
  ],
  "evaluator": {
-    "func": "",
+    "func": "compare_pdfs",
    "result": {
      "type": "vm_file",
      "path": "Desktop/LLM Powered Autonomous Agents _ Lil'Log.pdf",
      "dest": "LLM Powered Autonomous Agents _ Lil'Log.pdf"
    },
    "expected": {
      "type": "pdf_from_url",
      "path": "https://lilianweng.github.io/posts/2023-06-23-agent/",
      "dest": "LLM Powered Autonomous Agents _ Lil'Log_gold.pdf"
    }
  }
 }
--- a/evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4.json
+++ b/evaluation_examples/examples/gimp/77b8ab4d-994f-43ac-8930-8ca087d7c4b4.json
--- a/evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce.json
+++ b/evaluation_examples/examples/gimp/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce.json
--- a/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json
+++ b/evaluation_examples/examples/libreoffice_impress/455d3c66-7dc6-4537-a39a-36d3e9119df7.json
@@ -1,12 +1,34 @@
 {
  "id": "455d3c66-7dc6-4537-a39a-36d3e9119df7",
  "snapshot": "libreoffice_impress",
-  "instruction": "Could you help me export impress file to image jpg file?",
+  "instruction": "Could you help me export an Impress file to a .jpg image file and save it as res.jpg on the Desktop? ",
  "source": "https://stackoverflow.com/questions/75626383/how-export-libreoffice-impress-to-image",
-  "config": [],
+  "config": [
    {
      "type": "download",
      "parameters": {
        "files": [
          {
            "url": "https://drive.usercontent.google.com/download?id=12MxMjw28_t1nTLihlDpToKebjsSDsjwx&export=download&authuser=0&confirm=t&uuid=1ccc1da0-d7c7-494f-a0e3-59eb55f54e3f&at=APZUnTXvNIRMsF2cjZuFxmQzByhC:1705253210291",
            "path": "Desktop/wssf-project-plan-on-a-page.pptx"
          }
        ]
      }
    },
    {
      "type": "open",
      "parameters": {
        "path": "Desktop/wssf-project-plan-on-a-page.pptx"
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
-    ""
+    "libreoffice_impress"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
-}
+    "func": "check_file_exists",
    "file_name": "res.jpg",
    "directory": "/home/user/Desktop/"
  }
 }
--- a/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json
+++ b/evaluation_examples/examples/libreoffice_impress/550ce7e7-747b-495f-b122-acdc4d0b8e54.json
@@ -1,12 +1,37 @@
 {
  "id": "550ce7e7-747b-495f-b122-acdc4d0b8e54",
  "snapshot": "libreoffice_impress",
-  "instruction": "Could you help me add a strike-through on this text",
+  "instruction": "I am checking our soccer club's to-do list for the last semester and adding strike-through sign on the line we have already accomplished. Could you help me add a strike-through on the first and second line?",
  "source": "https://superuser.com/questions/1211035/libreoffice-impress-animations-how-to-strikethrough-on-click?rq=1",
-  "config": [],
+  "config": [
    {
      "type": "download",
      "parameters": {
        "files": [
          {
            "url": "https://drive.usercontent.google.com/download?id=1fw0baVZ15s0r1WGEBftgED2H0ljZgYtu&export=download&authuser=0&confirm=t&uuid=df03788a-81ef-4e55-b33a-2fba7ab28cb8&at=APZUnTXPb-sm88KNwmNeugbhPrzx:17052529805399",
            "path": "Desktop/New_Club_Spring_2018_Training.pptx"
          }
        ]
      }
    },
    {
      "type": "open",
      "parameters": {
        "path": "Desktop/New_Club_Spring_2018_Training.pptx"
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
-    ""
+    "libreoffice_impress"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
-}
+    "func": "check_for_two_lines",
    "result": {
      "type": "vm_file",
      "path": "Desktop/New_Club_Spring_2018_Training.pptx",
      "dest": "New_Club_Spring_2018_Training.pptx"
    }
  }
 }
--- a/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json
+++ b/evaluation_examples/examples/libreoffice_impress/5d901039-a89c-4bfb-967b-bf66f4df075e.json
@@ -1,12 +1,42 @@
 {
  "id": "5d901039-a89c-4bfb-967b-bf66f4df075e",
  "snapshot": "libreoffice_impress",
-  "instruction": "Help me stretch the image to fill the entire page, keeping its proportion and centering the image",
+  "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.",
  "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag",
-  "config": [],
+  "config": [
    {
      "type": "download",
      "parameters": {
        "files": [
          {
            "url": "https://drive.usercontent.google.com/download?id=16K6TpGIRZpqOJUu-mtJQ_78kIwLcn-4D&export=download&authuser=0&confirm=t&uuid=945b6f33-53d2-4e87-ada9-efa8b938a499&at=APZUnTVw4fKyJPW0vAAJURruAJIP:1705250184439",
            "path": "Desktop/CPD_Background_Investigation_Process.pptx"
          }
        ]
      }
    },
    {
      "type": "open",
      "parameters": {
        "path": "Desktop/CPD_Background_Investigation_Process.pptx"
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
-    ""
+    "libreoffice_impress"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
-}
+    "func": "compare_pptx_files",
    "expected": {
      "type": "cloud_file",
      "path": "https://drive.usercontent.google.com/download?id=1rsvFPyHYiIPh1c8Nj8say0NJCG2VIDr7&export=download&authuser=0&confirm=t&uuid=aac08a92-6595-47d8-84dc-8f1ab1df987f&at=APZUnTXIWCn5B0CpLttvG2bsr_a7:1705250423565",
      "dest": "CPD_Background_Investigation_Process_Gold.pptx"
    },
    "result": {
      "type": "vm_file",
      "path": "Desktop/CPD_Background_Investigation_Process.pptx",
      "dest": "CPD_Background_Investigation_Process.pptx"
    }
  }
 }
--- a/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json
+++ b/evaluation_examples/examples/vs_code/0ed39f63-6049-43d4-ba4d-5fa2fe04a951.json
@@ -1,7 +1,7 @@
 {
  "id": "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
  "snapshot": "vscode",
-  "instruction": "Could you help me find and replace \"text\" with \"test\" in this file?",
+  "instruction": "Please change all the places that say \"text\" to \"test\" in this document for me.",
  "source": "https://www.quora.com/How-do-you-find-and-replace-text-in-Visual-Studio-Code",
  "config": [
    {
@@ -16,9 +16,15 @@
      }
    },
    {
-      "type": "open",
+      "type": "launch",
      "parameters": {
-        "path": "Desktop/vscode_replace_text.txt"
+        "command": ["code", "Desktop/vscode_replace_text.txt"]
      }
    },
    {
      "type": "activate_window",
      "parameters": {
        "window_name": "Visual Studio Code"
      }
    }
  ],
--- a/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json
+++ b/evaluation_examples/examples/vs_code/53ad5833-3455-407b-bbc6-45b4c79ab8fb.json
@@ -1,13 +1,50 @@
 {
  "id": "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
  "snapshot": "vscode",
-  "instruction": "Could you help me open the project at /home/user/project?",
+  "instruction": "I'd like the \"project\" in the \"user\" folder under \"home\" to be opened with VS Code, please.",
  "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ",
  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "code"
        ]
      }
    },
    {
      "type": "command",
      "parameters": {
-        "command": ["mkdir", "-p", "/home/user/project"]
+        "command": [
          "mkdir",
          "-p",
          "/home/user/project/.vscode"
        ]
      }
    },
    {
      "type": "download",
      "parameters": {
        "files": [
          {
            "url": "https://drive.usercontent.google.com/download?id=1akdsiRVdq6CUtT-FX8Dpf8ruPTq6DcFn&export=download&authuser=0&confirm=t&uuid=ce2fa96a-454e-43d9-bbe3-98553b7eed0d&at=APZUnTVw_YQ1URTvP34vrmKcw0b4:1705222451052",
            "path": "/home/user/project/main.py"
          },
          {
            "url": "https://drive.usercontent.google.com/download?id=1BkwtqtAzv_K2CrTbJZ0HbMHBffzdD9vc&export=download&authuser=0&confirm=t&uuid=28f77090-deef-49a1-b156-91317881e75e&at=APZUnTXuaR6i_3t3Prslk535GaO5:1705222457290",
            "path": "/home/user/project/README.md"
          },
          {
            "url": "https://drive.usercontent.google.com/download?id=1ea_zF2tbcXOB8w9neBV-U5xI2nnPzIw_&export=download&authuser=0&confirm=t&uuid=9cf8c5bb-a880-475c-b80b-967a0c4fbea4&at=APZUnTUdjIj80F3Mbgi72eZDTZLO:1705222462443",
            "path": "/home/user/project/.vscode/settings.json"
          }
        ]
      }
    },
    {
      "type": "activate_window",
      "parameters": {
        "window_name": "Visual Studio Code"
      }
    }
  ],
@@ -15,5 +52,27 @@
  "related_apps": [
    "vscode"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
    "postconfig": [
      {
        "type": "activate_window",
        "parameters": {
          "window_name": "Visual Studio Code"
        }
      }
    ],
    "func": "compare_config",
    "expected": {
      "type": "rule",
      "rules": {
        "expect": "project"
      }
    },
    "result": {
      "type": "vscode_config",
      "vscode_extension_command": "OpenProject",
      "path": "OpenProject.txt",
      "dest": "OpenProject.txt"
    }
  }
 }
--- a/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json
+++ b/evaluation_examples/examples/vs_code/59ed65c7-e9a6-43db-833f-76d6730c0004.json
@@ -1,12 +1,50 @@
 {
  "id": "59ed65c7-e9a6-43db-833f-76d6730c0004",
  "snapshot": "vscode",
-  "instruction": "Could you help me start debugging with the breakpoint at line 15?",
+  "instruction": "Could you help me start debugging with the breakpoint at line 100?",
  "source": "https://www.youtube.com/watch?v=7qZBwhSlfOo",
-  "config": [],
+  "config": [
    {
      "type": "download",
      "parameters": {
        "files": [
          {
            "url": "https://drive.usercontent.google.com/download?id=1eLlB7UqRjh55vm0SIxb96aU1WbbK3H3T&export=download&authuser=0&confirm=t&uuid=379d1cbf-cca1-454a-a5a6-c5389024f728&at=APZUnTWn4vJZhfvrdfYZ6byVfaSj:1705159150342",
            "path": "Desktop/main.py"
          }
        ]
      }
    },
    {
      "type": "launch",
      "parameters": {
        "command": ["code", "Desktop/main.py"]
      }
    },
    {
      "type": "activate_window",
      "parameters": {
        "window_name": "Visual Studio Code"
      }
    }
  ],
  "trajectory": "trajectories/",
  "related_apps": [
    "vscode"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
    "func": "compare_config",
    "expected": {
      "type": "rule",
      "rules": {
        "expect": "100"
      }
    },
    "result": {
      "type": "vscode_config",
      "vscode_extension_command": "GetBreakPoint",
      "path": "GetBreakPoint.txt",
      "dest": "GetBreakPoint.txt"
    }
  }
 }
--- a/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json
+++ b/evaluation_examples/examples/vs_code/982d12a5-beab-424f-8d38-d2a48429e511.json
@@ -3,10 +3,39 @@
  "snapshot": "vscode",
  "instruction": "Could you help me change the color theme to Dark?",
  "source": "https://www.youtube.com/watch?v=ORrELERGIHs",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "code"
        ]
      }
    },
    {
      "type": "activate_window",
      "parameters": {
        "window_name": "Visual Studio Code"
      }
    }
  ],
  "trajectory": "trajectories/982d12a5-beab-424f-8d38-d2a48429e511",
  "related_apps": [
    "vscode"
  ],
-  "evaluator": "evaluation_dir"
+  "evaluator": {
    "func": "compare_config",
    "expected": {
      "type": "rule",
      "rules": {
        "expect": "2"
      }
    },
    "result": {
      "type": "vscode_config",
      "vscode_extension_command": "GetColorTheme",
      "path": "GetColorTheme.txt",
      "dest": "GetColorTheme.txt"
    }
  }
 }
--- a/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json
+++ b/evaluation_examples/examples/vs_code/eabc805a-bfcf-4460-b250-ac92135819f6.json
@@ -3,20 +3,44 @@
  "snapshot": "vscode",
  "instruction": "Help me install the extension Python.",
  "source": "https://www.youtube.com/watch?v=VqCgcpAypFQ",
-  "config": [],
+  "config": [
    {
      "type": "launch",
      "parameters": {
        "command": [
          "code"
        ]
      }
    },
    {
      "type": "activate_window",
      "parameters": {
        "window_name": "Visual Studio Code"
      }
    }
  ],
  "trajectory": "trajectories/eabc805a-bfcf-4460-b250-ac92135819f6",
  "related_apps": [
    "vscode"
  ],
  "evaluator": {
-    "func": "compare_answer",
+    "func": "is_extension_installed",
    "expected": {
      "type": "string",
      "string": "ms-python.python\n"
    },
    "result": {
-      "type": "command_line",
+      "type": "vm_command_line",
-      "command": "code --list-extensions | grep ms-python.python"
+      "command": [
        "code",
        "--list-extensions",
        "|",
        "grep",
        "ms-python.python"
      ]
    },
    "expected": {
      "type": "rule",
      "rules": {
        "type": "contain",
        "expected": "ms-python.python"
      }
    }
  }
 }
--- a/experiment.py
+++ b/experiment.py
@@ -0,0 +1,104 @@
 import datetime
 import json
 import logging
 import os
 import sys
 from desktop_env.envs.desktop_env import DesktopEnv
 from mm_agents.gpt_4v_agent import GPT4v_Agent
 #  Logger Configs {{{ # 
 # Root logger: everything at DEBUG and above is handed to the handlers below,
 # each of which applies its own level (and, for two of them, a name filter).
 logger = logging.getLogger()
 logger.setLevel(logging.DEBUG)
 datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
 # FileHandler opens its file as soon as it is constructed, so the target
 # directory has to exist before any handler is created.
 os.makedirs("logs", exist_ok=True)
 file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
 debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
 stdout_handler = logging.StreamHandler(sys.stdout)
 sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
 file_handler.setLevel(logging.INFO)
 debug_handler.setLevel(logging.DEBUG)
 stdout_handler.setLevel(logging.INFO)
 sdebug_handler.setLevel(logging.DEBUG)
 # One shared format for all handlers; the \x1b[...m sequences are ANSI colour codes.
 formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
 file_handler.setFormatter(formatter)
 debug_handler.setFormatter(formatter)
 stdout_handler.setFormatter(formatter)
 sdebug_handler.setFormatter(formatter)
 # stdout and the "sdebug" file only receive records from the "desktopenv"
 # logger hierarchy; the other two files receive records from every logger.
 stdout_handler.addFilter(logging.Filter("desktopenv"))
 sdebug_handler.addFilter(logging.Filter("desktopenv"))
 logger.addHandler(file_handler)
 logger.addHandler(debug_handler)
 logger.addHandler(stdout_handler)
 logger.addHandler(sdebug_handler)
 #  }}} Logger Configs # 
 # From here on `logger` is this module's own logger, a child of "desktopenv".
 logger = logging.getLogger("desktopenv.experiment")
 # Path of the VM image handed to DesktopEnv.
 # NOTE(review): machine-specific absolute path - adjust before running elsewhere.
 PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
 def run_one_example(example, agent, max_steps=20, example_trajectory_dir="exp_trajectory"):
    """Run one task episode with `agent` inside a DesktopEnv and log the evaluation result.

    Args:
        example: Task configuration; passed to DesktopEnv as `task_config`, and its
            "instruction" entry is attached to every observation given to the agent.
        agent: Object exposing `action_space` and `predict(observation)`; `predict`
            returns an iterable of actions, or None when it could not produce any.
        max_steps: Maximum number of `agent.predict` calls before the episode is cut off.
        example_trajectory_dir: Directory intended for trajectory recordings.
            Currently unused - see the todo below.

    Returns:
        None. Progress and the final evaluation result are written to the logger.
    """
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    observation['instruction'] = example['instruction']
    done = False
    step_num = 0
    # todo: save the screenshots and actions to a folder
    while not done and step_num < max_steps:
        # The agent returns None when it fails to parse the model response;
        # treat that like an empty action list instead of crashing the loop.
        actions = agent.predict(observation) or []
        step_num += 1
        logger.info("Step %d", step_num)
        logger.info("Action: %s", actions)
        if not actions:
            # Nothing was executed, so there is no new observation/reward/info to report.
            logger.warning("Agent returned no action; skipping environment step.")
            logger.info("================================\n")
            continue
        for action in actions:
            observation, reward, done, info = env.step(action)
            observation['instruction'] = example['instruction']
        # Leave the accessibility tree out of the log line, but do not remove it
        # from the observation that is handed to the agent on the next step.
        loggable_observation = {
            key: value for key, value in observation.items() if key != "accessibility_tree"
        }
        logger.info("Observation: %s", loggable_observation)
        logger.info("Reward: %.2f", reward)
        logger.info("Info: %s", info)
        logger.info("================================\n")
        if done:
            logger.info("The episode is done.")
            break
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    # NOTE(review): env.close() is disabled, so the message below is logged
    # without the environment actually being closed.
    # env.close()
    logger.info("Environment closed.")
 if __name__ == "__main__":
    # Task selection: which example file to load and which action space the agent uses.
    action_space = "pyautogui"
    example_class = "vlc"
    example_id = "8f080098-ddb1-424c-b438-4e96e5e4786e"

    # Load the task configuration and override the snapshot it starts from.
    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
        example = json.load(f)
    example["snapshot"] = "chrome_setup"

    # The OpenAI key comes from the environment (None when the variable is unset).
    agent = GPT4v_Agent(api_key=os.environ.get("OPENAI_API_KEY"), action_space=action_space)

    # Per-example output directory: exp_trajectory/<class>/<id>.
    example_trajectory_dir = os.path.join("exp_trajectory", example_class, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, max_steps=20, example_trajectory_dir=example_trajectory_dir)
--- a/main.py
+++ b/main.py
@@ -1,10 +1,10 @@
 import datetime
 import json
 from desktop_env.envs.desktop_env import DesktopEnv
 import logging
 import os
 import sys
-import datetime
+
 from desktop_env.envs.desktop_env import DesktopEnv
 #  Logger Configs {{{ # 
 logger = logging.getLogger()
@@ -12,17 +12,18 @@ logger.setLevel(logging.DEBUG)
 datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
-file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
+file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
-debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
+debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
 stdout_handler = logging.StreamHandler(sys.stdout)
-sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))
+sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
 file_handler.setLevel(logging.INFO)
 debug_handler.setLevel(logging.DEBUG)
 stdout_handler.setLevel(logging.INFO)
 sdebug_handler.setLevel(logging.DEBUG)
-formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
+formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
 file_handler.setFormatter(formatter)
 debug_handler.setFormatter(formatter)
 stdout_handler.setFormatter(formatter)
@@ -39,6 +40,7 @@ logger.addHandler(sdebug_handler)
 logger = logging.getLogger("desktopenv.main")
 def human_agent():
    """
    Runs the Gym environment with human input.
@@ -76,7 +78,8 @@ def human_agent():
        # }
        logger.info(trajectory[i])
-        observation, reward, done, info = env.step(trajectory[i], pause=5)
+        observation, reward, done, info = env.step(trajectory[i])
        observation.pop("accessibility_tree")
        logger.info("Observation: %s", observation)
        logger.info("Reward: %.2f", reward)
        logger.info("Info: %s", info)
@@ -87,12 +90,14 @@ def human_agent():
            logger.info("The episode is done.")
            break
    #input("PAUSING")
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    #input("PAUSING")
-    #env.close()
+    # env.close()
    logger.info("Environment closed.")
--- a/mm_agents/gemini_agent.py
+++ b/mm_agents/gemini_agent.py
@@ -0,0 +1,84 @@
 from typing import Dict
 import PIL.Image
 import google.generativeai as genai
 from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
 from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
 from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
 class GeminiPro_Agent:
    """Agent that asks a Gemini vision model for the next desktop action.

    The conversation so far is kept in `self.trajectory` as a list of
    {"role": ..., "parts": [...]} messages and is sent in full on every call.
    """

    def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"):
        """Configure the Gemini client and seed the trajectory with the system prompt.

        Args:
            api_key: API key passed to `genai.configure`.
            model: Name of the Gemini model to instantiate.
            max_tokens: Upper bound on generated tokens per response.
            action_space: "computer_13" or "pyautogui"; selects the system prompt
                and the response parser.

        Raises:
            KeyError: If `action_space` is not one of the two supported values.
        """
        # `genai.configure` only takes keyword arguments.
        genai.configure(api_key=api_key)
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.action_space = action_space
        # NOTE(review): the Gemini API documents only the "user" and "model" roles;
        # confirm that the "system"/"assistant" roles used in this class are accepted.
        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    {
                        "computer_13": SYS_PROMPT_ACTION,
                        "pyautogui": SYS_PROMPT_CODE
                    }[action_space]
                ]
            }
        ]

    def predict(self, obs: Dict):
        """
        Predict the next action(s) based on the current observation.

        Args:
            obs: Observation with a "screenshot" entry (something `PIL.Image.open`
                accepts) and an "instruction" entry (the task text).

        Returns:
            The parsed actions, or None when the response could not be read or parsed.
        """
        img = PIL.Image.open(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
                obs["instruction"]), img]
        })

        # Text-only view of the trajectory for console output; images are
        # replaced by a placeholder.
        traj_to_show = []
        for message in self.trajectory:
            parts = message["parts"]
            traj_to_show.append(parts[0])
            if len(parts) > 1:
                traj_to_show.append("screenshot_obs")
        print("Trajectory:", traj_to_show)

        # The token limit is passed through `generation_config`;
        # `generate_content` has no `max_tokens` parameter.
        response = self.model.generate_content(
            self.trajectory,
            generation_config={"max_output_tokens": self.max_tokens},
        )

        # `response.text` raises when the response carries no text
        # (for example a blocked response), so read it separately from parsing.
        try:
            response_text = response.text
        except Exception as e:
            print("Failed to read text from response:", e)
            return None

        try:
            actions = self.parse_actions(response_text)
        except Exception:
            # todo: add error handling
            print("Failed to parse action from response:", response_text)
            actions = None
        return actions

    def parse_actions(self, response: str):
        """Parse the model response into actions and record it in the trajectory.

        Args:
            response: Raw text returned by the model.

        Returns:
            Whatever the parser for the configured action space returns.

        Raises:
            ValueError: If `self.action_space` is not a supported value.
        """
        # response example
        """
        ```json
        {
          "action_type": "CLICK",
          "click_type": "RIGHT"
        }
        ```
        """
        # parse from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            # Without this branch `actions` would be unbound at the return below.
            raise ValueError("Unknown action space: {}".format(self.action_space))
        # add action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })
        return actions
--- a/mm_agents/gemini_test.py
+++ b/mm_agents/gemini_test.py
@@ -1,19 +0,0 @@
 import PIL.Image
 import google.generativeai as genai
 import os

 # Smoke test: send one image with a text prompt to gemini-pro-vision.
 # The API key is read from the environment rather than hard-coded in the
 # source; the key that was previously committed here should be revoked.
 genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
 # for m in genai.list_models():
 #   if 'generateContent' in m.supported_generation_methods:
 #     print(m.name)
 model = genai.GenerativeModel('gemini-pro-vision')
 img = PIL.Image.open('image.jpg')
 # A single user message made of a text part and an image part.
 messages = [
    {'role':'user',
     'parts': ["Explain this image.", img]}
 ]
 response = model.generate_content(messages)
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,12 +1,12 @@
 # fixme: Need to be rewrite on new action space
 import os
 import re
 import base64
 from desktop_env.envs.desktop_env import Action, MouseClick
 import json
 import re
 from typing import Dict
 import requests
-from mm_agents.gpt_4v_prompt import SYS_PROMPT
+
 from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
 from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
 # Function to encode the image
@@ -47,11 +47,26 @@ def parse_actions_from_string(input_string):
                raise ValueError("Invalid response format: " + input_string)
 def parse_code_from_string(input_string):
    """Return the contents of every triple-backtick fenced block in `input_string`.

    An optional leading word followed by whitespace (e.g. a language tag such as
    `python`) directly after the opening fence is not part of the captured text.
    Blocks are returned in order of appearance; the list is empty when there are none.
    """
    # Optional "<word><whitespace>" after the opening fence, then a non-greedy
    # capture up to the next closing fence.
    pattern = r"```(?:\w+\s+)?(.*?)```"
    # DOTALL lets `.` cross newlines so a fenced block may span several lines.
    fenced_blocks = re.finditer(pattern, input_string, flags=re.DOTALL)
    return [block.group(1) for block in fenced_blocks]
 class GPT4v_Agent:
-    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
+    def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
        self.instruction = instruction
        self.model = model
        self.max_tokens = max_tokens
        self.action_space = action_space
        self.headers = {
            "Content-Type": "application/json",
@@ -64,20 +79,27 @@ class GPT4v_Agent:
                "content": [
                    {
                        "type": "text",
-                        "text": SYS_PROMPT
+                        "text": {
                            "computer_13": SYS_PROMPT_ACTION,
                            "pyautogui": SYS_PROMPT_CODE
                        }[action_space]
                    },
                ]
            }
        ]
-    def predict(self, obs):
+    def predict(self, obs: Dict):
-        base64_image = encode_image(obs)
+        """
        Predict the next action(s) based on the current observation.
        """
        base64_image = encode_image(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
-                    "text": "What's the next step for instruction '{}'?".format(self.instruction)
+                    "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
                        obs["instruction"])
                },
                {
                    "type": "image_url",
@@ -87,12 +109,15 @@ class GPT4v_Agent:
                }
            ]
        })
        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
            if len(self.trajectory[i]["content"]) > 1:
                traj_to_show.append("screenshot_obs")
        print("Trajectory:", traj_to_show)
        payload = {
            "model": self.model,
            "messages": self.trajectory,
@@ -103,6 +128,7 @@ class GPT4v_Agent:
        try:
            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
        except:
            # todo: add error handling
            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
            actions = None
@@ -120,7 +146,10 @@ class GPT4v_Agent:
        """
        # parse from the response
-        actions = parse_actions_from_string(response)
+        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        # add action into the trajectory
        self.trajectory.append({
@@ -133,34 +162,4 @@ class GPT4v_Agent:
            ]
        })
-        # parse action
+        return actions
        parsed_actions = []
        for action in actions:
            parsed_action = {}
            action_type = Action[action['action_type']].value
            parsed_action["action_type"] = action_type
            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
                parsed_action["click_type"] = MouseClick[action['click_type']].value
            if action_type == Action.MOUSE_MOVE.value:
                parsed_action["x"] = action["x"]
                parsed_action["y"] = action["y"]
            if action_type == Action.KEY.value:
                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys
            if action_type == Action.TYPE.value:
                parsed_action["text"] = action["text"]
            parsed_actions.append(parsed_action)
        return parsed_actions
 if __name__ == '__main__':
    # OpenAI API Key
    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
    print(agent.predict(obs="stackoverflow.png"))
--- a/mm_agents/gpt_4v_prompt.txt
+++ b/mm_agents/gpt_4v_prompt.txt
@@ -1,52 +0,0 @@
 You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
 For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
 Here is the description of the action space:
 Firstly you need to predict the class of your action, select from one below:
 - **MOUSE_MOVE**: move the mouse to a specific position
 - **CLICK**: click on the screen
 - **MOUSE_DOWN**: press the mouse button
 - **MOUSE_UP**: release the mouse button
 - **KEY**: press a key on the keyboard
 - **KEY_DOWN**: press and hold a key on the keyboard
 - **KEY_UP**: release a key on the keyboard
 - **TYPE**: type a string on the keyboard
 Then you need to predict the parameters of your action:
 - For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
 for example, format as:
 ```
 {
  "action_type": "MOUSE_MOVE",
  "x": 1319.11,
  "y": 65.06
 }
 ```
 - For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
 for example, format as:
 ```
 {
  "action_type": "CLICK",
  "click_type": "LEFT"
 }
 ```
 - For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
 for example, format as:
 ```
 {
  "action_type": "TYPE",
  "text": [
    "w",
    "i",
    "k",
    "i",
    "p",
    "e",
    "d",
    "i",
    "a"
  ]
 }
 ```
 For every step, you should return only the action_type and the parameters of your action as a dict, without anything else.
--- a/mm_agents/gpt_4v_prompt_action.py
+++ b/mm_agents/gpt_4v_prompt_action.py
@@ -1,19 +1,207 @@
 SYS_PROMPT = """
 You will act as an agent which follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
 For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
 Here is the description of the action space:
-Firstly you need to predict the class of your action, select from one below:
+HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
- **MOUSE_MOVE**: move the mouse to a specific position
+ACTION_SPACE = [
- **CLICK**: click on the screen
+    {
- **MOUSE_DOWN**: press the mouse button
+        "action_type": "MOVE_TO",
- **MOUSE_UP**: release the mouse button
+        "note": "move the cursor to the specified position",
- **KEY**: press a key on the keyboard
+        "parameters": {
- **KEY_DOWN**: press a key on the keyboard
+            "x": {
- **KEY_UP**: release a key on the keyboard
+                "type": float,
- **TYPE**: type a string on the keyboard
+                "range": [0, X_MAX],
-
+                "optional": False,
-Then you need to predict the parameters of your action:
+            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "CLICK",
        "note": "click the left button if the button is not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            },
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            },
            "num_clicks": {
                "type": int,
                "range": [1, 2, 3],
                "optional": True,
            },
        }
    },
    {
        "action_type": "MOUSE_DOWN",
        "note": "press the left button if the button is not specified, otherwise press the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "MOUSE_UP",
        "note": "release the left button if the button is not specified, otherwise release the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "RIGHT_CLICK",
        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DOUBLE_CLICK",
        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DRAG_TO",
        "note": "drag the cursor to the specified position with the left button pressed",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": False,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "SCROLL",
        "note": "scroll the mouse wheel up or down",
        "parameters": {
            "dx": {
                "type": int,
                "range": None,
                "optional": False,
            },
            "dy": {
                "type": int,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "TYPING",
        "note": "type the specified text",
        "parameters": {
            "text": {
                "type": str,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "PRESS",
        "note": "press the specified key and release it",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_DOWN",
        "note": "press the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_UP",
        "note": "release the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "HOTKEY",
        "note": "press the specified key combination",
        "parameters": {
            "keys": {
                "type": list,
                "range": [KEYBOARD_KEYS],
                "optional": False,
            }
        }
    },
    ############################################################################################################
    {
        "action_type": "WAIT",
        "note": "wait until the next action",
    },
    {
        "action_type": "FAIL",
        "note": "decide the task can not be performed",
    },
    {
        "action_type": "DONE",
        "note": "decide the task is done",
    }
 ]
 Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
 - For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
 for example, format as:
 ```
@@ -48,7 +236,9 @@ for example, format as:
 }
 ```
-For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
+REMEMBER:
-You can predict multiple actions at one step, but you should only return one action for each step.
+For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. 
 You MUST wrap the dict with backticks (\`).
 You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
 You CAN predict multiple actions at one step, but you should only return one action for each step.
 """
--- a/mm_agents/gpt_4v_prompt_code.py
+++ b/mm_agents/gpt_4v_prompt_code.py
@@ -4,5 +4,8 @@ For each step, you will get an observation of an image, which is the screenshot
 You are required to use `pyautogui` to perform the action. 
 Return one line or multiple lines of python code to perform the action each time, be time efficient.
-Return `None` if you cannot perform the action.
+
 When you think you have to wait for some time, return `WAIT`.
 When you think the task can not be done, return `FAIL`.
 When you think the task is done, return `DONE`.
 """
--- a/requirements.txt
+++ b/requirements.txt
@@ -29,3 +29,4 @@ opencv-python
 ImageHash
 scikit-image
 librosa
 pymupdf