Merge branch 'main' into zdy

David Chang
2024-03-15 12:12:40 +08:00
189 changed files with 4679 additions and 9135 deletions


@@ -1,5 +1,8 @@
# DesktopEnv: An Environment towards Human-like Computer Task Mastery
# OSWorld: Open-Ended Tasks in Real Computer Environments
<p align="center">
<img src="desktop_env/assets/icon.jpg" alt="Logo" width="80px">
<br>
<b>SLOGAN</b>
</p>
@@ -8,7 +11,7 @@
<a href="">Paper</a>
</p>
![Overview](media/overview.png)
![Overview]()
## Updates
- 2024-03-01: We released our [paper](), [environment code](), [dataset](), and [project page](). Check it out!

BIN desktop_env/assets/icon.jpg (new binary file, 8.7 KiB; contents not shown)


@@ -53,8 +53,8 @@ class DesktopEnv(gym.Env):
def __init__(
self,
path_to_vm: str,
snapshot_name: str = "init_state",
action_space: str = "computer_13",
task_config: Dict[str, Any] = None,
tmp_dir: str = "tmp",
cache_dir: str = "cache",
screen_size: Tuple[int] = (1920, 1080),
@@ -64,15 +64,6 @@ class DesktopEnv(gym.Env):
Args:
path_to_vm (str): path to .vmx file
action_space (str): "computer_13" | "pyautogui"
task_config (Dict[str, Any]): the integrated task configuration,
including
* base snapshot
* task id (uuid)
* instruction
* setup config
* evaluator config
tmp_dir (str): temporary directory for trajectory artifacts such as
the extracted screenshots
cache_dir (str): cache directory for task-related artifacts such as
@@ -81,23 +72,20 @@ class DesktopEnv(gym.Env):
# Initialize environment variables
self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm)))
self.snapshot_name = snapshot_name
self.tmp_dir_base: str = tmp_dir
self.cache_dir_base: str = cache_dir
self.vm_screen_size = screen_size
self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM
self.headless = headless
os.makedirs(self.tmp_dir_base, exist_ok=True)
# task-aware state
# todo: handle the logic of the snapshot directory
self._set_task_info(task_config)
# Initialize emulator and controller
logger.info("Initializing...")
self._start_emulator()
self.vm_ip = self._get_vm_ip()
self.controller = PythonController(vm_ip=self.vm_ip)
self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir)
self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir_base)
# Meta info of the VM, move to the reset() function
self.vm_platform: str = "" # self.controller.get_vm_platform()
@@ -147,7 +135,7 @@ class DesktopEnv(gym.Env):
raise Exception("Failed to get VM IP address!")
def _save_state(self):
_execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
_execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_name])
def _get_screenshot(self):
# random_uuid = str(uuid.uuid4())
@@ -167,7 +155,6 @@ class DesktopEnv(gym.Env):
return screenshot_image_path
def _set_task_info(self, task_config: Dict[str, Any]):
self.snapshot_path = task_config["snapshot"]
self.task_id: str = task_config["id"]
self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
os.makedirs(self.cache_dir, exist_ok=True)
@@ -239,8 +226,8 @@ class DesktopEnv(gym.Env):
)
os.makedirs(os.path.join(self.tmp_dir, "screenshots"))
logger.info("Reverting to snapshot to {}...".format(self.snapshot_path))
_execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
logger.info("Reverting to snapshot to {}...".format(self.snapshot_name))
_execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_name])
time.sleep(5)
print(self.vm_screen_size)


@@ -24,14 +24,16 @@ from .chrome import (
get_gotoRecreationPage_and_get_html_content,
get_url_dashPart,
get_active_url_from_accessTree,
get_find_installed_extension_name,
get_info_from_website
)
from .file import get_cloud_file, get_vm_file, get_cache_file, get_content_from_vm_file
from .general import get_vm_command_line, get_vm_terminal_output
from .general import get_vm_command_line, get_vm_terminal_output, get_vm_command_error
from .gimp import get_gimp_config_file
from .impress import get_audio_in_slide
from .impress import get_audio_in_slide, get_background_image_in_slide
from .info import get_vm_screen_size, get_vm_window_size, get_vm_wallpaper, get_list_directory
from .misc import get_rule, get_accessibility_tree, get_rule_relativeTime
from .misc import get_rule, get_accessibility_tree, get_rule_relativeTime, get_time_diff_range
from .replay import get_replay
from .vlc import get_vlc_playing_info, get_vlc_config, get_default_video_player
from .vscode import get_vscode_config
from .calc import get_conference_city_in_order


@@ -0,0 +1,15 @@
import csv
# Read a CSV file and return all entries of the third column, in row order.
def get_conference_city_in_order(env, config):
# read the csv file
csv_path = config['csv_path']
print(f"Reading csv file from {csv_path}")
with open(csv_path, 'r') as f:
reader = csv.reader(f)
# skip the header row
next(reader)
# get the third column in the order of rows
conference_city_list = [row[2] for row in reader]
return conference_city_list
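A minimal usage sketch for this getter (the file name and rows are hypothetical; the env argument is unused, so None suffices):

```python
import csv

# Build a throwaway CSV whose third column holds the cities.
with open("conferences.csv", "w", newline="") as f:
    csv.writer(f).writerows([
        ["name", "date", "city"],              # header row, skipped by the getter
        ["ICLR", "2024-05-07", "Vienna"],
        ["NeurIPS", "2024-12-10", "Vancouver"],
    ])

print(get_conference_city_in_order(None, {"csv_path": "conferences.csv"}))
# -> ['Vienna', 'Vancouver']
```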


@@ -4,6 +4,7 @@ import os
import platform
import sqlite3
import time
from urllib.parse import unquote
from typing import Dict, Any, List
from urllib.parse import urlparse, parse_qs
@@ -81,34 +82,28 @@ def get_info_from_website(env, config: Dict[Any, Any]) -> Any:
page.wait_for_load_state('load')
action = info_dict.get('action', 'inner_text')
if action == "inner_text":
ele = page.locator(info_dict['selector'])
expect(ele).to_be_visible()
ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
infos.append(ele.inner_text())
elif action == "attribute":
ele = page.locator(info_dict['selector'])
expect(ele).to_be_visible()
ele = page.wait_for_selector(info_dict['selector'], state='attached', timeout=10000)
infos.append(ele.get_attribute(info_dict['attribute']))
elif action == 'click_and_inner_text':
for idx, sel in enumerate(info_dict['selector']):
if idx != len(info_dict['selector']) - 1:
link = page.locator(sel)
expect(link).to_be_visible()
link = page.wait_for_selector(sel, state='attached', timeout=10000)
link.click()
page.wait_for_load_state('load')
else:
ele = page.locator(sel)
expect(ele).to_be_visible()
ele = page.wait_for_selector(sel, state='attached', timeout=10000)
infos.append(ele.inner_text())
elif action == 'click_and_attribute':
for idx, sel in enumerate(info_dict['selector']):
if idx != len(info_dict['selector']) - 1:
link = page.locator(sel)
expect(link).to_be_visible()
link = page.wait_for_selector(sel, state='attached', timeout=10000)
link.click()
page.wait_for_load_state('load')
else:
ele = page.locator(sel)
expect(ele).to_be_visible()
ele = page.wait_for_selector(sel, state='attached')
infos.append(ele.get_attribute(info_dict['attribute']))
else:
raise NotImplementedError(f'The action {action} is not supported yet.')
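For reference, a hedged sketch of one entry this dispatch consumes; only the 'action', 'selector', and 'attribute' keys appear in the visible code, and the selector values are illustrative:

```python
info_dict = {
    # every selector except the last is clicked in order;
    # the last one's inner text is collected
    "action": "click_and_inner_text",
    "selector": ["a.year-2024", "div.summary"],
}
```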
@@ -589,6 +584,10 @@ def get_active_url_from_accessTree(env, config):
if len(elements) == 0:
print("no elements found")
return None
elif elements[-1].text is None:
print("no text found")
return None
active_tab_url = config["goto_prefix"] + elements[0].text if "goto_prefix" in config.keys() else "https://" + \
elements[0].text
print("active tab url now: {}".format(active_tab_url))
@@ -1006,6 +1005,43 @@ def get_find_unpacked_extension_path(env, config: Dict[str, str]):
return "Google"
def get_find_installed_extension_name(env, config: Dict[str, str]):
os_type = env.vm_platform
if os_type == 'Windows':
preference_file_path = env.controller.execute_python_command("""import os; print(os.path.join(os.getenv('LOCALAPPDATA'),
'Google\\Chrome\\User Data\\Default\\Preferences'))""")['output'].strip()
elif os_type == 'Darwin':
preference_file_path = env.controller.execute_python_command(
"import os; print(os.path.join(os.getenv('HOME'), 'Library/Application Support/Google/Chrome/Default/Preferences'))")[
'output'].strip()
elif os_type == 'Linux':
if "arm" in platform.machine():
preference_file_path = env.controller.execute_python_command(
"import os; print(os.path.join(os.getenv('HOME'), 'snap/chromium/common/chromium/Default/Preferences'))")[
'output'].strip()
else:
preference_file_path = env.controller.execute_python_command(
"import os; print(os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences'))")[
'output'].strip()
else:
raise Exception('Unsupported operating system')
try:
content = env.controller.get_file(preference_file_path)
data = json.loads(content)
# The Preferences file stores the settings of every installed extension; return all manifest names and let the metric look for one that matches the target extension
all_extensions_name = []
all_extensions = data.get('extensions', {}).get('settings', {})
for id in all_extensions.keys():
name = all_extensions[id]["manifest"]["name"]
all_extensions_name.append(name)
return all_extensions_name
except Exception as e:
logger.error(f"Error: {e}")
return "Google"
def get_data_delete_automacally(env, config: Dict[str, str]):
"""
This function checks whether the "auto-delete" mode of Chromium is enabled
@@ -1033,8 +1069,8 @@ def get_data_delete_automacally(env, config: Dict[str, str]):
try:
content = env.controller.get_file(preference_file_path)
data = json.loads(content)
data_delete_state = data["profile"]["exit_type"]
return data_delete_state
data_delete_state = data["profile"].get("default_content_setting_values", None)
return "true" if data_delete_state is not None else "false"
except Exception as e:
logger.error(f"Error: {e}")
return "Google"
@@ -1073,6 +1109,7 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]):
"""
active_tab_url = get_active_url_from_accessTree(env, config)
if not isinstance(active_tab_url, str):
logger.error("active_tab_url is not a string")
return None
host = env.vm_ip
port = 9222 # fixme: this port is hard-coded; it should come from the config file
@@ -1105,12 +1142,14 @@ def get_active_tab_html_parse(env, config: Dict[str, Any]):
for context in browser.contexts:
for page in context.pages:
page.wait_for_load_state("networkidle")
if page.url == active_tab_url:
# the accessibility tree and Playwright may return percent-encoded characters, so decode both URLs before comparing
if unquote(page.url) == unquote(active_tab_url):
target_page = page
print("tartget page url: ", target_page.url)
print("tartget page title: ", target_page.title())
print("\33[32mtartget page url: ", target_page.url, "\33[0m")
print("\33[32mtartget page title: ", target_page.title(), "\33[0m")
break
if target_page is None:
logger.error("Your tab is not the target tab.")
return {}
return_json = {}
if config["category"] == "class":


@@ -1,6 +1,7 @@
import os
from typing import Dict, List, Set
from typing import Optional, Any, Union
from datetime import datetime
import requests
import pandas as pd
@@ -77,21 +78,31 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
gives (List[int]): optional. defaults to [0]. which files are directly
returned to the metric. if len==1, str is returned; else, list is
returned.
The following options are only supported for a single file for now:
time_suffix(bool): optional. defaults to False. if True, append the current time in the required format.
time_format(str): optional. defaults to "%Y_%m_%d". format of the time suffix.
"""
time_format = "%Y_%m_%d"
if not config.get("multi", False):
paths: List[str] = [config["path"]]
dests: List[str] = [config["dest"]]
if "time_suffix" in config.keys() and config["time_suffix"]:
if "time_format" in config.keys():
time_format = config["time_format"]
# Insert the time suffix before the first dot of the file name (assumes a single-dot extension)
paths = [p.split(".")[0] + datetime.now().strftime(time_format) + "." + p.split(".")[1] if "." in p else p for p in paths]
dests = [d.split(".")[0] + datetime.now().strftime(time_format) + "." + d.split(".")[1] if "." in d else d for d in dests]
else:
paths: List[str] = config["path"]
dests: List[str] = config["dest"]
cache_paths: List[str] = []
gives: Set[int] = set(config.get("gives", [0]))
for i, (p, d) in enumerate(zip(paths, dests)):
_path = os.path.join(env.cache_dir, d)
file = env.controller.get_file(p)
if file is None:
#return None
@@ -104,7 +115,6 @@ def get_vm_file(env, config: Dict[str, Any]) -> Union[Optional[str], List[Option
cache_paths.append(_path)
with open(_path, "wb") as f:
f.write(file)
return cache_paths[0] if len(cache_paths)==1 else cache_paths
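A hedged example of the single-file config this getter accepts (paths are hypothetical); note the suffix insertion splits at the first dot, so it assumes a single-dot file name:

```python
config = {
    "path": "/home/user/report.xlsx",   # file inside the VM
    "dest": "report.xlsx",              # name under env.cache_dir on the host
    "time_suffix": True,
    "time_format": "_%Y_%m_%d",
}
# "report.xlsx" is rewritten to e.g. "report_2024_03_15.xlsx" before the
# fetch; with "multi": True, "path" and "dest" become lists instead.
```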


@@ -21,6 +21,22 @@ def get_vm_command_line(env, config: Dict[str, str]):
logger.error("Failed to get vm command line. Status code: %d", response.status_code)
return None
def get_vm_command_error(env, config: Dict[str, str]):
vm_ip = env.vm_ip
port = 5000
command = config["command"]
shell = config.get("shell", False)
response = requests.post(f"http://{vm_ip}:{port}/execute", json={"command": command, "shell": shell})
print(response.json())
if response.status_code == 200:
return response.json()["error"]
else:
logger.error("Failed to get vm command line error. Status code: %d", response.status_code)
return None
def get_vm_terminal_output(env, config: Dict[str, str]):
return env.controller.get_terminal_output()
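A hedged usage sketch for get_vm_command_error: it posts to the helper server on port 5000 inside the VM and returns the response's "error" field; whether "command" is an argv list or a shell string depends on that server, so both the command and the flag here are illustrative:

```python
config = {
    "command": ["python3", "missing_script.py"],  # argv form, assuming shell=False
    "shell": False,
}
stderr_text = get_vm_command_error(env, config)   # env is provided by the harness
```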


@@ -7,6 +7,67 @@ from typing import Dict
from desktop_env.evaluators.getters.file import get_vm_file
def get_background_image_in_slide(env, config: Dict[str, str]):
ppt_file_path, slide_index, dest = config["ppt_file_path"], int(config["slide_index"]), config["dest"]
image_id, image_file_path = None, None
ppt_file_localhost_path = get_vm_file(env, {"path": ppt_file_path, "dest": os.path.split(ppt_file_path)[-1]})
with zipfile.ZipFile(ppt_file_localhost_path, 'r') as myzip:
slide1_xml_file = 'ppt/slides/slide{}.xml'.format(slide_index + 1)
# First, check whether a background image is used in the slide
if slide1_xml_file not in myzip.namelist(): return None
with myzip.open(slide1_xml_file) as f:
# Parse the XML tree from the relationships file
tree = ET.parse(f)
root = tree.getroot()
bg_tag = "{http://schemas.openxmlformats.org/presentationml/2006/main}bgPr"
image_tag = "{http://schemas.openxmlformats.org/drawingml/2006/main}blip"
attr_tag = "{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed"
for child in root.iter(bg_tag):
try:
for element in child.iter(image_tag):
image_id = element.attrib[attr_tag]
break
except: pass
if image_id is not None: break
else: return None
# next, extract the background image from the slide
slide1_rels_file = 'ppt/slides/_rels/slide{}.xml.rels'.format(slide_index + 1)
if slide1_rels_file in myzip.namelist():
with myzip.open(slide1_rels_file) as f:
# Parse the XML tree from the relationships file
tree = ET.parse(f)
root = tree.getroot()
# Define the namespace used in the relationships file
namespaces = {'r': 'http://schemas.openxmlformats.org/package/2006/relationships'}
# Look for all relationship elements that have a type attribute for image
for rel in root.findall('r:Relationship', namespaces):
# Check if the relationship is for an image file
if 'image' in rel.attrib['Type'] and rel.attrib['Id'] == image_id:
target = rel.attrib['Target']
if target.startswith('..'):
# Resolve the relative path to get the correct path within the zip file
image_file_path = os.path.normpath(os.path.join('ppt/slides', target))
# Replace backslashes with forward slashes for ZIP compatibility
image_file_path = image_file_path.replace('\\', '/')
tmpdirname = os.path.dirname(ppt_file_localhost_path)
myzip.extract(image_file_path, tmpdirname)
image_file_path = os.path.join(tmpdirname, image_file_path)
return image_file_path
else: # absolute path
assert target.startswith("file://"), target
image_file_path = target[7:]
break
if image_file_path is None:
return None
else:
# Get the image file from the VM and return its path on the host
return get_vm_file(env, {"path": image_file_path, "dest": dest})
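A minimal config sketch for this getter (paths are hypothetical); slide_index is zero-based, since the XML entry is named slide{slide_index + 1}.xml:

```python
config = {
    "ppt_file_path": "/home/user/deck.pptx",   # presentation inside the VM
    "slide_index": 0,                          # first slide -> ppt/slides/slide1.xml
    "dest": "slide1_background.png",           # cache name on the host
}
host_image_path = get_background_image_in_slide(env, config)
```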
def get_audio_in_slide(env, config: Dict[str, str]):
ppt_file_path, slide_index, dest = config["ppt_file_path"], int(config["slide_index"]), config["dest"]


@@ -195,3 +195,10 @@ def get_accessibility_tree(env, *args) -> str:
accessibility_tree: str = env.controller.get_accessibility_tree()
logger.debug("AT@eval: %s", accessibility_tree)
return accessibility_tree
def get_time_diff_range(env, config) -> str:
try:
return config["diff_range_in_minutes"]
except:
logger.error("diff_range_in_minutes not found in config.")
return None


@@ -2,7 +2,8 @@ from .basic_os import (
check_gnome_favorite_apps,
is_utc_0,
check_text_enlarged,
check_moved_jpgs
check_moved_jpgs,
is_in_vm_clickboard
)
from .chrome import (
is_expected_tabs,
@@ -19,6 +20,7 @@ from .chrome import (
is_expected_active_tab,
is_expected_url_pattern_match,
is_added_to_steam_cart,
is_expected_installed_extensions,
compare_pdf_images
)
from .docs import (
@@ -47,8 +49,10 @@ from .docs import (
check_file_exists,
check_tabstops,
compare_contains_image,
compare_docx_files_and_ignore_new_lines,
compare_docx_images,
compare_image_text
compare_image_text,
compare_references
)
from .general import (
check_csv,
@@ -61,6 +65,14 @@ from .general import (
fuzzy_match,
check_include_exclude,
check_direct_json_object,
compare_time_in_speedtest_results,
is_included_all_json_objects,
is_gold_text_included_in_pdf,
check_line_number,
file_contains,
compare_terminal_and_txt,
fuzzy_place_math,
compare_python_pure_text,
diff_text_file,
literal_match
)
@@ -68,7 +80,7 @@ from .gimp import (
check_brightness_decrease_and_structure_sim,
check_contrast_increase_and_structure_sim,
check_saturation_increase_and_structure_sim,
check_image_size_and_structure_sim,
check_image_size,
check_image_mirror,
check_palette_and_structure_sim,
check_textbox_on_leftside,
@@ -81,7 +93,9 @@ from .gimp import (
increase_saturation,
decrease_brightness,
check_file_exists,
compare_triangle_positions
compare_triangle_positions,
check_sharper,
check_image_file_size
)
from .libreoffice import check_libre_locale
from .pdf import check_pdf_pages
@@ -100,7 +114,8 @@ from .slides import (
)
from .table import (
compare_table,
compare_csv
compare_csv,
compare_conference_city_in_order
)
from .thunderbird import (
check_thunderbird_prefs,
@@ -125,11 +140,13 @@ from .vscode import (
compare_text_file,
compare_config,
compare_answer,
compare_result_files,
is_extension_installed,
check_json_settings,
check_json_keybindings,
check_python_file_by_test_suite,
check_python_file_by_gold_file,
check_html_background_image,
compare_zip_files
)
from .others import compare_epub, check_mp3_meta


@@ -1,6 +1,3 @@
import subprocess
def check_gnome_favorite_apps(apps_str: str, rule):
# parse the string like "['thunderbird.desktop', 'vim.desktop', 'google-chrome.desktop']"
# to a list of strings
@@ -56,3 +53,16 @@ def check_moved_jpgs(directory_list, rule):
return 1
else:
return 0
def is_in_vm_clickboard(config, terminal_output):
print("terminal_output: ")
print(terminal_output)
print("config: ")
print(config)
expected_results = config["expected"]
# check if terminal_output has expected results
if not isinstance(expected_results, list):
return 1 if expected_results in terminal_output else 0
else:
return 1 if all(result in terminal_output for result in expected_results) else 0
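A small sketch of the two config shapes this metric accepts; the strings are illustrative:

```python
# Single expected string: substring check against the terminal output.
is_in_vm_clickboard({"expected": "hello"}, "hello world")            # -> 1
# List form: every element must appear in the terminal output.
is_in_vm_clickboard({"expected": ["foo", "bar"]}, "foo ... baz")     # -> 0
```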


@@ -2,9 +2,9 @@ import logging
import os
import re
import shutil
from itertools import product
from typing import Any, Dict, List, Union
import fitz # PyMuPDF
import rapidfuzz.fuzz as fuzz
from bs4 import BeautifulSoup, Tag
@@ -61,6 +61,21 @@ def is_expected_url_pattern_match(result, rules) -> float:
return 1.
def is_expected_installed_extensions(installed_extensions, expected) -> float:
print("installed_extensions: ")
print(installed_extensions)
expected_extensions = expected["expected"]
# whether the expected extensions are installed
set_expected_extensions = set(expected_extensions)
set_installed_extensions = set(installed_extensions)
if set_expected_extensions.issubset(set_installed_extensions):
return 1.
else:
return 0.
def is_expected_tabs(open_tabs: List[Dict[str, str]], rule: Dict[str, Any]) -> float:
"""
Checks if the expected tabs are open in Chrome.
@@ -94,12 +109,24 @@ def is_expected_bookmarks(bookmarks: List[str], rule: Dict[str, Any]) -> float:
elif rule['type'] == "liked_authors_websites_urls":
# Check if "liked authors" folder exists
liked_authors_folder = next((bookmark for bookmark in bookmarks['bookmark_bar']['children'] if
bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None)
bookmark['type'] == 'folder' and bookmark['name'] == 'Liked Authors'), None)
if liked_authors_folder:
# Check if it contains the specified URLs
liked_authors_urls = [bookmark['url'] for bookmark in liked_authors_folder['children'] if
bookmark['type'] == 'url']
return 1. if set(liked_authors_urls) == set(rule['urls']) else 0.
urls = rule['urls']
for idx, url in enumerate(urls):
if isinstance(url, str):
urls[idx] = [url]
combinations = product(*urls)
for combination in combinations:
if set(combination) == set(liked_authors_urls):
return 1.
return 0.
else:
return 0.
else:
@@ -140,37 +167,54 @@ def compare_pdfs(pdf1_path: Union[str, List[str]], pdf2_path: Union[str, List[st
logger.info(f"[ERROR]: unexpected error occurred when comparing PDF files: {e}")
return score / len(pdf2_path)
import fitz
from PIL import Image
from io import BytesIO
from borb.pdf import Document
from borb.pdf import PDF
from pathlib import Path
import typing
def compare_pdf_images(pdf1_path: str, pdf2_path: str, **kwargs) -> float:
if not pdf1_path or not pdf2_path:
return 0.
def extract_images_from_pdf(pdf_path):
pdf_document = fitz.open(pdf_path)
images = []
for page_number in range(pdf_document.page_count):
page = pdf_document[page_number]
image_list = page.get_images(full=True)
pixmap = page.get_pixmap()
for img_index, img_info in enumerate(image_list):
base_image = pdf_document.extract_image(img_index)
image_bytes = base_image["image"]
img = Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)
images.append(BytesIO(image_bytes))
images.append(img)
return images
def fix_pdf(in_path: Path, out_path: Path) -> None:
doc: typing.Optional[Document] = None
with open(in_path, "rb") as fh:
doc = PDF.loads(fh)
with open(out_path, "wb") as fh:
PDF.dumps(fh, doc)
fix_pdf(Path(pdf1_path), Path(pdf1_path))
fix_pdf(Path(pdf2_path), Path(pdf2_path))
images1 = extract_images_from_pdf(pdf1_path)
images2 = extract_images_from_pdf(pdf2_path)
if len(images1) != len(images2):
return 0.
for i, (img1, img2) in enumerate(zip(images1, images2), 1):
if Image.open(img1).tobytes() != Image.open(img2).tobytes():
for img1, img2 in zip(images1, images2):
if img1.tobytes() != img2.tobytes():
return 0.
return 1.
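The borb load/dump round-trip in fix_pdf rewrites each PDF in place, presumably to repair or normalize files that PyMuPDF would otherwise choke on; after that, every page is rasterized and compared byte-for-byte. A hedged usage sketch with hypothetical paths:

```python
score = compare_pdf_images("pred.pdf", "gold.pdf")
# 1.0 only when both PDFs yield the same number of rendered images and each
# pair matches exactly; any count or pixel mismatch returns 0.0.
```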
@@ -178,7 +222,10 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
"""
Compare two archives. Note that the files in the archives should be of the same type.
"""
if not pred_path: return 0.
file_path = kwargs.pop('file_path', '')
if not pred_path:
return 0.
pred_folder = os.path.splitext(pred_path)[0] + '_pred'
gold_folder = os.path.splitext(gold_path)[0] + '_gold'
@@ -186,13 +233,16 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
shutil.rmtree(pred_folder, ignore_errors=True)
os.makedirs(pred_folder)
shutil.unpack_archive(pred_path, pred_folder)
if not os.path.exists(gold_folder): # use cache if exists
os.makedirs(gold_folder)
shutil.unpack_archive(gold_path, gold_folder)
pred_files = sorted(os.listdir(pred_folder))
gold_files = sorted(os.listdir(gold_folder))
if pred_files != gold_files: return 0.
pred_files = sorted(os.listdir(os.path.join(pred_folder, file_path)))
gold_files = sorted(os.listdir(os.path.join(gold_folder, file_path)))
if pred_files != gold_files:
return 0.
def get_compare_function():
file_type = kwargs.pop('file_type', 'text')
@@ -228,8 +278,8 @@ def compare_archive(pred_path: str, gold_path: str, **kwargs) -> float:
score = 0
compare_function = get_compare_function()
for f1, f2 in zip(pred_files, gold_files):
fp1 = os.path.join(pred_folder, f1)
fp2 = os.path.join(gold_folder, f2)
fp1 = os.path.join(pred_folder, file_path, f1)
fp2 = os.path.join(gold_folder, file_path, f2)
score += compare_function(fp1, fp2, **kwargs)
return score / len(pred_files)


@@ -3,17 +3,19 @@ import os
import re
import xml.etree.ElementTree as ET
import zipfile
from io import BytesIO
from typing import List, Dict, Any
from PIL import Image
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT, WD_TAB_ALIGNMENT
from docx.shared import RGBColor
from odf.opendocument import load
from odf.text import P
from odf.text import Span
from rapidfuzz import fuzz
from skimage.color import deltaE_ciede2000
from skimage.color import rgb2lab
from rapidfuzz import fuzz
logger = logging.getLogger("desktopenv.metric.docs")
@@ -23,6 +25,9 @@ def find_default_font(config_file_path, rules):
default_font = None
expected_font = rules["font_name"]
if not config_file_path:
return 0
try:
tree = ET.parse(config_file_path)
root = tree.getroot()
@@ -42,7 +47,14 @@ def find_default_font(config_file_path, rules):
def contains_page_break(docx_file):
doc = Document(docx_file)
if not docx_file:
return 0
try:
doc = Document(docx_file)
except Exception as e:
logger.error(f"Error: {e}")
return 0
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
@@ -62,6 +74,9 @@ def compare_docx_files(file1, file2, **options):
ignore_order = options.get('ignore_order', False)
content_only = options.get('content_only', False)
if not file1 or not file2:
return 0
def get_paragraph_texts_odt(document):
paragraphs = document.getElementsByType(P)
paragraph_texts = []
@@ -80,16 +95,24 @@ def compare_docx_files(file1, file2, **options):
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
doc1 = Document(file1)
doc2 = Document(file2)
try:
doc1 = Document(file1)
doc2 = Document(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
if ignore_order:
doc1_paragraphs = sorted(doc1_paragraphs)
doc2_paragraphs = sorted(doc2_paragraphs)
elif file1.endswith('.odt') and file2.endswith('.odt'):
doc1 = load(file1)
doc2 = load(file2)
try:
doc1 = load(file1)
doc2 = load(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
doc1_paragraphs = get_paragraph_texts_odt(doc1)
doc2_paragraphs = get_paragraph_texts_odt(doc2)
if ignore_order:
@@ -118,22 +141,36 @@ def compare_docx_files(file1, file2, **options):
if text1 != text2:
return 0
else:
print("ignore_blanks=false")
if len(doc1_paragraphs) != len(doc2_paragraphs):
print(doc1_paragraphs)
print(doc2_paragraphs)
print(len(doc1_paragraphs))
print(len(doc2_paragraphs))
return 0
print("in compare")
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if ignore_case:
p1, p2 = p1.lower(), p2.lower()
if p1 != p2:
print(p1)
print(p2)
return 0
return 1
def compare_init_lines(file1, file2):
doc1 = Document(file1)
doc2 = Document(file2)
if not file1 or not file2:
return 0
try:
doc1 = Document(file1)
doc2 = Document(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
@@ -149,8 +186,15 @@ def compare_init_lines(file1, file2):
def compare_docx_tables(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if not docx_file1 or not docx_file2:
return 0
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# get list of tables in docx
tables1 = doc1.tables
@@ -173,12 +217,17 @@ def compare_docx_tables(docx_file1, docx_file2):
return 1
from io import BytesIO
from PIL import Image
def compare_docx_images(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if not docx_file1 or not docx_file2:
return 0
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
def extract_images(doc):
images = []
@@ -187,7 +236,7 @@ def compare_docx_images(docx_file1, docx_file2):
img_data = rel.target_part.blob
images.append(BytesIO(img_data))
return images
images1 = extract_images(doc1)
images2 = extract_images(doc2)
if len(images1) != len(images2):
@@ -197,21 +246,31 @@ def compare_docx_images(docx_file1, docx_file2):
return 0
return 1
import pytesseract
import easyocr
def compare_image_text(image_path, rule):
img = Image.open(image_path)
img_text = pytesseract.image_to_string(img)
reader = easyocr.Reader(['en'])
result = reader.readtext(image_path)
extracted_text = ' '.join([entry[1] for entry in result])
if rule['type'] == 'text':
return 1 if rule['text'] in img_text else 0
return 1 if rule['text'] in extracted_text else 0
else:
raise ValueError("Unsupported rule type")
def compare_line_spacing(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
if not compare_docx_files(docx_file1, docx_file2):
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
if len(doc1.paragraphs) != len(doc2.paragraphs):
return 0
@@ -229,11 +288,18 @@ def compare_line_spacing(docx_file1, docx_file2):
def compare_insert_equation(docx_file1, docx_file2):
if not docx_file1 or not docx_file2:
return 0
if not compare_docx_files(docx_file1, docx_file2):
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# Compare each paragraph if it contains equation
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
@@ -244,7 +310,15 @@ def compare_insert_equation(docx_file1, docx_file2):
def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
doc = Document(docx_file)
if not docx_file:
return 0
try:
doc = Document(docx_file)
except Exception as e:
logger.error(f"Error: {e}")
return 0
expected_font = rules["font_name"]
for paragraph in doc.paragraphs:
@@ -256,8 +330,15 @@ def compare_font_names(docx_file, rules: List[Dict[str, Any]]):
def compare_subscript_contains(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if not docx_file1 or not docx_file2:
return 0
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
@@ -268,7 +349,14 @@ def compare_subscript_contains(docx_file1, docx_file2):
def has_page_numbers_in_footers(docx_file):
doc = Document(docx_file)
if not docx_file:
return 0
try:
doc = Document(docx_file)
except Exception as e:
logger.error(f"Error: {e}")
return 0
for section in doc.sections:
footer = section.footer
@@ -282,7 +370,15 @@ def has_page_numbers_in_footers(docx_file):
def is_first_line_centered(docx_file):
doc = Document(docx_file)
if not docx_file:
return 0
try:
doc = Document(docx_file)
except Exception as e:
logger.error(f"Error: {e}")
return 0
first_paragraph = doc.paragraphs[0]
# check if the first line is center justified
@@ -290,13 +386,23 @@ def is_first_line_centered(docx_file):
def check_file_exists(directory, filename):
if not directory or not filename:
return 0
file_path = os.path.join(directory, filename)
return 1 if os.path.isfile(file_path) else 0
def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
doc1: Document = Document(docx_file1)
doc2: Document = Document(docx_file2)
if not docx_file1 or not docx_file2:
return .0
try:
doc1: Document = Document(docx_file1)
doc2: Document = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return .0
para1 = [p for p in doc1.paragraphs if p.text.strip()]
para2 = [p for p in doc2.paragraphs if p.text.strip()]
if len(para1) != len(para2): return .0
@@ -313,7 +419,7 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
section = doc2.sections[0]
paragraph_width = section.page_width - section.left_margin - section.right_margin
ignore_tabs = lambda x: x.alignment == WD_TAB_ALIGNMENT.CLEAR or (
x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0)
x.alignment == WD_TAB_ALIGNMENT.LEFT and x.position == 0)
minus = .0
for p1, p2 in zip(para1, para2):
# filter CLEAR tabstop and default left-0 tabstop
@@ -330,8 +436,15 @@ def check_tabstops(docx_file1, docx_file2, **kwargs) -> float:
def compare_contains_image(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if not docx_file1 or not docx_file2:
return 0
try:
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
@@ -342,9 +455,18 @@ def compare_contains_image(docx_file1, docx_file2):
def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
document = Document(file_path1)
try:
document = Document(file_path1)
except Exception as e:
logger.error(f"Error: {e}")
return 0
threshold = kwargs.get('threshold', 3.5)
def _calculate_color_difference(rgb1, rgb2):
@@ -376,6 +498,9 @@ def evaluate_colored_words_in_tables(file_path1, file_path2, **kwargs):
def check_highlighted_words(file_path1, file_path2):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
@@ -398,9 +523,17 @@ def check_highlighted_words(file_path1, file_path2):
def evaluate_strike_through_last_paragraph(file_path1, file_path2):
if not file_path1 or not file_path2:
return 0
if not compare_docx_files(file_path1, file_path2):
return 0
document = Document(file_path1)
try:
document = Document(file_path1)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# Get the last paragraph
last_paragraph = document.paragraphs[-1]
@@ -414,7 +547,14 @@ def evaluate_strike_through_last_paragraph(file_path1, file_path2):
def evaluate_conversion(file_path):
document = Document(file_path)
if not file_path:
return 0
try:
document = Document(file_path)
except Exception as e:
logger.error(f"Error: {e}")
return 0
for table in document.tables:
for row in table.rows:
@@ -433,7 +573,14 @@ def evaluate_conversion(file_path):
def evaluate_spacing(file_path):
document = Document(file_path)
if not file_path:
return 0
try:
document = Document(file_path)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# Check line spacing for introduction, body, and conclusion
introduction_spacing = document.paragraphs[0].paragraph_format.line_spacing
@@ -446,9 +593,18 @@ def evaluate_spacing(file_path):
def check_italic_font_size_14(path1, path2):
if not path1 or not path2:
return 0
if not compare_docx_files(path1, path2):
return 0
document = Document(path1)
try:
document = Document(path1)
except Exception as e:
logger.error(f"Error: {e}")
return 0
for paragraph in document.paragraphs:
for run in paragraph.runs:
if run.italic:
@@ -459,8 +615,15 @@ def check_italic_font_size_14(path1, path2):
def evaluate_alignment(docx_path):
if not docx_path:
return 0
# Load the document
doc = Document(docx_path)
try:
doc = Document(docx_path)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# Iterate through each paragraph in the document
for para in doc.paragraphs:
@@ -488,7 +651,15 @@ def evaluate_alignment(docx_path):
def get_unique_train_ids(initial_file): # fixed standard
doc = Document(initial_file)
if not initial_file:
return set(), 0
try:
doc = Document(initial_file)
except Exception as e:
logger.error(f"Error: {e}")
return set(), 0
train_ids = set()
processed_lines = 0
@@ -504,9 +675,18 @@ def get_unique_train_ids(initial_file): # fixed standard
def check_no_duplicates(initial_file, processed_file):
if not initial_file or not processed_file:
return 0
# Open the document
train_ids_ini, ini_lines = get_unique_train_ids(initial_file)
doc_processed = Document(processed_file)
try:
doc_processed = Document(processed_file)
except Exception as e:
logger.error(f"Error: {e}")
return 0
train_ids_pro = set()
processed_lines = 0 # Counter for valid lines processed
@@ -531,11 +711,18 @@ def check_no_duplicates(initial_file, processed_file):
def compare_docx_lines(file1, file2):
# Read the text of the document, line by line
doc1 = Document(file1)
doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
if not file1 or not file2:
return 0
doc2 = Document(file2)
# Read the text of the document, line by line
try:
doc1 = Document(file1)
doc2 = Document(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
doc1_lines = [p.text.strip() for p in doc1.paragraphs if p.text.strip()]
doc2_lines = [p.text.strip() for p in doc2.paragraphs if p.text.strip()]
# print(doc1_lines)
# print(doc2_lines)
@@ -547,8 +734,52 @@ def compare_docx_lines(file1, file2):
return 0
def compare_docx_files_and_ignore_new_lines(file1, file2, **options):
ignore_blanks = options.get('ignore_blanks', True)
if not file1 or not file2:
return 0
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
try:
doc1 = Document(file1)
doc2 = Document(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
# First, drop all blank paragraphs
doc1 = [p for p in doc1.paragraphs if p.text != '']
doc2 = [p for p in doc2.paragraphs if p.text != '']
doc1_paragraphs = [p.text for p in doc1]
doc2_paragraphs = [p.text for p in doc2]
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
# Process and compare documents
if ignore_blanks:
text1 = re.sub(r'\s+', ' ', '\n'.join(doc1_paragraphs)).strip()
text2 = re.sub(r'\s+', ' ', '\n'.join(doc2_paragraphs)).strip()
if text1 != text2:
return 0
else:
if len(doc1_paragraphs) != len(doc2_paragraphs):
return 0
# Compare each paragraph
for p1, p2 in zip(doc1_paragraphs, doc2_paragraphs):
if p1 != p2:
return 0
return 1
# Deprecated: highlights in .docx files saved on Ubuntu cannot be compared with this function (root cause unknown)
def compare_highlighted_text(file1, file2):
if not file1 or not file2:
return 0
def extract_highlighted_text(file_path):
highlighted_texts = []
@@ -583,3 +814,58 @@ def compare_highlighted_text(file1, file2):
return 1
else:
return 0
def compare_references(file1, file2, **options):
if not file1 or not file2:
return 0
reference_indicator = options.get('reference_indicator', 'References')
reference_base_result = options.get('reference_base_result', 0.5)
# Determine file types and load documents
if file1.endswith('.docx') and file2.endswith('.docx'):
try:
doc1 = Document(file1)
doc2 = Document(file2)
except Exception as e:
logger.error(f"Error: {e}")
return 0
doc1_paragraphs = [p.text for p in doc1.paragraphs]
doc2_paragraphs = [p.text for p in doc2.paragraphs]
else:
# Unsupported file types or mismatch
print("Unsupported file types or mismatch between file types.")
return 0
# Find the references section: locate reference_indicator in the paragraph list (index() returns the first occurrence)
ref1_idx = doc1_paragraphs.index(reference_indicator) if reference_indicator in doc1_paragraphs else -1
ref2_idx = doc2_paragraphs.index(reference_indicator) if reference_indicator in doc2_paragraphs else -1
if ref1_idx == -1 and ref2_idx == -1:
return 1
if ref1_idx == -1 or ref2_idx == -1:
return 0
# split the reference section into reference items, and remove the empty string items
ref1 = [p for p in doc1_paragraphs[ref1_idx + 1:] if p.strip()]
ref2 = [p for p in doc2_paragraphs[ref2_idx + 1:] if p.strip()]
# Compare the references
if len(ref1) != len(ref2):
return 0
total_similarity = 0
for r1, r2 in zip(ref1, ref2):
# fuzzy match the references
similarity = fuzz.ratio(r1, r2) / 100.0
total_similarity += similarity
result = total_similarity / len(ref1)
if result >= reference_base_result:
return (result - reference_base_result) / (1 - reference_base_result)
else:
return 0
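A worked sketch of the final rescaling: with the default reference_base_result of 0.5, a mean fuzzy similarity of 0.9 maps to (0.9 - 0.5) / (1 - 0.5) = 0.8, and anything at or below 0.5 scores 0. The reference strings are hypothetical:

```python
from rapidfuzz import fuzz

r1 = "Smith, J. (2020). Deep Learning Basics."
r2 = "Smith, J. (2020). Deep Learning Basics"   # trailing period dropped
similarity = fuzz.ratio(r1, r2) / 100.0          # close to 1.0
base = 0.5
score = (similarity - base) / (1 - base) if similarity >= base else 0
```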


@@ -1,8 +1,11 @@
import csv
import datetime
import difflib
import functools
import json
import yaml
import logging
import operator
import os
import re
import sqlite3
from numbers import Number
@@ -10,17 +13,18 @@ from typing import Callable, Any, Union
from typing import Dict, List, Pattern
import lxml.etree
import pdfplumber
import yaml
from docx import Document
from lxml.cssselect import CSSSelector
from lxml.etree import _Element
from rapidfuzz import fuzz
import difflib
from .utils import _match_record, _match_value_to_rule
import logging
logger = logging.getLogger("desktopenv.metric.general")
def check_include_exclude(result: str, rules: Dict[str, List[str]]) -> float:
if result is None:
return 0.
@@ -68,6 +72,7 @@ def is_in_list(result, rules) -> float:
else:
return 0.
def diff_text_file(result: str, expect: str) -> float:
if result is None:
return 0.
@@ -78,12 +83,34 @@ def diff_text_file(result: str, expect: str) -> float:
expected_lines: List[str] = f.read().splitlines()
return difflib.SequenceMatcher(a=result_lines, b=expected_lines).ratio()
def fuzzy_match(result, rules) -> float:
expect = rules["expected"]
return fuzz.ratio(result, expect) / 100.
def fuzzy_place_math(result_file_path, rules) -> float:
if result_file_path is None:
return 0.
expect = rules["expected"] # a list of possible answers
# Read the result .docx and split all paragraph text into whitespace-separated words
doc = Document(result_file_path)
words_list = []
for para in doc.paragraphs:
words_list.extend(para.text.split())
fuzzy_score_list = []
for word in words_list:
max_score = 0
for ans in expect:
score = fuzz.ratio(word, ans) / 100
max_score = max(max_score, score)
fuzzy_score_list.append(max_score)
if len(fuzzy_score_list) != 3:
return 0.
return sum(fuzzy_score_list) / 3
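A hedged sketch of what this checker expects: the result .docx must contain exactly three whitespace-separated words, each fuzz-matched against the answer list (file name and answers are hypothetical):

```python
rules = {"expected": ["Paris", "Berlin", "Madrid"]}
# A document whose full text is "Paris Berlin Madrid" yields three per-word
# best scores of 1.0, so the metric returns 3.0 / 3 = 1.0; any other word
# count returns 0.
score = fuzzy_place_math("answers.docx", rules)
```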
def check_csv(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float:
"""
Args:
@@ -191,10 +218,10 @@ def check_accessibility_tree(result: str, rules: List[Dict[str, Any]]) -> float:
return 0.
if "text" in r:
match_func: Callable[[str], Number] = functools.partial( operator.eq if r["exact"] \
else (lambda a, b: fuzz.ratio(a, b) / 100.)
, r["text"]
)
match_func: Callable[[str], Number] = functools.partial(operator.eq if r["exact"] \
else (lambda a, b: fuzz.ratio(a, b) / 100.)
, r["text"]
)
match_score: Number = 0
for elm in elements:
match_score = max(match_score, match_func(elm.text or None))
@@ -267,20 +294,193 @@ def check_json(result: str, rules: Dict[str, List[Dict[str, Union[List[str], str
return float(metric)
def check_direct_json_object(result, rules)->float:
def check_direct_json_object(result, rules) -> float:
"""
One of the most commonly used evaluation functions.
Compares two JSON objects directly.
"""
print("result: ")
print(result)
print("expected: ")
print(rules["expected"])
if isinstance(result, str):
# remove blanks before and after result
result = result.strip()
# replace all ' with "
result = result.replace("'", '"')
# load json object
result = json.loads(result)
if result is None:
return 0.
expected_json = rules["expected"]
for key in expected_json.keys():
expected_value = expected_json.get(key)
if expected_value != result.get(key):
return 0.
return 1.0
try:
expect_in_result = rules.get("expect_in_result", False)
if not expect_in_result:
expected_json = rules["expected"]
for key in expected_json.keys():
expected_value = expected_json.get(key)
if expected_value != result.get(key):
return 0.
return 1.0
else:
expected_json = rules["expected"]
for key in expected_json.keys():
if isinstance(expected_json.get(key), List):
flag = 0
expected_value_list = expected_json.get(key)
for each_expected_value in expected_value_list:
if each_expected_value in result.get(key):
flag = 1
break
if flag == 0:
return 0.
elif isinstance(expected_json.get(key), str):
if expected_json.get(key) not in result.get(key):
return 0.
else:
logger.debug("check_direct_json_object: expected value type not supported")
return 0.
return 1.0
except:
logger.debug("check_direct_json_object: result is not a valid json object")
return 0.
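Two hedged rule shapes this comparison supports; the keys and values are illustrative:

```python
# Exact per-key equality (the default path):
rules_exact = {"expected": {"status": "done", "count": 3}}

# Containment matching with expect_in_result:
rules_loose = {
    "expect_in_result": True,
    "expected": {
        "city": ["Vienna", "Wien"],   # at least one must appear in result["city"]
        "note": "accepted",           # must be a substring of result["note"]
    },
}
```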
def compare_time_in_speedtest_results(speedtest_result_path, time_diff):
if not speedtest_result_path:
return 0
# open the speedtest results file(csv)
date_col = None
try:
with open(speedtest_result_path, 'r') as f:
for i, line in enumerate(f):
if i == 1:
date = line.split(',')[1]
break
now_date_time = datetime.datetime.now().strftime('%H:%M')
date_time = date[-5:]
# Pass iff the recorded time is within time_diff minutes of the current time
recorded = datetime.datetime.strptime(date_time, '%H:%M')
now = datetime.datetime.strptime(now_date_time, '%H:%M')
if abs((recorded - now).total_seconds()) / 60 >= int(time_diff):
return 0
return 1
except:
logger.debug("compare_time_in_speedtest_results: file not found or not readable")
return 0
def is_included_all_json_objects(gold_file_path, result_file_path):
if not gold_file_path or not result_file_path:
return 0
print("gold_file_path: ")
print(gold_file_path)
print("result_file_path: ")
print(result_file_path)
# Given two JSON files, check that every key-value pair in gold_file_path also appears in result_file_path
with open(gold_file_path, 'r') as f:
gold_json = json.load(f)
with open(result_file_path, 'r') as fr:
result_json = json.load(fr)
for key in gold_json.keys():
if key not in result_json.keys() or gold_json[key] != result_json[key]:
return 0
return 1
def is_gold_text_included_in_pdf(pdf_file_path, gold_text_path):
if not gold_text_path or not pdf_file_path:
return 0
print("gold_text_path: ")
print(gold_text_path)
print("pdf_file_path: ")
print(pdf_file_path)
# The gold file is JSON; check that every value in it appears in the PDF's extracted text
with open(gold_text_path, 'r') as f:
gold_json = json.load(f)
with pdfplumber.open(pdf_file_path) as pdf:
text = ''
for page in pdf.pages:
text += page.extract_text()
false_list = []
for key in gold_json.keys():
if gold_json[key] not in text:
false_list.append(key)
if len(false_list) > 0:
print("false_list: ")
print(false_list)
return 0
else:
return 1
def file_contains(file_path, config):
# file_path ends with .txt
if not file_path:
return 0.
try:
with open(file_path, 'r') as f:
file_text = f.read()
for text in config["expected"]:
if text not in file_text:
logger.debug(f"file_contains: {text} not found in {file_path}")
return 0.
except:
logger.debug("file_contains: file not found or not readable")
return 0.
return 1.
def check_line_number(file_path, line_number):
# check if file_path exists
if file_path is None or not os.path.isfile(file_path):
return 0.
timeRegex = r"([01]\d|2[0-3]):[0-5]\d:[0-5]\d"
# Count the lines matching timeRegex and check that the count equals line_number["expected"]
try:
with open(file_path, 'r') as f:
line_count = 0
for line in f:
if re.search(timeRegex, line):
line_count += 1
# Return 1 if line_count equals line_number["expected"], else 0
return 1 if line_count == int(line_number["expected"]) else 0
except:
logger.debug("check_line_number: file not found or not readable")
return 0.
def compare_terminal_and_txt(txt_file_path, terminal_output):
if not txt_file_path or not terminal_output:
return 0
# read txt file content
with open(txt_file_path, 'r') as f:
txt_file_content = f.read()
# compare terminal output with txt file content
return 1 if terminal_output == txt_file_content else 0
def compare_python_pure_text(py_file_path, gold_file_path):
if not py_file_path or not gold_file_path:
return 0
# Note: renaming the gold file from .txt to .py is currently disabled (see the commented-out line below)
print("py_file_path: ")
print(py_file_path)
print("gold_file_path: ")
print(gold_file_path)
# gold_file_path = gold_file_path.replace('.txt', '.py')
def remove_whitespace(text):
return ''.join(text.split())
with open(py_file_path, 'r') as file1:
content1 = file1.read()
with open(gold_file_path, 'r') as file2:
content2 = file2.read()
content1_no_whitespace = remove_whitespace(content1)
content2_no_whitespace = remove_whitespace(content2)
if content1_no_whitespace == content2_no_whitespace:
return 1
else:
return 0


@@ -5,7 +5,7 @@ from PIL import Image, ImageChops, ImageStat
def compare_image_list(pred_img_path_list: Union[str, List[str]],
gold_img_path_list: Union[str, List[str]]) -> float:
gold_img_path_list: Union[str, List[str]]) -> float:
""" Compare two image lists, only if all images are the same, return 1.0, otherwise return 0.0
"""
if type(pred_img_path_list) != list:
@@ -177,6 +177,16 @@ def calculate_contrast(image):
return np.std(pixels)
def calculate_image_sharpness(image_path):
# Load the image in grayscale
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
# Apply the Laplacian operator
laplacian = cv2.Laplacian(image, cv2.CV_64F)
# Calculate the variance
variance = np.var(laplacian)
return variance
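Laplacian variance grows with edge content, so a sharper rendition of the same scene scores higher; a hedged sketch with hypothetical files:

```python
before = calculate_image_sharpness("photo_blurred.png")
after = calculate_image_sharpness("photo_sharpened.png")
assert after > before   # the premise behind check_sharper later in this diff
```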
def structure_check_by_mse(img1, img2, threshold=0.03):
"""Check if two images are approximately the same by MSE"""
mse = np.mean(
@@ -189,7 +199,7 @@ def structure_check_by_mse(img1, img2, threshold=0.03):
def structure_check_by_ssim(img1, img2, threshold=0.9):
"""Check if two images are approximately the same by SSIM"""
similarity = ssim(np.array(img1), np.array(img2), multichannel=True)
similarity = ssim(np.array(img1), np.array(img2), multichannel=True, channel_axis=-1)
print("SSIM: ", similarity)
return similarity >= threshold
@@ -295,7 +305,8 @@ def check_triangle_position(tgt_path):
# We assume the triangle is a different color from the background
# Find the unique colors
unique_colors, counts = np.unique(img_array.reshape(-1, img_array.shape[2]), axis=0, return_counts=True)
unique_colors, counts = np.unique(img_array.reshape(-1, img_array.shape[2]), axis=0,
return_counts=True)
unique_colors_sorted = unique_colors[np.argsort(counts)]
# Assuming the background is the most common color and the triangle is a different color
@@ -337,6 +348,25 @@ def check_structure_sim(src_path, tgt_path):
return structure_same
def check_structure_sim_resized(src_path, tgt_path):
"""
Check if the structures of the two images are similar after resizing.
gimp:d16c99dc-2a1e-46f2-b350-d97c86c85c15
"""
if src_path is None or tgt_path is None:
return 0.
img_src = Image.open(src_path)
img_tgt = Image.open(tgt_path)
# Resize the images to the same size
img_src = img_src.resize(img_tgt.size)
# Check if the structure is similar
structure_same = structure_check_by_ssim(img_src, img_tgt)
return structure_same
def check_contrast_increase_and_structure_sim(src_path, tgt_path):
"""
Check if the src image has higher contrast than the tgt image and the structures are similar
@@ -388,34 +418,28 @@ def check_config_status(actual_config_path, rule):
return 0.
def check_image_size_and_structure_sim(src_path, tgt_path, height=512, width=None):
def check_image_size(src_path, rule):
"""
Check if the size of the src image is correct and the structure of the two images are similar.
gimp:d16c99dc-2a1e-46f2-b350-d97c86c85c15
Check if the size of the src image is correct
multi-apps:42f4d1c7-4521-4161-b646-0a8934e36081
"""
if src_path is None or tgt_path is None:
if src_path is None:
return 0.
# Load images
source_image = Image.open(src_path)
target_image = Image.open(tgt_path)
# Load the image
img = Image.open(src_path)
# Check size
if width is not None:
width_same = source_image.size[0] == width
else:
width_same = True
if height is not None:
height_same = source_image.size[1] == height
# Check the size
if rule["height"] is not None:
height_same = img.size[1] == rule["height"]
else:
height_same = True
if rule["width"] is not None:
width_same = img.size[0] == rule["width"]
else:
width_same = True
# Check structure
resized_target_image = target_image.resize(source_image.size)
structure_same = structure_check_by_ssim(source_image, resized_target_image)
if width_same and height_same and structure_same:
if height_same and width_same:
return 1.
else:
return 0.
@@ -521,32 +545,26 @@ def check_green_background(src_path, tgt_path):
return 1.
if __name__ == "__main__":
actual_config_path = "../../../cache/sessionrc_test"
rule = {
"key": "hide-docks",
"value": "no"
}
print(check_config_status(actual_config_path, rule))
def check_sharper(src_path, tgt_path):
"""
Check if the source image is sharper than the target image.
multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108
"""
sharpness_src = calculate_image_sharpness(src_path)
sharpness_tgt = calculate_image_sharpness(tgt_path)
return 1.0 if sharpness_src > sharpness_tgt else 0.0
actual_config_path = "../../../cache/action-history_test"
rule = {
"key": ["history-item", "\"filters-vignette\""],
"value": "1"
}
print(check_config_status(actual_config_path, rule))
actual_config_path = "../../../cache/gimprc_test"
rule = {
"key": "undo-levels",
"value": "100"
}
print(check_config_status(actual_config_path, rule))
src_path = "../../../cache/734d6579-c07d-47a8-9ae2-13339795476b/green_background_with_object.png"
tgt_path = "../../../cache/734d6579-c07d-47a8-9ae2-13339795476b/white_background_with_object.png"
print(check_green_background(src_path, tgt_path))
tgt_path = "../../../cache/f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce/Triangle_In_The_Middle.png"
print(check_triangle_position(tgt_path))
def check_image_file_size(src_path, rule):
"""
Check if the file size of the src image is within rule["max_size"] bytes (e.g. 500 KB)
"""
if src_path is None:
return 0.0
# Check the size
file_size = os.path.getsize(src_path)
if file_size < rule["max_size"]:
return 1.0
else:
return 0.0


@@ -26,13 +26,3 @@ def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float:
for ptn in rules["locale_set"]
)
)
if __name__ == "__main__":
path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu"
print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*"
, "pt-*", "es-*", "it-*"
]
}
)
)


@@ -1,20 +1,20 @@
import zipfile
import os.path
import logging
import os
import os.path
import zipfile
from typing import List, Dict
from typing import Union, TypeVar
import lxml.html
from lxml.html import HtmlElement
from typing import List, Dict
from typing import Union, TypeVar
from mutagen.easyid3 import EasyID3
from .general import diff_text_file
from .utils import _match_value_to_rule
import logging
logger = logging.getLogger("desktopenv.metric.others")
def process_epub(filename: str) -> List[str]:
file_list: List[str] = []
@@ -23,7 +23,7 @@ def process_epub(filename: str) -> List[str]:
try:
with zipfile.ZipFile(filename, "r") as z_f:
with z_f.open("toc.ncx") as in_f\
with z_f.open("toc.ncx") as in_f \
, open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
contents: str = in_f.read().decode()
contents = contents.splitlines()
@@ -31,7 +31,7 @@ def process_epub(filename: str) -> List[str]:
if "navPoint" not in l:
out_f.write(l + "\n")
file_list.append(os.path.join(base_dir, "toc.ncx"))
with z_f.open("content.opf") as in_f\
with z_f.open("content.opf") as in_f \
, open(os.path.join(base_dir, "content.opf"), "w") as out_f:
contents: str = in_f.read().decode()
contents = contents.splitlines()
@@ -41,14 +41,14 @@ def process_epub(filename: str) -> List[str]:
file_list.append(os.path.join(base_dir, "content.opf"))
for f_n in z_f.namelist():
if f_n.endswith(".html"):
with z_f.open(f_n) as in_f\
with z_f.open(f_n) as in_f \
, open(os.path.join(base_dir, f_n), "w") as out_f:
html: HtmlElement = lxml.html.fromstring(
''.join( filter( lambda ch: ch!="\n" and ch!="\r"
, in_f.read().decode()
)
).encode()
)
''.join(filter(lambda ch: ch != "\n" and ch != "\r"
, in_f.read().decode()
)
).encode()
)
out_f.write(lxml.html.tostring(html, pretty_print=True, encoding="unicode"))
file_list.append(os.path.join(base_dir, f_n))
logger.debug("%s: %s", filename, file_list)
@@ -56,6 +56,7 @@ def process_epub(filename: str) -> List[str]:
except zipfile.BadZipFile:
return []
def compare_epub(result: str, expected: str) -> float:
if result is None:
return 0.
@@ -69,8 +70,10 @@ def compare_epub(result: str, expected: str) -> float:
metric *= current_metric
return metric
V = TypeVar("Value")
def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bool:
# checks using _match_value_to_rule
if result is None:
@@ -85,44 +88,3 @@ def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bo
logger.debug("%s.%s: %s", result, k, value)
metric = metric and _match_value_to_rule(value, r)
return float(metric)
if __name__ == "__main__":
import datetime
import sys
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
metric = check_mp3_meta( "snapshots/test/cache/3f05f3b9-29ba-4b6b-95aa-2204697ffc06/Cheng Xiang - Missing You - gt.mp3"
, { "title": { "method": "eq"
, "ref": "Missing You"
}
, "artist": { "method": "eq"
, "ref": "Cheng Xiang"
}
}
)
print(metric)
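
For reference, the `method`/`ref` dictionaries passed above are resolved by `_match_value_to_rule` from `.utils`. A minimal sketch of the matching semantics, covering only the method used in this test (a hypothetical stand-in, not the real helper, which supports more methods):

    from typing import Any, Dict

    def match_value_to_rule(value: Any, rule: Dict[str, Any]) -> bool:
        # Hypothetical reimplementation of metrics.utils._match_value_to_rule
        # for illustration only.
        if rule["method"] == "eq":
            return value == rule["ref"]
        if rule["method"] == "in":
            return rule["ref"] in value
        raise NotImplementedError(rule["method"])

    assert match_value_to_rule("Missing You", {"method": "eq", "ref": "Missing You"})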

View File

@@ -2,6 +2,7 @@ import operator
from typing import Any
from typing import Dict
import fitz # PyMuPDF
from pypdf import PdfReader
@@ -11,3 +12,20 @@ def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float:
reader = PdfReader(pdf_file)
nb_pages: int = len(reader.pages)
return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"]))
def extract_answers_from_pdf(pdf_file):
doc = fitz.open(pdf_file)
answers = []
for page in doc:
text = page.get_text()
lines = text.split('\n')
for line in lines:
if line.strip():
parts = line.split('=')
if len(parts) > 1:
answer = parts[-1].strip()
answers.append(answer)
return answers
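
The per-line logic above reduces to keeping whatever follows the last '=' on each non-blank line. A self-contained sketch on made-up page text:

    sample_text = "1. 12 + 7 = 19\n2. 3 * 4 = 12\n\n"
    answers = []
    for line in sample_text.split('\n'):
        if line.strip():
            parts = line.split('=')
            if len(parts) > 1:
                answers.append(parts[-1].strip())
    assert answers == ["19", "12"]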

View File

@@ -165,23 +165,24 @@ def compare_pptx_files(file1_path, file2_path, **options):
# compare the content of each slide
for slide1, slide2 in zip(prs1.slides, prs2.slides):
slide_idx += 1
def get_slide_background_color(slide):
background = slide.background
if background.fill.background():
return background.fill.fore_color.rgb
else:
return None
if get_slide_background_color(slide1) != get_slide_background_color(slide2) and examine_background_color:
return 0
def get_slide_notes(slide):
notes_slide = slide.notes_slide
if notes_slide:
return notes_slide.notes_text_frame.text
else:
return None
if get_slide_notes(slide1).strip() != get_slide_notes(slide2).strip() and examine_note:
return 0
# check if the shapes are the same
@@ -192,14 +193,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_table_bottom_position:
if slide_idx == 3 and shape1.shape_type == 19 and shape2.shape_type == 19:
if shape1.top <= shape2.top or shape1.top < 3600000:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_right_position:
if slide_idx == 2 and not hasattr(shape1, "text") and not hasattr(shape2, "text"):
if shape1.left <= shape2.left or shape1.left < 4320000:
@@ -207,28 +208,31 @@ def compare_pptx_files(file1_path, file2_path, **options):
if examine_top_position:
if slide_idx == 2 and shape1.shape_type == 13 and shape2.shape_type == 13:
if shape1.top >= shape2.top or shape1.top > 1980000:
return 0
if shape1.top >= shape2.top or shape1.top > 1980000:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_shape_for_shift_size:
if shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
if not (hasattr(shape1, "text") and hasattr(shape2, "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."):
if not (hasattr(shape1, "text") and hasattr(shape2,
"text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."):
return 0
if (shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape:
if (
shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape:
return 0
if examine_image_size:
if shape1.shape_type == 13 and shape2.shape_type == 13:
if shape1.width != shape2.width or shape1.height != shape2.height:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_modify_height:
if not hasattr(shape1, "text") and not hasattr(shape2, "text") or shape1.shape_type == 5 and shape2.shape_type == 5:
if not hasattr(shape1, "text") and not hasattr(shape2,
"text") or shape1.shape_type == 5 and shape2.shape_type == 5:
if shape1.height != shape2.height:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
@@ -236,13 +240,13 @@ def compare_pptx_files(file1_path, file2_path, **options):
if hasattr(shape1, "text") and hasattr(shape2, "text"):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
# check if the paragraphs are the same
return 0
# check if the paragraphs are the same
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
if para1.alignment != para2.alignment and examine_alignment:
return 0
# check if the runs are the same
if para1.text != para2.text and examine_text:
return 0
@@ -253,7 +257,7 @@ def compare_pptx_files(file1_path, file2_path, **options):
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
if run1.font.name != run2.font.name and examine_font_name:
if run1.font.name != run2.font.name and examine_font_name:
return 0
if run1.font.size != run2.font.size and examine_font_size:
@@ -305,10 +309,9 @@ def compare_pptx_files(file1_path, file2_path, **options):
return bullets
if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(run2.part.blob.decode('utf-8')):
if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(
run2.part.blob.decode('utf-8')):
return 0
# fixme: Actually there are more properties to be compared, we can add them later via parsing the xml data
@@ -524,15 +527,3 @@ def check_auto_saving_time(pptx_file, rules):
logger.error(f"Error parsing XML: {e}")
except FileNotFoundError:
logger.error(f"File not found: {pptx_file}")
if __name__ == '__main__':
# print(compare_pptx_files(
# r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx",
# r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx"))
# print(evaluate_presentation_fill_to_rgb_distance(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\3b27600c-3668-4abd-8f84-7bcdebbccbdb\lec17-gui-events.pptx", {"rgb": (0, 0, 255)}))
# print(check_auto_saving_time(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\2cd43775-7085-45d8-89fa-9e35c0a915cf\registrymodifications.xcu", {"minutes": 3}))
print(compare_pptx_files(
r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6_Gold.pptx",
r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6.pptx",
examine_shape=False))

View File

@@ -11,16 +11,16 @@ import openpyxl
import pandas as pd
from openpyxl import Workbook
from openpyxl.cell.cell import Cell
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.worksheet.worksheet import Worksheet
from .utils import _match_value_to_rule, _read_cell_style, read_cell_value
from .utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles\
, load_filters, load_pivot_tables
from rapidfuzz import fuzz
from desktop_env.evaluators.metrics.utils import _match_value_to_rule, _read_cell_style, read_cell_value
from desktop_env.evaluators.metrics.utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles \
, load_filters, load_pivot_tables
# from openpyxl.utils import coordinate_to_tuple
logger = logging.getLogger("desktopenv.metric.table")
@@ -165,7 +165,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
logger.debug("Sheet1: \n%s", str(sheet1))
logger.debug("Sheet2: \n%s", str(sheet2))
try:
logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1==sheet2))
logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1 == sheet2))
except:
logger.debug("Sheet1 =/v= Sheet2")
logger.debug("Assertion: %s =v= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
@@ -231,14 +231,14 @@ def compare_table(result: str, expected: str = None, **options) -> float:
value1 = value1.lower()
value2 = value2.lower()
if rl["type"]=="includes":
if rl["type"] == "includes":
metric: bool = value2 in value1
elif rl["type"]=="included_by":
elif rl["type"] == "included_by":
metric: bool = value1 in value2
elif rl["type"]=="fuzzy_match":
elif rl["type"] == "fuzzy_match":
metric: bool = fuzz.ratio(value1, value2) >= rl.get("threshold", 85.)
elif rl["type"]=="exact_match":
metric: bool = value1==value2
elif rl["type"] == "exact_match":
metric: bool = value1 == value2
total_metric = total_metric and metric
metric: bool = total_metric
@@ -409,7 +409,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
filters1: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r)
filters2: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r)
metric: bool = filters1==filters2
metric: bool = filters1 == filters2
logger.debug("Assertion: %s[filter] == %s[filter] - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
# }}} Compare Filters #
@@ -421,7 +421,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
pivots1: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r)
pivots2: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r)
metric: bool = pivots1==pivots2
metric: bool = pivots1 == pivots2
logger.debug("Assertion: %s[pivot]==%s[pivot] - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
# }}} Compare Pivot Tables #
@@ -482,81 +482,36 @@ def compare_csv(result: str, expected: str, **options) -> float:
return float(metric)
def compare_conference_city_in_order(actual_city_list_path, expected_city):
    expected_city_list = expected_city["expected"]
    wb = openpyxl.load_workbook(actual_city_list_path)
    sheet = wb.active
    actual_city_list = []
    for row in sheet["C2:C22"]:
        for cell in row:
            actual_city_list.append(cell.value)
    # expected_city holds the reference cities to compare against the actual
    # city list, matched row by row in order.
    try:
        for i in range(len(actual_city_list)):
            if isinstance(expected_city_list[i], str):
                if expected_city_list[i] not in actual_city_list[i]:
                    logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    return 0.
            elif isinstance(expected_city_list[i], List):
                if not any(possible_str in actual_city_list[i] for possible_str in expected_city_list[i]):
                    logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    return 0.
            else:
                raise TypeError("Expected city should be a string or a list of strings")
    except:
        return 0.
    return 1.


if __name__ == '__main__':
    import datetime
    import sys

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)
    datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

    file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
    debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
    stdout_handler = logging.StreamHandler(sys.stdout)
    sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))

    file_handler.setLevel(logging.INFO)
    debug_handler.setLevel(logging.DEBUG)
    stdout_handler.setLevel(logging.INFO)
    sdebug_handler.setLevel(logging.DEBUG)

    formatter = logging.Formatter(
        fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
    file_handler.setFormatter(formatter)
    debug_handler.setFormatter(formatter)
    stdout_handler.setFormatter(formatter)
    sdebug_handler.setFormatter(formatter)

    stdout_handler.addFilter(logging.Filter("desktopenv"))
    sdebug_handler.addFilter(logging.Filter("desktopenv"))

    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)
    logger.addHandler(stdout_handler)
    logger.addHandler(sdebug_handler)

    path1 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday.xlsx"
    path2 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday_gold.xlsx"
    rules = [ { "type": "sheet_data"
              , "sheet_idx0": 0
              , "sheet_idx1": "EI0"
              }
            ]
    print(compare_table(path1, path2
                       , rules=rules
                       )
         )
    print(compare_table(path2, path2
                       , rules=rules
                       )
         )

    # Row Properties
    # path1 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA.xlsx"
    # path2 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA_gold.xlsx"
    # workbook: Workbook = openpyxl.load_workbook(filename=path1)
    # worksheet: Worksheet = workbook.active
    # for r_no, dms in worksheet.column_dimensions.items():
    #     print(r_no, type(r_no), type(dms), dms.hidden)

    # Conditional Formats
    # import formulas
    # path1 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days.xlsx"
    # path2 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold.xlsx"
    # path3 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold_test.xlsx"
    # workbook: Workbook = openpyxl.load_workbook(filename=path2)
    # worksheet: Worksheet = workbook.active
    # print(worksheet.conditional_formatting)
    # for itm in worksheet.conditional_formatting:
    #     print(itm.cells)
    #     for r in itm.rules:
    #         print( r.type, r.formula, r.dxf.font.color.rgb
    #              , r.dxf.fill.fgColor.rgb, r.dxf.fill.bgColor.rgb
    #              )
    #         condition = formulas.Parser().ast("=" + r.formula[0])[1].compile()
    #         ##print(r.type, r.operator, r.dxfId, r.dxf)
    #     for r in itm.cells:
    #         for c in r.cells:
    #             value = worksheet.cell(row=c[0], column=c[1]).value
    #             print(value, condition(str(value)))
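
The `expected_city` argument of `compare_conference_city_in_order` above is a dict whose "expected" list pairs one entry per cell of C2:C22: a plain string must be contained in the cell, while a list means any of its members may match. A hedged example with made-up city names:

    # Hypothetical reference payload for compare_conference_city_in_order.
    expected_city = {
        "expected": [
            "Vienna",                  # C2 must contain "Vienna"
            ["Singapore", "Online"],   # C3 may contain either string
            "Honolulu"                 # C4 must contain "Honolulu"
        ]
    }
    # score = compare_conference_city_in_order("conferences.xlsx", expected_city)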

View File

@@ -1,17 +1,19 @@
import json
import logging
import re
from typing import List, Pattern, Dict, Match
from typing import Union, Any, TypeVar, Callable
import re
import json
from .utils import _match_record
from .utils import _match_value_to_rule as _match_pref
import logging
logger = logging.getLogger("desktopenv.metric.thunderbird")
V = TypeVar("Value")
_pref_pattern: Pattern[str] = re.compile(r'^user_pref\("(?P<key>(?:[^"]|\\")+)\", (?P<val>.+)\);$');
def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any]]]):
"""
Args:
@@ -51,10 +53,10 @@ def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any
continue
key: str = match_.group("key")
#value: str = match_.group("val")
#if value in {"true", "false"}:
#value = value.title()
#value: V = eval(value)
# value: str = match_.group("val")
# if value in {"true", "false"}:
# value = value.title()
# value: V = eval(value)
value = json.loads(match_.group("val"))
if key in expect_rules:
logger.debug("K: %s, V: %s", key, repr(value))
@@ -64,9 +66,13 @@ def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any
return float(all(expect_metrics.values()) and unexpect_metric)
_value_processor: Callable[[str], str] = lambda val: val.replace("\\\"", "\"").replace("\\\\", "\\")
#_condition_pattern: Pattern[str] = re.compile(r'(?P<type>AND|OR) \((?P<key>[\w ]+),(?P<rel>[\w ' + '\'' + r']+),(?:"(?P<val2>(?:[^"]|\")+)"|(?P<val1>[^)]+))\)')
_condition_pattern: Pattern[str] = re.compile(r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b')
# _condition_pattern: Pattern[str] = re.compile(r'(?P<type>AND|OR) \((?P<key>[\w ]+),(?P<rel>[\w ' + '\'' + r']+),(?:"(?P<val2>(?:[^"]|\")+)"|(?P<val1>[^)]+))\)')
_condition_pattern: Pattern[str] = re.compile(
r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b')
def check_thunderbird_filter(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float:
"""
Args:
@@ -112,8 +118,8 @@ def check_thunderbird_filter(result: str, rules: Dict[str, List[Dict[str, str]]]
condition_str: str = _value_processor(l[11:-2])
logger.debug("FILTER CONDITION: %s", condition_str)
conditions: List[str] =\
_condition_pattern.findall(condition_str)
conditions: List[str] = \
_condition_pattern.findall(condition_str)
logger.debug("FILTER CONDITIONS: %s", repr(conditions))
filter_["condition"] = conditions
@@ -138,6 +144,7 @@ def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str
remove_deleted (bool): ignore deleted messages which has status code 0008 or 0009. default: True
remove_duplicate (bool): remove duplicate messages. default: True
"""
def normalize_msg(msg, options):
ignore_status = options.get('ignore_status', False)
ignore_keys = options.get('ignore_keys', False)
@@ -167,66 +174,3 @@ def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str
mail2 = read_thunderbird_folder_file(gold)
if mail1 != mail2: return .0
return 1.0
if __name__ == "__main__":
#import lxml.etree
#from lxml.cssselect import CSSSelector
#from lxml.etree import _Element
#xml = "../../任务数据/Thunderbird/vertical-card-view.xml"
#xml = "../../任务数据/Thunderbird/vertical-table-view.xml"
#at: _Element = lxml.etree.parse(xml)
#elements: List[_Element] = CSSSelector('application[name=Thunderbird] page-tab-list')(at) # page tab tags
#elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]')(at) # email tag page
#elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section:nth-child(3)')(at) # email tag page
#elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>section[attr|class~="tree-table-header"]>table-row>column-header[name=Subject]>push-button', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at) # table view, column header
#elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>tree>tree-item>section[name="Subject"]>section>section', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at) # table view, column header
#print(len(elements))
#for elm in elements:
#print(lxml.etree.tostring(elm, encoding="unicode", pretty_print=True))
import datetime
import os
import sys
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
print( check_thunderbird_filter( "../../任务数据/Thunderbird/msgFilterRules.dat"
, { "expect": [ { "enabled": "yes"
, "action": "Move to folder"
, "actionValue": "mailbox://nobody@Local%20Folders/Promotions"
, "condition": ["AND (subject,contains,discount)"]
}
]
}
)
)
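
For context, the `_pref_pattern` regex above pulls key/value pairs out of Thunderbird's prefs.js, and the value is then parsed with `json.loads`, as `check_thunderbird_prefs` does. A minimal stdlib-only demonstration on a made-up pref line:

    import json
    import re

    pref_pattern = re.compile(r'^user_pref\("(?P<key>(?:[^"]|\\")+)", (?P<val>.+)\);$')
    line = 'user_pref("mail.smtpserver.smtp1.port", 587);'
    match_ = pref_pattern.match(line)
    key = match_.group("key")                # 'mail.smtpserver.smtp1.port'
    value = json.loads(match_.group("val"))  # 587
    assert (key, value) == ("mail.smtpserver.smtp1.port", 587)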

View File

@@ -1,10 +1,12 @@
import builtins
import datetime
import functools
import itertools
import logging
import operator
import re
import zipfile
import pandas as pd
from typing import Any, TypeVar, Union, Iterable, Optional, Callable
from typing import Dict, List, Set, Match, Tuple, Pattern
from urllib.parse import urlparse, urlunparse

View File

@@ -2,6 +2,7 @@ import copy
import importlib.util
import json
import sys
import re
from typing import Dict
@@ -86,11 +87,44 @@ def compare_text_file(actual: str, expected: str, **options) -> float:
with open(expected) as f2:
expected_text = f2.read()
ignore_blanks = options.get('ignore_blanks', False)
if ignore_blanks:
actual_text = re.sub(r'[\t\n]', ' ', actual_text).strip()
actual_text = re.sub(r'\s+', ' ', actual_text)
expected_text = re.sub(r'[\t\n]', ' ', expected_text).strip()
expected_text = re.sub(r'\s+', ' ', expected_text)
ignore_case = options.get('ignore_case', False)
if ignore_case:
actual_text = actual_text.lower()
expected_text = expected_text.lower()
if actual_text == expected_text:
return 1.0
return 0.0
import zipfile
from difflib import SequenceMatcher
import PyPDF2
def compare_pdf_content(content1, content2, text_similarity_threshold):
def extract_text_from_pdf(content):
with open("temp.pdf", "wb") as temp_pdf:
temp_pdf.write(content)
with open("temp.pdf", "rb") as temp_pdf:
pdf_reader = PyPDF2.PdfReader(temp_pdf)
text = ''
for page_num in range(len(pdf_reader.pages)):
page = pdf_reader.pages[page_num]
text += page.extract_text()
return text
text1 = extract_text_from_pdf(content1)
text2 = extract_text_from_pdf(content2)
similarity_ratio = SequenceMatcher(None, text1, text2).ratio()
return similarity_ratio >= text_similarity_threshold
def compare_zip_files(actual: str, expected: str, **options) -> float:
"""
@@ -115,7 +149,12 @@ def compare_zip_files(actual: str, expected: str, **options) -> float:
content1 = zip_file1.read(file_name)
content2 = zip_file2.read(file_name)
if content1 != content2:
if file_name.lower().endswith('.pdf'):
if compare_pdf_content(content1, content2, 0.95):
continue
else:
return 0.0
elif content1 != content2:
return 0.0
return 1.0
@@ -190,3 +229,45 @@ def check_python_file_by_test_suite(actual_files, test_file, **options) -> float
def check_python_file_by_gold_file(actual_files, gold_file: str, **options) -> float:
pass
def check_html_background_image(src_path: str, rule: Dict = None) -> float:
"""
Check if the background image is correctly set.
multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108
"""
from bs4 import BeautifulSoup
with open(src_path, 'r') as f:
html_content = f.read()
soup = BeautifulSoup(html_content, 'html.parser')
styles = soup.find_all('style')
for style in styles:
if f'background-image: url(\'{rule["value"]}\')' in style.text:
return 1.0
return 0.0
def compare_result_files(src_path, tgt_path):
"""
Compare whether the content of two files are the same.
multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85
"""
with open(src_path, 'r') as f:
src_content = f.read().strip()
with open(tgt_path, 'r') as f:
tgt_content = f.read().strip()
try:
# Compare the content as numbers
tgt_content_num = float(tgt_content)
if tgt_content in src_content:
            # If the content of tgt appears in src, return 1.0, since the src output
            # may be a superset of tgt (a language description plus the number).
return 1.0
src_content_num = float(src_content)
if abs(src_content_num - tgt_content_num) < 1e-4:
return 1.0
return 0.0
except:
if src_content == tgt_content:
return 1.0
return 0.0
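
A quick sketch of the numeric branch above (hypothetical file contents; the 1e-4 tolerance absorbs small rounding differences between the two outputs):

    with open("/tmp/src.txt", "w") as f:
        f.write("41.99995")
    with open("/tmp/tgt.txt", "w") as f:
        f.write("42.0")
    # "42.0" is not a substring of "41.99995", but |41.99995 - 42.0| < 1e-4,
    # so the numeric comparison still yields 1.0.
    print(compare_result_files("/tmp/src.txt", "/tmp/tgt.txt"))  # 1.0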

View File

@@ -117,7 +117,7 @@ def launch_app():
def capture_screen_with_cursor():
# fixme: when running on virtual machines, the cursor is not captured, don't know why
file_path = os.path.join("screenshots", "screenshot.png")
file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png")
user_platform = platform.system()
# Ensure the screenshots directory exists

View File

@@ -53,17 +53,32 @@
"chrome"
],
"evaluator": {
"func": "is_expected_active_tab",
"result": {
"func": ["is_expected_active_tab", "is_expected_active_tab"],
"conj": "or",
"result": [
{
"type": "active_url_from_accessTree",
"goto_prefix": "https://www."
},
"expected": {
},
{
"type": "active_url_from_accessTree",
"goto_prefix": "https://www."
}
],
"expected": [
{
"type": "rule",
"rules": {
"type": "url",
"url": "https://www.drugs.com/npc/"
}
}
},
{
"type": "rule",
"rules": {
"type": "url",
"url": "https://www.drugs.com/npp/"
}
}]
}
}
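
The list-valued `func`/`result`/`expected` with `"conj": "or"` above is the multi-evaluator form used throughout these configs. A minimal sketch of how a harness might fold the per-rule scores (hypothetical; the actual combination logic lives in the evaluation loop, not in this JSON):

    from typing import List

    def combine_scores(scores: List[float], conj: str) -> float:
        # "and" requires every sub-evaluator to pass, "or" at least one.
        if conj == "and":
            return float(all(s > 0 for s in scores))
        if conj == "or":
            return float(any(s > 0 for s in scores))
        raise ValueError(f"unknown conj: {conj}")

    # Either the /npc/ or the /npp/ URL counts as the expected active tab:
    print(combine_scores([0.0, 1.0], "or"))  # 1.0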

View File

@@ -62,8 +62,8 @@
"type": "rule",
"rules":{
"expected": {
"locationName": "Zurich Airport",
"dropLocationName": "Zurich Airport",
"locationName": "Zürich",
"dropLocationName": "Zürich",
"filterCriteria_carCategory": "large",
"filterCriteria_sortBy": "PRICE"
}

View File

@@ -1,7 +1,7 @@
{
"id": "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"snapshot": "chrome",
"instruction": "Can you make a new folder for me on that bookmarks bar in my internet browser? Let's call it 'Favorites.'",
"instruction": "Can you make a new folder for me on the bookmarks bar in my internet browser? Let's call it 'Favorites.'",
"source": "https://www.youtube.com/watch?v=IN-Eq_UripQ",
"config": [
{

View File

@@ -54,7 +54,7 @@
"rules":{
"expected": {
"q": "drip coffee maker",
"tbs": "mr:1,price:1,ppr_min:25,ppr_max:60,pdtr0:1825161|1825162"
"tbs": "mr:1,price:1,ppr_min:25,ppr_max:60,sales:1,pdtr0:1825161|1825162"
}
}
}

View File

@@ -29,6 +29,15 @@
"chrome"
],
"evaluator": {
"postconfig": [
{
"type": "execute",
"parameters": {
"command": "pkill chrome",
"shell": "true"
}
}
],
"func": "exact_match",
"result": {
"type": "enable_enhanced_safety_browsing"

View File

@@ -1,7 +1,7 @@
{
"id": "99146c54-4f37-4ab8-9327-5f3291665e1e",
"snapshot": "chrome",
"instruction": "Please help me set Chrome to delete my browsing history automatically every time I close the browser.",
"instruction": "Please help me set Chrome to delete my browsing data automatically every time I close the browser.",
"source": "https://www.youtube.com/watch?v=v0kxqB7Xa6I",
"config": [
{
@@ -29,6 +29,13 @@
"chrome"
],
"evaluator": {
"postconfig":[{
"type": "execute",
"parameters": {
"command": "pkill chrome",
"shell": "true"
}
}],
"func": "exact_match",
"result": {
"type": "data_delete_automacally"
@@ -36,7 +43,7 @@
"expected": {
"type": "rule",
"rules": {
"expected": "Crashed"
"expected": "true"
}
}
}

View File

@@ -43,19 +43,35 @@
"chrome"
],
"evaluator": {
"func": "exact_match",
"result": {
"func": ["exact_match", "exact_match"],
"conj": "or",
"result": [
{
"type": "url_dashPart",
"goto_prefix": "https://www.",
"partIndex": -1,
"needDeleteId": false,
"returnType": "string"
},
"expected": {
},
{
"type": "url_dashPart",
"goto_prefix": "https://www.",
"partIndex": -1,
"needDeleteId": false,
"returnType": "string"
}],
"expected": [
{
"type": "rule",
"rules": {
"expected": "tamiflu.html#side-effects"
}
}
},
{
"type": "rule",
"rules": {
"expected": "tamiflu-side-effects.html"
}
}]
}
}

View File

@@ -53,7 +53,7 @@
"rules": {
"expected": [
"AgeAppropriate:Kids",
"search=spider-man%20toys",
"search=spider[-%20]?man%20toys",
"S=4"
]
}

View File

@@ -84,16 +84,34 @@
}
}
],
"func": "check_image_size_and_structure_sim",
"expected":{
"func": [
"check_image_size",
"check_structure_sim"
],
"expected": [
{
"type": "vm_file",
"path": "/home/user/Desktop/dog_with_background.png",
"dest": "dog_with_background.png"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/resized.png",
"dest": "resized.png"
}
{
"type": "vm_file",
"path": "/home/user/Desktop/dog_with_background.png",
"dest": "dog_with_background.png"
}
],
"result": [
{
"type": "rule",
"rules": {
"height": 512
}
},
{
"type": "vm_file",
"path": "/home/user/Desktop/resized.png",
"dest": "resized.png"
}
]
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "02ce9a50-7af2-47ed-8596-af0c230501f8",
"snapshot": "libreoffice_writer",
"instruction": "I'm using libreoffice writer to write a tutorial about linux, and now I want to show the results obtained by using the \"ls\" command in /home/user. Please run this command and save the screenshot as 'ls.png' on Desktop.",
"instruction": "I am currently utilizing LibreOffice Writer to compose a Linux tutorial, and I intend to display the outcomes generated by executing the \"ls\" command in /home/user. Kindly execute this command and save the screenshot as 'ls.png' on the Desktop.",
"source": "authors",
"config": [
{
@@ -54,7 +54,7 @@
"type": "rule",
"rules": {
"type": "text",
"text": "$ ls\n"
"text": " Ls"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "09a37c51-e625-49f4-a514-20a773797a8a",
"snapshot": "libreoffice_writer",
"instruction": "I received a request from my friend that he wanted me to help him modify a picture. On the Desktop is the requirement doc and the picture to be adjusted. Modify the image as he said and save modified pic as \"pic.jpg\" on Desktop. Thanks!",
"instruction": "I've received a request from my friend who asked for assistance in editing an image. The document with the requirements and the picture to be adjusted are on the Desktop. Please make the necessary modifications to the image as his instructions and save the edited picture as \"pic.jpg\" on the Desktop. Thank you!",
"source": "authors",
"config": [
{
@@ -37,7 +37,7 @@
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1Ee1vNyG7gGpLKK2VlLfj6PxcmdkMdvqK&export=download&authuser=0&confirm=t&uuid=1f441c5d-b62d-4850-870f-8e8f113a4091&at=APZUnTWEvKSSkuGBWzen0S9L7aHP:1709727474803",
"dest": "pic.jpg"
"dest": "pic_Gold.jpg"
},
"result": {
"type": "vm_file",

View File

@@ -1,94 +1,109 @@
{
"id": "0c825995-5b70-4526-b663-113f4c999dd2",
"snapshot": "libreoffice_calc",
"instruction": "I'm working on a comprehensive report for our environmental policy review meeting next week. I need to integrate key insights from an important document, which is a guidebook on the Green Economy, where I'm particularly interested in the 'Introduction' section. Could you extract this section and compile them into a new Google Doc named 'environment_policy_report (draft)' under /environment_policy folder? This will significantly aid in our discussion on aligning our environmental policies with sustainable and green economic practices. Thanks!",
"source": "authors",
"config": [
{
"type": "googledrive",
"parameters": {
"settings_file": "evaluation_examples/settings/googledrive/settings.yml",
"operation": ["delete"],
"args": [
{
"query": "title = 'environment_policy_report (draft).doc' or title = 'environment_policy_report (draft).docx' or title = 'environment_policy_report (draft)'",
"trash": false
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "login",
"parameters": {
"settings_file": "evaluation_examples/settings/google/settings.json",
"platform": "googledrive"
}
},
{
"type": "command",
"parameters": {
"command": ["mkdir", "-p", "/home/user/Desktop/wwf"]
}
},
{
"type": "download",
"parameters": {
"files": [
{"path": "/home/user/Desktop/wwf/lpr_living_planet_report_2016.pdf", "url": "https://drive.google.com/uc?id=19NCdw_MVP6nH5nC6okYYe8U1mJABfTRK&export=download"},
{"path": "/home/user/Desktop/wwf/279c656a32_ENGLISH_FULL.pdf", "url": "https://drive.google.com/uc?id=1ckH1NetfImQ9EyONTO-ZFWA8m8VIUFvD&export=download"},
{"path": "/home/user/Desktop/wwf/7g37j96psg_WWF_AR2021_spreads.pdf", "url": "https://drive.google.com/uc?id=1cxLTzmqDKMomOyvho29lvFvhRnb0Y8__&export=download"},
{"path": "/home/user/Desktop/GE Guidebook.pdf", "url": "https://drive.google.com/uc?id=1KzC_R3eI3Rmgwz5bkcI8Ohv7ebOrU-Is&export=download"},
{"path": "/home/user/Desktop/assessing_and_reporting_water_quality(q&a).pdf", "url": "https://drive.google.com/uc?id=1LFojf3Weflv3fVdrZrgTY1iUaRdbT9kG&export=download"}
]
}
}
],
"trajectory": "trajectories/0c825995-5b70-4526-b663-113f4c999dd2",
"related_apps": [
"libreoffice_calc",
"chrome",
"os"
],
"evaluator": {
"func": "compare_docx_files",
"result": {
"type": "googledrive_file",
"settings_file": "evaluation_examples/settings/googledrive/settings.yml",
"path_list": [
[
"environment_policy_report (draft).docx"
]
],
"dest": [
"environment_policy_report (draft).docx"
]
},
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1A2ti9JncAfIa6ks7FTJWHtYlZo-68FtM&export=download",
"dest": "environment_policy_report (draft)_gold.docx"
},
"options": {
"content_only": true
}
}
"id": "0c825995-5b70-4526-b663-113f4c999dd2",
"snapshot": "libreoffice_calc",
"instruction": "I'm working on a comprehensive report for our environmental policy review meeting next week. I need to integrate key insights from an important document, which is a guidebook on the Green Economy, where I'm particularly interested in the 'Introduction' section. Could you extract this section and compile them into a new Google Doc named 'environment_policy_report (draft)' under /environment_policy folder? This will significantly aid in our discussion on aligning our environmental policies with sustainable and green economic practices. Thanks!",
"source": "authors",
"config": [
{
"type": "googledrive",
"parameters": {
"settings_file": "evaluation_examples/settings/googledrive/settings.yml",
"operation": [
"delete"
],
"args": [
{
"query": "title = 'environment_policy_report (draft).doc' or title = 'environment_policy_report (draft).docx' or title = 'environment_policy_report (draft)'",
"trash": false
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "login",
"parameters": {
"settings_file": "evaluation_examples/settings/google/settings.json",
"platform": "googledrive"
}
},
{
"type": "command",
"parameters": {
"command": [
"mkdir",
"-p",
"/home/user/Desktop/wwf"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"path": "/home/user/Desktop/wwf/lpr_living_planet_report_2016.pdf",
"url": "https://drive.google.com/uc?id=19NCdw_MVP6nH5nC6okYYe8U1mJABfTRK&export=download"
},
{
"path": "/home/user/Desktop/wwf/279c656a32_ENGLISH_FULL.pdf",
"url": "https://drive.google.com/uc?id=1ckH1NetfImQ9EyONTO-ZFWA8m8VIUFvD&export=download"
},
{
"path": "/home/user/Desktop/wwf/7g37j96psg_WWF_AR2021_spreads.pdf",
"url": "https://drive.google.com/uc?id=1cxLTzmqDKMomOyvho29lvFvhRnb0Y8__&export=download"
},
{
"path": "/home/user/Desktop/GE Guidebook.pdf",
"url": "https://drive.google.com/uc?id=1KzC_R3eI3Rmgwz5bkcI8Ohv7ebOrU-Is&export=download"
},
{
"path": "/home/user/Desktop/assessing_and_reporting_water_quality(q&a).pdf",
"url": "https://drive.google.com/uc?id=1LFojf3Weflv3fVdrZrgTY1iUaRdbT9kG&export=download"
}
]
}
}
],
"trajectory": "trajectories/0c825995-5b70-4526-b663-113f4c999dd2",
"related_apps": [
"libreoffice_calc",
"chrome",
"os"
],
"evaluator": {
"func": "compare_docx_files",
"result": {
"type": "googledrive_file",
"settings_file": "evaluation_examples/settings/googledrive/settings.yml",
"path": ["environment_policy", "environment_policy_report (draft)"],
"dest": "environment_policy_report (draft).docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1A2ti9JncAfIa6ks7FTJWHtYlZo-68FtM&export=download",
"dest": "environment_policy_report (draft)_gold.docx"
},
"options": {
"content_only": true
}
}
}

View File

@@ -0,0 +1,99 @@
{
"id": "0e5303d4-8820-42f6-b18d-daf7e633de21",
"snapshot": "chrome",
"instruction": "I want to learn python programming and my friend recommends me this course website. I have grabbed the lecture slide for week 0. Please download the PDFs for other weeks into the opened folder and leave the file name as-it-is.",
"source": "authors",
"config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://cs50.harvard.edu/python/2022/weeks/0/"
]
}
},
{
"type": "execute",
"parameters": {
"command": [
"mkdir",
"-p",
"/home/user/lecture_slides"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1OdvHgcHXSn62xXe_VrPTN0HLWHmrcfdY&export=download&authuser=0&confirm=t",
"path": "/home/user/lecture_slides/lecture0.pdf"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"nautilus",
"/home/user/lecture_slides"
]
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"os",
"chrome"
],
"evaluator": {
"postconfig": [
{
"type": "execute",
"parameters": {
"command": [
"/bin/bash",
"-c",
"cd /home/user && zip -qr lecture_slides.zip lecture_slides/"
]
}
}
],
"func": "compare_archive",
"result": {
"type": "vm_file",
"path": "/home/user/lecture_slides.zip",
"dest": "lecture_slides.zip"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1Ej2iHG8p-QJe7FZQpPIIS82BHOlFAUQM&export=download&authuser=0&confirm=t",
"dest": "gold_lecture_slides.zip"
},
"options": {
"file_path": "lecture_slides",
"file_type": "pdf"
}
}
}

View File

@@ -0,0 +1,116 @@
{
"id": "1f18aa87-af6f-41ef-9853-cdb8f32ebdea",
"snapshot": "libreoffice_calc",
"instruction": "I've prepared some grammar tests and placed them in the 'Grammar test' folder. I've already provided the multiple-choice answers for Test 1 in the 'answer doc' file. Could you please follow the same format to write out the answers for the remaining two tests in the doc file? This way, I can distribute them to the students as a reference. Thank you.",
"source": "authors",
"config": [
{
"type": "command",
"parameters": {
"command": [
"mkdir",
"-p",
"/home/user/Desktop/students work/",
"/home/user/Desktop/Lec powerpoint/",
"/home/user/Desktop/Grammar test/",
"/home/user/Desktop/Grammar rules PDF/",
"/home/user/Desktop/FDI/"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"path": "/home/user/Desktop/Grammer test 1.docx",
"url": "https://drive.google.com/uc?id=1VaXQ9XdzMv079xKFs0Y2XrwdmwFHIvBK&export=download"
},
{
"path": "/home/user/Desktop/Grammer test 2.docx",
"url": "https://drive.google.com/uc?id=1k2T88WreTwi-Yyp9mEJnreEQC3DdkJ2x&export=download"
},
{
"path": "/home/user/Desktop/Grammer test 3.docx",
"url": "https://drive.google.com/uc?id=1QgyQWVOcAJuPaSlrywb9nuFiQDySsTb2&export=download"
},
{
"path": "/home/user/Desktop/Answer.docx",
"url": "https://drive.google.com/uc?id=1BC2DuWJuZggmf6fXl6Ys9xQMZzU6a1br&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/irregularrules02.pdf",
"url": "https://drive.google.com/uc?id=1Eln9ehX6y6Df2-S_Hp7Ao1teKRu6I1Tg&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/irregularrules01.pdf",
"url": "https://drive.google.com/uc?id=1krdEEdNWvTwMKZU14QtI_xc2lCFVeVcl&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/fragrules.pdf",
"url": "https://drive.google.com/uc?id=1IXyI2KeiXsuh6XV2LelcmhZ2PDh_dBQf&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/csfsrules.pdf",
"url": "https://drive.google.com/uc?id=1ernwGGrjhYNoHVNAevdb2qNKQ0I5n3RP&export=download"
},
{
"path": "/home/user/Desktop/Public Lecture Teaching Plan.docx",
"url": "https://drive.google.com/uc?id=1ywfVFTEbiSkypZpzLjLmq_ppSbQIC8s8&export=download"
},
{
"path": "/home/user/Desktop/Course Timetable.xlsx",
"url": "https://drive.google.com/uc?id=1NGtahknRq_kXsXlw0tRQ1_CZp9SljoVg&export=download"
}
]
}
}
],
"trajectory": "trajectories/1f18aa87-af6f-41ef-9853-cdb8f32ebdea",
"related_apps": [
"os",
"libreoffice_writer"
],
"evaluator": {
"postconfig": [
{
"type": "activate_window",
"parameters": {
"window_name": "Answer.docx - LibreOffice Writer",
"strict": true
}
},
{
"type": "sleep",
"parameters": {
"seconds": 0.5
}
},
{
"type": "execute",
"parameters": {
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); "
]
}
}
],
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1TOMGWC3OFuP6yEGQuRJMEFWdg2NcBPSs&export=download",
"dest": "Answer gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/Answer.docx",
"dest": "Answer.docx"
},
"options": {
"ignore_case": true,
"ignore_blanks": true
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "20236825-b5df-46e7-89bf-62e1d640a897",
"snapshot": "vscode",
"instruction": "I am coding on my algorithm practice. The doc \"bubble_Sort_tutorial.docx\" is the document for it. Help me finish the function 'bubbleSort' in 'bubbleSort.py' on the Desktop save the output in 'res.txt' on Desktop.",
"instruction": "I am currently working on my algorithm practice using the document \"bubble_Sort_tutorial.docx.\" Please assist me in completing the 'bubbleSort' function within the 'bubbleSort.py' file on the Desktop and save the output as 'res.txt' on the Desktop.",
"source": "authors",
"config": [
{
@@ -47,7 +47,7 @@
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1g2Trt9oxQyW_sx8aIztFA0zNsE4yNw2x&export=download&authuser=0&confirm=t&uuid=342751c4-54f1-4760-9326-e7388845ded0&at=APZUnTV5BcbaxIZrDglWbs84Oxln:1709623697315",
"dest": "res.txt"
"dest": "res_Gold.txt"
},
"result": {
"type": "vm_file",

View File

@@ -1,7 +1,7 @@
{
"id": "227d2f97-562b-4ccb-ae47-a5ec9e142fbb",
"snapshot": "gimp",
"instruction": "I have my .xcf file saved on Desktop. Could you help me copy the image and paste it into a Libreoffice Writer file? Save it as 'image.docx' on the Desktop.",
"instruction": "I've stored my .xcf file on the Desktop. Can you assist me in copying the image and pasting it into a LibreOffice Writer document? Save the document as 'image.docx' on the Desktop, please.",
"source": "authors",
"config": [
{
@@ -63,7 +63,7 @@
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=11kWQc1XFEqcIMuW0-NnZRSdv1199OmVI&export=download&authuser=0&confirm=t&uuid=694676fd-1ac9-4501-8acf-f48018494c7f&at=APZUnTV-koL51ka5dHum_HpGywv_:1709618406292",
"dest": "image.docx"
"dest": "image_Gold.docx"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "236833a3-5704-47fc-888c-4f298f09f799",
"snapshot": "chrome",
"instruction": "Find daily papers on Huggingface and take down all the titles, authors and the abstracts of papers on 1st March, 2024 in the doc file 'paper_reading_2024_03_01.docx' on desktop. Each paragraph (split by empty lines) conforms to the following format:\nTitle: xxx\nAuthors: xxx, xxx, xxx\nAbstract: xxxxxxxx.\nArxiv PDF: https://xxxx.pdf",
"instruction": "Find the daily paper list on Huggingface and take down the meta information of papers on 1st March, 2024 in the opened .docx file. I have recorded two papers. Please conform to the format and complete others.",
"source": "authors",
"config": [
{
@@ -31,12 +31,24 @@
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1WEDfILO-NijZBGArZ3ovO1933uHeOi1A&export=download&authuser=0&confirm=t",
"path": "/home/user/Desktop/paper_reading_2024_03_01.docx"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"libreoffice",
"--writer"
"--writer",
"/home/user/Desktop/paper_reading_2024_03_01.docx"
]
}
}
@@ -55,7 +67,7 @@
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1TUTihXD93bIlekuYy_44fmXAhI1KVol4&export=download&authuser=0&confirm=t",
"path": "https://drive.usercontent.google.com/download?id=1wb0sQnVDCAz8sS49kO8boJIa1kqI5mx0&export=download&authuser=0&confirm=t",
"dest": "gold_paper_reading_2024_03_01.docx"
},
"options": {

View File

@@ -0,0 +1,49 @@
{
"id": "2373b66a-092d-44cb-bfd7-82e86e7a3b4d",
"snapshot": "multiapps",
"instruction": "I want to understand the resource usage of my Ubuntu system under normal workloads. Please use the `sar` command in the `sysstat` toolkit to monitor system activity, evaluate the status once every second for 30 seconds, output the results to \"System_Resources_Report.txt\" under Desktop.",
"source": "author",
"config": [
{
"type": "command",
"parameters":{
"command": "echo password | sudo -S apt-get update && echo password | sudo -S apt-get install sysstat",
"shell": "true"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"os", "calc"
],
"evaluator": {
"func": ["file_contains", "check_line_number"],
"result":
[
{
"type": "vm_file",
"path": "/home/user/Desktop/System_Resources_Report.txt",
"dest": "System_Resources_Report.txt"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/System_Resources_Report.txt",
"dest": "System_Resources_Report.txt"
}
],
"expected":
[
{
"type": "rule",
"rules" :{
"expected": ["CPU", "%user","%nice","%system", "%iowait", "%steal", "%idle"]
}
},
{
"type": "rule",
"rules": {
"expected": "31"
}
}]
}
}
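
For reference, the monitoring this instruction asks for boils down to a single sysstat invocation; a hedged sketch (assuming `sar` was installed by the config above and the Desktop path exists):

    import subprocess

    # Sample system activity once per second for 30 seconds and write the
    # report where the evaluator expects it; the rules above check for the
    # CPU column headers and a line count of 31.
    with open("/home/user/Desktop/System_Resources_Report.txt", "w") as out:
        subprocess.run(["sar", "1", "30"], stdout=out, check=True)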

View File

@@ -1,25 +1,69 @@
{
    "id": "26660ad1-6ebb-4f59-8cba-a8432dfe8d38",
    "snapshot": "libreoffice_calc",
    "instruction": "I want to test the quality of the network environment my laptop is currently in. Please measure my network situation through speedtest.net, export the measurement results, and save them to ~/Test/Speed.",
    "source": "authors",
    "config": [
    ],
    "trajectory": "trajectories/26660ad1-6ebb-4f59-8cba-a8432dfe8d38",
    "related_apps": [
    ],
    "evaluator": {
        "postconfig": [],
        "func": "",
        "result": {
        },
        "expected": {
        },
        "options": {
        }
    }
}

{
    "id": "26660ad1-6ebb-4f59-8cba-a8432dfe8d38",
    "snapshot": "multiapps",
    "instruction": "I want to test the quality of the network environment my laptop is currently in. Please measure my network situation through speedtest.net, export the measurement results, and save them to ~/Test/Speed (if the dir does not exist, create it).",
    "source": "https://www.speedtest.net/",
    "config": [
        {
            "type": "launch",
            "parameters": {
                "command": [
                    "google-chrome",
                    "--remote-debugging-port=1337"
                ]
            }
        },
        {
            "type": "launch",
            "parameters": {
                "command": [
                    "socat",
                    "tcp-listen:9222,fork",
                    "tcp:localhost:1337"
                ]
            }
        },
        {
            "type": "chrome_open_tabs",
            "parameters": {
                "urls_to_open": [
                    "https://www.speedtest.net/"
                ]
            }
        },
        {
            "type": "activate_window",
            "parameters": {
                "window_name": "Google Chrome"
            }
        },
        {
            "type": "execute",
            "parameters": {
                "command": [
                    "python",
                    "-c",
                    "import pyautogui; import time; time.sleep(0.5);"
                ]
            }
        }
    ],
    "trajectory": "trajectories/",
    "related_apps": [
        "os",
        "browser"
    ],
    "evaluator": {
        "func": "compare_time_in_speedtest_results",
        "result": {
            "type": "vm_file",
            "path": "/home/user/Test/Speed/Speedtest Results Export-.csv",
            "dest": "Speedtest Results Export-.csv",
            "time_suffix": true
        },
        "expected": {
            "type": "time_diff_range",
            "diff_range_in_minutes": "60"
        }
    }
}

View File

@@ -1,7 +1,7 @@
{
"id": "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e",
"snapshot": "libreoffice_calc",
"instruction": "Could you please take a moment to review the 'case study' file located within the 'student work' folder? I'm particularly interested in ensuring that the references section at the end of the document adheres to the APA 7th edition formatting guidelines. If it turns out that the current formatting does not align with APA 7 standards, I would greatly appreciate your assistance in making the necessary adjustments to comply with those guidelines. ",
"instruction": "Could you please take a moment to review the 'case study' file located within the 'student work' folder? I'm particularly interested in ensuring that the references section at the end of the document adheres to the APA 7th edition formatting guidelines. Making the necessary adjustments if it turns out that the current formatting does not align with APA 7 standards or exists some errors.",
"source": "authors",
"config": [
{
@@ -90,13 +90,45 @@
"related_apps": [
],
"evaluator": {
"postconfig": [],
"func": "",
"result": {
},
"postconfig": [
{
"type": "activate_window",
"parameters": {
"window_name": "case study.docx - LibreOffice Writer",
"strict": true
}
},
{
"type": "sleep",
"parameters": {
"seconds": 0.5
}
},
{
"type": "execute",
"parameters": {
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); "
]
}
}
],
"func": "compare_references",
"expected": {
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1325Qfch0JaJ_wJ20ICxMoHeW8KLpK8v0&export=download",
"dest": "case study gold.docx"
},
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/students work/case study.docx",
"dest": "case study.docx"
},
"options": {
"content_only": true,
"reference_base_result": 0.92
}
}
}

View File

@@ -1,26 +1,169 @@
{
"id": "3a93cae4-ad3e-403e-8c12-65303b271818",
"snapshot": "libreoffice_calc",
"instruction": "Could you please add a two-hour lecture slot to my weekly course timetable, scheduled for every Wednesday at 12 PM? It seems I accidentally omitted that when setting up my schedule. I'd appreciate you taking care of that for me. Thanks!",
"source": "authors",
"config": [
],
"trajectory": "trajectories/3a93cae4-ad3e-403e-8c12-65303b271818",
"related_apps": [
],
"evaluator": {
"postconfig": [],
"func": "",
"result": {
},
"expected": {
},
"options": {
"id": "3a93cae4-ad3e-403e-8c12-65303b271818",
"snapshot": "libreoffice_calc",
"instruction": "Could you please add a two-hour lecture slot to my weekly course timetable, scheduled for every Wednesday at 12 PM? It seems I accidentally omitted that when setting up my schedule. I'd appreciate you taking care of that for me. Thanks!",
"source": "authors",
"config": [
{
"type": "command",
"parameters": {
"command": [
"mkdir",
"-p",
"/home/user/Desktop/students work/",
"/home/user/Desktop/Lec powerpoint/",
"/home/user/Desktop/Grammar test/",
"/home/user/Desktop/Grammar rules PDF/",
"/home/user/Desktop/FDI/"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"path": "/home/user/Desktop/students work/Zheng He .docx",
"url": "https://drive.google.com/uc?id=1wI4141LAthnY5m6qcCUaGgDooe4wiTgz&export=download"
},
{
"path": "/home/user/Desktop/students work/cassie.docx",
"url": "https://drive.google.com/uc?id=1cW9TGJy56vossXxDsdnutPyCbR70af7M&export=download"
},
{
"path": "/home/user/Desktop/students work/case study.docx",
"url": "https://drive.google.com/uc?id=11GzpoZvp4qnL2ukXdpbhH-a3zOIHhtDx&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/irregularrules02.pdf",
"url": "https://drive.google.com/uc?id=1Eln9ehX6y6Df2-S_Hp7Ao1teKRu6I1Tg&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/irregularrules01.pdf",
"url": "https://drive.google.com/uc?id=1krdEEdNWvTwMKZU14QtI_xc2lCFVeVcl&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/fragrules.pdf",
"url": "https://drive.google.com/uc?id=1IXyI2KeiXsuh6XV2LelcmhZ2PDh_dBQf&export=download"
},
{
"path": "/home/user/Desktop/Grammar rules PDF/csfsrules.pdf",
"url": "https://drive.google.com/uc?id=1ernwGGrjhYNoHVNAevdb2qNKQ0I5n3RP&export=download"
},
{
"path": "/home/user/Desktop/Public Lecture Teaching Plan.docx",
"url": "https://drive.google.com/uc?id=1ywfVFTEbiSkypZpzLjLmq_ppSbQIC8s8&export=download"
},
{
"path": "/home/user/Desktop/Course Timetable.xlsx",
"url": "https://drive.google.com/uc?id=1DSjRYgofPK2jldKwIsAygz2x8XWlXCK6&export=download"
}
]
}
}
],
"trajectory": "trajectories/3a93cae4-ad3e-403e-8c12-65303b271818",
"related_apps": [
"os",
"libreoffice_calc"
],
"evaluator": {
"postconfig": [
{
"type": "activate_window",
"parameters": {
"window_name": "Course Timetable.xlsx - LibreOffice Calc",
"strict": true
}
        },
{
"type": "sleep",
"parameters": {
"seconds": 0.5
}
},
{
"type": "execute",
"parameters": {
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5); "
]
}
}
],
"func": [
"compare_table",
"compare_table",
"compare_table"
],
"result": [
{
"type": "vm_file",
"path": "/home/user/Desktop/Course Timetable.xlsx",
"dest": "Course Timetable.xlsx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/Course Timetable.xlsx",
"dest": "Course Timetable.xlsx"
},
{
"type": "vm_file",
"path": "/home/user/Desktop/Course Timetable.xlsx",
"dest": "Course Timetable.xlsx"
}
],
"expected": [
{
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1VMOon8byWuoCW2Uk5etGMJLMzAfwFVyB&export=download",
"dest": "Course Timetable gold.xlsx"
},
{
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1jAThiIqILZ5t-RFPHVniSvAL8ZJO1H3P&export=download",
"dest": "Course Timetable gold 2.xlsx"
},
{
"type": "cloud_file",
"path": "https://drive.google.com/uc?id=1U0THDtPCgsw-Rb0N9fjF8DeOepPeUajP&export=download",
"dest": "Course Timetable gold 3.xlsx"
}
],
"options": [
{
"rules": [
{
"type": "sheet_data",
"sheet_idx0": "RNSheet1",
"sheet_idx1": "ENSheet1",
"ignore_case": true
}
]
},
{
"rules": [
{
"type": "sheet_data",
"sheet_idx0": "RNSheet1",
"sheet_idx1": "ENSheet1",
"ignore_case": true
}
]
},
{
"rules": [
{
"type": "sheet_data",
"sheet_idx0": "RNSheet1",
"sheet_idx1": "ENSheet1",
"ignore_case": true
}
]
}
],
"conj": "or"
}
}

View File

@@ -0,0 +1,37 @@
{
"id": "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"snapshot": "gimp",
"instruction": "Download the image from \"https://drive.google.com/uc?export=download&id=1i8j5dGS57sA07jEuPNAlQW-sn5uqUnuK\", and then use GIMP to compress it to under 600KB. Resize if needed.",
"source": "",
"config": [
{
"type": "execute",
"parameters": {
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey(\"ctrl\", \"alt\", \"t\"); time.sleep(0.5);"
]
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"gimp",
"os"
],
"evaluator": {
"func": "check_image_file_size",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/compressed.jpeg",
"dest": "compressed.jpeg"
},
"expected": {
"type": "rule",
"rules": {
"max_size": 600000
}
}
}
}

View File

@@ -0,0 +1,83 @@
{
"id": "3e3fc409-bff3-4905-bf16-c968eee3f807",
"snapshot": "chrome",
"instruction": "I'm a huge movie fan and have kept a record of all the movies I've watched. I'm curious to find out if there are any films released before 2024 from the IMDB Top 30 list that I haven't seen yet. Help me create another sheet 'unseen_movies' in the opened Excel. This sheet should share the same headers and sort the results according to IMDB rankings from high to low.",
"source": "authors",
"config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": [
"https://www.imdb.com"
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1KVNVf5qZhprV_7rgEl33Qrkagv603reM&export=download&authuser=0&confirm=t",
"path": "/home/user/Desktop/movies.xlsx"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"libreoffice",
"--calc",
"/home/user/Desktop/movies.xlsx"
]
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_calc",
"chrome"
],
"evaluator": {
"func": "compare_table",
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/movies.xlsx",
"dest": "movies.xlsx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=149QKswQ8AIYk21Aaatic6QSCcBU40uyd&export=download&authuser=0&confirm=t",
"dest": "gold_movies.xlsx"
},
"options": {
"rules": [
{
"type": "sheet_data",
"sheet_idx0": "RNunseen_movies",
"sheet_idx1": "ENunseen_movies"
}
]
}
}
}

View File

@@ -0,0 +1,52 @@
{
"id": "3eb2a122-a5e3-4f89-9820-f7fa1a582969",
"snapshot": "multiapps",
"instruction": "Please search online for the submission deadline and venue of the ICLR main conference in 2035, and copy it to my clipboard. If not yet publicized, copy None.",
"source": "author",
"config": [
{
"type": "launch",
"parameters": {
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"socat",
"tcp-listen:9222,fork",
"tcp:localhost:1337"
]
}
},
{
"type": "command",
"parameters":{
"command": "echo password | sudo -S apt install xsel && xsel -bc",
"shell": "true"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"os", "chrome"
],
"evaluator": {
"func": "is_in_vm_clickboard",
"expected": {
"type": "vm_command_line",
"command": "xsel --clipboard --output",
"shell": "true"
},
"result": {
"type": "rule",
"rules": {
"expected": ["None"]
}
}
}
}

View File

@@ -0,0 +1,71 @@
{
"id": "42f4d1c7-4521-4161-b646-0a8934e36081",
"snapshot": "gimp",
"instruction": "Configure VS Code to edit GIMP script-fu scripts effectively by installing lisp extension. Test by writing code to resize the image \"character.png\" to 128 * 128 as \"resized.png\".",
"source": "",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?export=download&id=1yrWU5HimYPNUjdtvw1a218kh50fPVtZ3",
"path": "/home/user/Desktop/character.png"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"code"
]
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"gimp",
"vs_code"
],
"evaluator": {
"func": [
"is_extension_installed",
"check_image_size"
],
"result": [
{
"type": "vm_command_line",
"command": [
"code",
"--list-extensions",
"|",
"grep",
"mattn.lisp"
]
},
{
"type": "vm_file",
"path": "/home/user/Desktop/resized.png",
"dest": "resized.png"
}
],
"expected": [
{
"type": "rule",
"rules": {
"type": "contain",
"expected": "mattn.lisp"
}
},
{
"type": "rule",
"rules": {
"height": 128,
"width": 128
}
}
]
}
}

Some files were not shown because too many files have changed in this diff