Merge main

This commit is contained in:
BlankCheng
2024-03-18 22:21:01 +08:00
133 changed files with 1845 additions and 8812 deletions

19
.vscode/launch.json vendored Normal file
View File

@@ -0,0 +1,19 @@
{
// Use IntelliSense to learn about possible attributes.
// Hover to view descriptions of existing attributes.
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
"version": "0.2.0",
"configurations": [
{
"name": "Python Debugger: Current File with Arguments",
"type": "debugpy",
"request": "launch",
"program": "${file}",
"console": "integratedTerminal",
"args": [
"--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx"
// "--example_time_limit", "60"
]
}
]
}

View File

@@ -21,10 +21,12 @@
Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnngb3Wf1-RiwMMpXTftwMqP2Ztak/edit#heading=h.uh0x0tkl7fuw)
2. Install the environment package, download the examples and the virtual machine image.
For x86_64 Linux or Windows, you can install the environment package and download the examples and the virtual machine image by running the following commands:
```bash
pip install desktop-env
gdown xxxx
gdown xxxx
vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui
vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state"
```
## Quick Start

View File

@@ -263,16 +263,19 @@ class PythonController:
"""
Ends recording the screen.
"""
response = requests.post(self.http_server + "/end_recording")
if response.status_code == 200:
logger.info("Recording stopped successfully")
with open(dest, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
else:
logger.error("Failed to stop recording. Status code: %d", response.status_code)
return None
try:
response = requests.post(self.http_server + "/end_recording")
if response.status_code == 200:
logger.info("Recording stopped successfully")
with open(dest, 'wb') as f:
for chunk in response.iter_content(chunk_size=8192):
if chunk:
f.write(chunk)
else:
logger.error("Failed to stop recording. Status code: %d", response.status_code)
return None
except Exception as e:
logger.error("An error occurred while trying to download the recording: %s", e)
# Additional info
def get_vm_platform(self):

View File

@@ -53,8 +53,8 @@ class DesktopEnv(gym.Env):
def __init__(
self,
path_to_vm: str,
snapshot_name: str = "init_state",
action_space: str = "computer_13",
task_config: Dict[str, Any] = None,
tmp_dir: str = "tmp",
cache_dir: str = "cache",
screen_size: Tuple[int] = (1920, 1080),
@@ -64,15 +64,6 @@ class DesktopEnv(gym.Env):
Args:
path_to_vm (str): path to .vmx file
action_space (str): "computer_13" | "pyautogui"
task_config (Dict[str, Any]): manages task configs integratedly,
including
* base snapshot
* task id (uuid)
* instruction
* setup config
* evaluator config
tmp_dir (str): temporary directory to store trajectory stuffs like
the extracted screenshots
cache_dir (str): cache directory to cache task-related stuffs like
@@ -81,23 +72,20 @@ class DesktopEnv(gym.Env):
# Initialize environment variables
self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm)))
self.snapshot_name = snapshot_name
self.tmp_dir_base: str = tmp_dir
self.cache_dir_base: str = cache_dir
self.vm_screen_size = screen_size
self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM
self.headless = headless
os.makedirs(self.tmp_dir_base, exist_ok=True)
# task-aware stuffs
# todo: handling the logic of snapshot directory
self._set_task_info(task_config)
# Initialize emulator and controller
logger.info("Initializing...")
self._start_emulator()
self.vm_ip = self._get_vm_ip()
self.controller = PythonController(vm_ip=self.vm_ip)
self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir)
self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir_base)
# Meta info of the VM, move to the reset() function
self.vm_platform: str = "" # self.controller.get_vm_platform()
@@ -147,7 +135,7 @@ class DesktopEnv(gym.Env):
raise Exception("Failed to get VM IP address!")
def _save_state(self):
_execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
_execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_name])
def _get_screenshot(self):
# random_uuid = str(uuid.uuid4())
@@ -167,7 +155,6 @@ class DesktopEnv(gym.Env):
return screenshot_image_path
def _set_task_info(self, task_config: Dict[str, Any]):
self.snapshot_path = task_config["snapshot"]
self.task_id: str = task_config["id"]
self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id)
os.makedirs(self.cache_dir, exist_ok=True)
@@ -187,7 +174,7 @@ class DesktopEnv(gym.Env):
if isinstance(self.evaluator["func"], list) \
else getattr(metrics, self.evaluator["func"])
self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics
if "result" in self.evaluator:
if "result" in self.evaluator and len(self.evaluator["result"])>0:
self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in
self.evaluator["result"]] \
if isinstance(self.evaluator["result"], list) \
@@ -197,7 +184,7 @@ class DesktopEnv(gym.Env):
if isinstance(self.metric, list) \
else None
if "expected" in self.evaluator:
if "expected" in self.evaluator and len(self.evaluator["expected"])>0:
self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in
self.evaluator["expected"]] \
if isinstance(self.evaluator["expected"], list) \
@@ -239,8 +226,8 @@ class DesktopEnv(gym.Env):
)
os.makedirs(os.path.join(self.tmp_dir, "screenshots"))
logger.info("Reverting to snapshot to {}...".format(self.snapshot_path))
_execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
logger.info("Reverting to snapshot to {}...".format(self.snapshot_name))
_execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_name])
time.sleep(5)
print(self.vm_screen_size)

View File

@@ -114,7 +114,8 @@ from .slides import (
)
from .table import (
compare_table,
compare_csv
compare_csv,
compare_conference_city_in_order
)
from .thunderbird import (
check_thunderbird_prefs,
@@ -148,7 +149,6 @@ from .vscode import (
check_html_background_image,
compare_zip_files
)
from .calc import compare_conference_city_in_order
from .others import compare_epub, check_mp3_meta
def infeasible():

View File

@@ -1,6 +1,3 @@
import subprocess
def check_gnome_favorite_apps(apps_str: str, rule):
# parse the string like "['thunderbird.desktop', 'vim.desktop', 'google-chrome.desktop']"
# to a list of strings
@@ -57,6 +54,7 @@ def check_moved_jpgs(directory_list, rule):
else:
return 0
def is_in_vm_clickboard(config, terminal_output):
print("terminal_output: ")
print(terminal_output)
@@ -67,4 +65,4 @@ def is_in_vm_clickboard(config, terminal_output):
if not isinstance(expected_results, list):
return 1 if expected_results in terminal_output else 0
else:
return 1 if all(result in terminal_output for result in expected_results) else 0
return 1 if all(result in terminal_output for result in expected_results) else 0

View File

@@ -1,41 +0,0 @@
import logging
from typing import List
import openpyxl
logger = logging.getLogger("desktopenv.metrics.calc")
def compare_conference_city_in_order(actual_city_list_path, expected_city):
    """Check that column C of a workbook lists the expected conference cities in order.

    Reads cells C2:C22 of the active sheet of the workbook at
    ``actual_city_list_path`` and compares them, position by position, against
    ``expected_city["expected"]``.

    Args:
        actual_city_list_path: path to the .xlsx file produced by the agent.
        expected_city: dict with key ``"expected"`` mapping to a list whose
            items are either a substring that must appear in the corresponding
            cell, or a list of acceptable substrings (any one may match).

    Returns:
        float: 1.0 if every cell matches its expected entry in order,
        0.0 on any mismatch or on any error (e.g. fewer expected entries
        than cells, or a None cell value).
    """
    expected_city_list = expected_city["expected"]
    wb = openpyxl.load_workbook(actual_city_list_path)
    sheet = wb.active
    actual_city_list = []
    # NOTE(review): the C2:C22 range is hard-coded — assumes exactly 21 rows of cities.
    for row in sheet["C2:C22"]:
        for cell in row:
            actual_city_list.append(cell.value)
    # The expected cities must match the actual list index-by-index (order matters).
    try:
        for i in range(len(actual_city_list)):
            if isinstance(expected_city_list[i], str):
                # Single acceptable substring for this position.
                if expected_city_list[i] not in actual_city_list[i]:
                    logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    return 0.
            elif isinstance(expected_city_list[i], List):
                # Several acceptable substrings; any one of them may match.
                if not any(possible_str in actual_city_list[i] for possible_str in expected_city_list[i]):
                    logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
                    return 0.
            else:
                raise TypeError("Expected city should be a string or a list of strings")
    except Exception:
        # Was a bare `except:`, which also swallowed SystemExit/KeyboardInterrupt.
        # Any failure (index/type errors, None cell values) scores 0.
        return 0.
    return 1.

View File

@@ -1,28 +0,0 @@
import fitz # PyMuPDF
def extract_answers_from_pdf(pdf_file):
    """Extract the text after the last '=' on every non-blank line of a PDF.

    Args:
        pdf_file: path to the PDF file to read.

    Returns:
        list[str]: the stripped right-hand sides of all '=' lines,
        in document order.
    """
    # Open the PDF file
    doc = fitz.open(pdf_file)
    answers = []
    # Iterate over every page
    for page in doc:
        # Extract the text of the current page
        text = page.get_text()
        # Split the text into lines
        lines = text.split('\n')
        for line in lines:
            if line.strip():  # skip blank lines
                # Split on '=' and take what follows as the answer
                parts = line.split('=')
                if len(parts) > 1:
                    answer = parts[-1].strip()  # the part after the last '=' is the answer
                    answers.append(answer)
    return answers
# 假设你的文件名是'math_problems.pdf'
pdf_file = '/Users/lxc/Desktop/calculus.pdf'
answers = extract_answers_from_pdf(pdf_file)
for i, answer in enumerate(answers, 1):
print(f"题目{i}的答案是: {answer}")

View File

@@ -26,13 +26,3 @@ def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float:
for ptn in rules["locale_set"]
)
)
if __name__ == "__main__":
path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu"
print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*"
, "pt-*", "es-*", "it-*"
]
}
)
)

View File

@@ -1,20 +1,20 @@
import zipfile
import os.path
import logging
import os
import os.path
import zipfile
from typing import List, Dict
from typing import Union, TypeVar
import lxml.html
from lxml.html import HtmlElement
from typing import List, Dict
from typing import Union, TypeVar
from mutagen.easyid3 import EasyID3
from .general import diff_text_file
from .utils import _match_value_to_rule
import logging
logger = logging.getLogger("desktopenv.metric.others")
def process_epub(filename: str) -> List[str]:
file_list: List[str] = []
@@ -23,7 +23,7 @@ def process_epub(filename: str) -> List[str]:
try:
with zipfile.ZipFile(filename, "r") as z_f:
with z_f.open("toc.ncx") as in_f\
with z_f.open("toc.ncx") as in_f \
, open(os.path.join(base_dir, "toc.ncx"), "w") as out_f:
contents: str = in_f.read().decode()
contents = contents.splitlines()
@@ -31,7 +31,7 @@ def process_epub(filename: str) -> List[str]:
if "navPoint" not in l:
out_f.write(l + "\n")
file_list.append(os.path.join(base_dir, "toc.ncx"))
with z_f.open("content.opf") as in_f\
with z_f.open("content.opf") as in_f \
, open(os.path.join(base_dir, "content.opf"), "w") as out_f:
contents: str = in_f.read().decode()
contents = contents.splitlines()
@@ -41,14 +41,14 @@ def process_epub(filename: str) -> List[str]:
file_list.append(os.path.join(base_dir, "content.opf"))
for f_n in z_f.namelist():
if f_n.endswith(".html"):
with z_f.open(f_n) as in_f\
with z_f.open(f_n) as in_f \
, open(os.path.join(base_dir, f_n), "w") as out_f:
html: HtmlElement = lxml.html.fromstring(
''.join( filter( lambda ch: ch!="\n" and ch!="\r"
, in_f.read().decode()
)
).encode()
)
''.join(filter(lambda ch: ch != "\n" and ch != "\r"
, in_f.read().decode()
)
).encode()
)
out_f.write(lxml.html.tostring(html, pretty_print=True, encoding="unicode"))
file_list.append(os.path.join(base_dir, f_n))
logger.debug("%s: %s", filename, file_list)
@@ -56,6 +56,7 @@ def process_epub(filename: str) -> List[str]:
except zipfile.BadZipFile:
return []
def compare_epub(result: str, expected: str) -> float:
if result is None:
return 0.
@@ -69,8 +70,10 @@ def compare_epub(result: str, expected: str) -> float:
metric *= current_metric
return metric
V = TypeVar("Value")
def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bool:
# checks using _match_value_to_rule
if result is None:
@@ -85,44 +88,3 @@ def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bo
logger.debug("%s.%s: %s", result, k, value)
metric = metric and _match_value_to_rule(value, r)
return float(metric)
if __name__ == "__main__":
import datetime
import sys
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
metric = check_mp3_meta( "snapshots/test/cache/3f05f3b9-29ba-4b6b-95aa-2204697ffc06/Cheng Xiang - Missing You - gt.mp3"
, { "title": { "method": "eq"
, "ref": "Missing You"
}
, "artist": { "method": "eq"
, "ref": "Cheng Xiang"
}
}
)
print(metric)

View File

@@ -2,6 +2,7 @@ import operator
from typing import Any
from typing import Dict
import fitz # PyMuPDF
from pypdf import PdfReader
@@ -11,3 +12,20 @@ def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float:
reader = PdfReader(pdf_file)
nb_pages: int = len(reader.pages)
return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"]))
def extract_answers_from_pdf(pdf_file):
    """Collect the right-hand side of every '=' line in a PDF.

    Opens *pdf_file* with PyMuPDF, walks every page's text line by line,
    and for each non-blank line containing '=' records the stripped text
    after the last '='.

    Args:
        pdf_file: path to the PDF document.

    Returns:
        list[str]: extracted answers in document order.
    """
    collected = []
    for page in fitz.open(pdf_file):
        for raw_line in page.get_text().split('\n'):
            if not raw_line.strip():
                continue
            segments = raw_line.split('=')
            if len(segments) > 1:
                collected.append(segments[-1].strip())
    return collected

View File

@@ -165,23 +165,24 @@ def compare_pptx_files(file1_path, file2_path, **options):
# compare the content of each slide
for slide1, slide2 in zip(prs1.slides, prs2.slides):
slide_idx += 1
def get_slide_background_color(slide):
background = slide.background
if background.fill.background():
return background.fill.fore_color.rgb
else:
return None
if get_slide_background_color(slide1) != get_slide_background_color(slide2) and examine_background_color:
return 0
def get_slide_notes(slide):
notes_slide = slide.notes_slide
if notes_slide:
return notes_slide.notes_text_frame.text
else:
return None
if get_slide_notes(slide1).strip() != get_slide_notes(slide2).strip() and examine_note:
return 0
# check if the shapes are the same
@@ -192,14 +193,14 @@ def compare_pptx_files(file1_path, file2_path, **options):
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_table_bottom_position:
if slide_idx == 3 and shape1.shape_type == 19 and shape2.shape_type == 19:
if shape1.top <= shape2.top or shape1.top < 3600000:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_right_position:
if slide_idx == 2 and not hasattr(shape1, "text") and not hasattr(shape2, "text"):
if shape1.left <= shape2.left or shape1.left < 4320000:
@@ -207,28 +208,31 @@ def compare_pptx_files(file1_path, file2_path, **options):
if examine_top_position:
if slide_idx == 2 and shape1.shape_type == 13 and shape2.shape_type == 13:
if shape1.top >= shape2.top or shape1.top > 1980000:
return 0
if shape1.top >= shape2.top or shape1.top > 1980000:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_shape_for_shift_size:
if shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
if not (hasattr(shape1, "text") and hasattr(shape2, "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."):
if not (hasattr(shape1, "text") and hasattr(shape2,
"text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."):
return 0
if (shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape:
if (
shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape:
return 0
if examine_image_size:
if shape1.shape_type == 13 and shape2.shape_type == 13:
if shape1.width != shape2.width or shape1.height != shape2.height:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
return 0
if examine_modify_height:
if not hasattr(shape1, "text") and not hasattr(shape2, "text") or shape1.shape_type == 5 and shape2.shape_type == 5:
if not hasattr(shape1, "text") and not hasattr(shape2,
"text") or shape1.shape_type == 5 and shape2.shape_type == 5:
if shape1.height != shape2.height:
return 0
elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height:
@@ -236,13 +240,13 @@ def compare_pptx_files(file1_path, file2_path, **options):
if hasattr(shape1, "text") and hasattr(shape2, "text"):
if shape1.text.strip() != shape2.text.strip() and examine_text:
return 0
# check if the paragraphs are the same
return 0
# check if the paragraphs are the same
for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs):
if para1.alignment != para2.alignment and examine_alignment:
return 0
# check if the runs are the same
if para1.text != para2.text and examine_text:
return 0
@@ -253,7 +257,7 @@ def compare_pptx_files(file1_path, file2_path, **options):
for run1, run2 in zip(para1.runs, para2.runs):
# check if the font properties are the same
if run1.font.name != run2.font.name and examine_font_name:
if run1.font.name != run2.font.name and examine_font_name:
return 0
if run1.font.size != run2.font.size and examine_font_size:
@@ -305,10 +309,9 @@ def compare_pptx_files(file1_path, file2_path, **options):
return bullets
if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(run2.part.blob.decode('utf-8')):
if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(
run2.part.blob.decode('utf-8')):
return 0
# fixme: Actually there are more properties to be compared, we can add them later via parsing the xml data
@@ -524,15 +527,3 @@ def check_auto_saving_time(pptx_file, rules):
logger.error(f"Error parsing XML: {e}")
except FileNotFoundError:
logger.error(f"File not found: {pptx_file}")
if __name__ == '__main__':
# print(compare_pptx_files(
# r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx",
# r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx"))
# print(evaluate_presentation_fill_to_rgb_distance(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\3b27600c-3668-4abd-8f84-7bcdebbccbdb\lec17-gui-events.pptx", {"rgb": (0, 0, 255)}))
# print(check_auto_saving_time(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\2cd43775-7085-45d8-89fa-9e35c0a915cf\registrymodifications.xcu", {"minutes": 3}))
print(compare_pptx_files(
r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6_Gold.pptx",
r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6.pptx",
examine_shape=False))

View File

@@ -11,15 +11,15 @@ import openpyxl
import pandas as pd
from openpyxl import Workbook
from openpyxl.cell.cell import Cell
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.utils import get_column_letter
from openpyxl.worksheet.cell_range import MultiCellRange
from openpyxl.worksheet.datavalidation import DataValidation
from openpyxl.worksheet.worksheet import Worksheet
from rapidfuzz import fuzz
from desktop_env.evaluators.metrics.utils import _match_value_to_rule, _read_cell_style, read_cell_value
from desktop_env.evaluators.metrics.utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles \
, load_filters, load_pivot_tables
from rapidfuzz import fuzz
# from openpyxl.utils import coordinate_to_tuple
@@ -165,7 +165,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
logger.debug("Sheet1: \n%s", str(sheet1))
logger.debug("Sheet2: \n%s", str(sheet2))
try:
logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1==sheet2))
logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1 == sheet2))
except:
logger.debug("Sheet1 =/v= Sheet2")
logger.debug("Assertion: %s =v= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
@@ -231,14 +231,14 @@ def compare_table(result: str, expected: str = None, **options) -> float:
value1 = value1.lower()
value2 = value2.lower()
if rl["type"]=="includes":
if rl["type"] == "includes":
metric: bool = value2 in value1
elif rl["type"]=="included_by":
elif rl["type"] == "included_by":
metric: bool = value1 in value2
elif rl["type"]=="fuzzy_match":
elif rl["type"] == "fuzzy_match":
metric: bool = fuzz.ratio(value1, value2) >= rl.get("threshold", 85.)
elif rl["type"]=="exact_match":
metric: bool = value1==value2
elif rl["type"] == "exact_match":
metric: bool = value1 == value2
total_metric = total_metric and metric
metric: bool = total_metric
@@ -409,7 +409,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
filters1: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r)
filters2: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r)
metric: bool = filters1==filters2
metric: bool = filters1 == filters2
logger.debug("Assertion: %s[filter] == %s[filter] - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
# }}} Compare Filters #
@@ -421,7 +421,7 @@ def compare_table(result: str, expected: str = None, **options) -> float:
pivots1: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r)
pivots2: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r)
metric: bool = pivots1==pivots2
metric: bool = pivots1 == pivots2
logger.debug("Assertion: %s[pivot]==%s[pivot] - %s", r["sheet_idx0"], r["sheet_idx1"], metric)
# }}} Compare Pivot Tables #
@@ -482,81 +482,36 @@ def compare_csv(result: str, expected: str, **options) -> float:
return float(metric)
if __name__ == '__main__':
import datetime
import sys
def compare_conference_city_in_order(actual_city_list_path, expected_city):
expected_city_list = expected_city["expected"]
wb = openpyxl.load_workbook(actual_city_list_path)
sheet = wb.active
actual_city_list = []
for row in sheet["C2:C22"]:
for cell in row:
actual_city_list.append(cell.value)
# expected_city is the city that we want to compare with the actual city list
# must in order index
# debug
try:
for i in range(len(actual_city_list)):
if isinstance(expected_city_list[i], str):
if expected_city_list[i] not in actual_city_list[i]:
logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
return 0.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
elif isinstance(expected_city_list[i], List):
if not any(possible_str in actual_city_list[i] for possible_str in expected_city_list[i]):
logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}")
return 0.
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))
else:
raise TypeError("Expected city should be a string or a list of strings")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
except:
return 0.
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
path1 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday.xlsx"
path2 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday_gold.xlsx"
rules = [ { "type": "sheet_data"
, "sheet_idx0": 0
, "sheet_idx1": "EI0"
}
]
print(compare_table(path1, path2
, rules=rules
)
)
print(compare_table(path2, path2
, rules=rules
)
)
# Row Properties
# path1 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA.xlsx"
# path2 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA_gold.xlsx"
# workbook: Workbook = openpyxl.load_workbook(filename=path1)
# worksheet: Worksheet = workbook.active
# for r_no, dms in worksheet.column_dimensions.items():
# print(r_no, type(r_no), type(dms), dms.hidden)
# Conditional Formats
# import formulas
# path1 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days.xlsx"
# path2 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold.xlsx"
# path3 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold_test.xlsx"
# workbook: Workbook = openpyxl.load_workbook(filename=path2)
# worksheet: Worksheet = workbook.active
# print(worksheet.conditional_formatting)
# for itm in worksheet.conditional_formatting:
# print(itm.cells)
# for r in itm.rules:
# print( r.type, r.formula, r.dxf.font.color.rgb
# , r.dxf.fill.fgColor.rgb, r.dxf.fill.bgColor.rgb
# )
# condition = formulas.Parser().ast("=" + r.formula[0])[1].compile()
##print(r.type, r.operator, r.dxfId, r.dxf)
# for r in itm.cells:
# for c in r.cells:
# value = worksheet.cell(row=c[0], column=c[1]).value
# print(value, condition(str(value)))
return 1.

View File

@@ -1,17 +1,19 @@
import json
import logging
import re
from typing import List, Pattern, Dict, Match
from typing import Union, Any, TypeVar, Callable
import re
import json
from .utils import _match_record
from .utils import _match_value_to_rule as _match_pref
import logging
logger = logging.getLogger("desktopenv.metric.thunderbird")
V = TypeVar("Value")
_pref_pattern: Pattern[str] = re.compile(r'^user_pref\("(?P<key>(?:[^"]|\\")+)\", (?P<val>.+)\);$');
def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any]]]):
"""
Args:
@@ -51,10 +53,10 @@ def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any
continue
key: str = match_.group("key")
#value: str = match_.group("val")
#if value in {"true", "false"}:
#value = value.title()
#value: V = eval(value)
# value: str = match_.group("val")
# if value in {"true", "false"}:
# value = value.title()
# value: V = eval(value)
value = json.loads(match_.group("val"))
if key in expect_rules:
logger.debug("K: %s, V: %s", key, repr(value))
@@ -64,9 +66,13 @@ def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any
return float(all(expect_metrics.values()) and unexpect_metric)
_value_processor: Callable[[str], str] = lambda val: val.replace("\\\"", "\"").replace("\\\\", "\\")
#_condition_pattern: Pattern[str] = re.compile(r'(?P<type>AND|OR) \((?P<key>[\w ]+),(?P<rel>[\w ' + '\'' + r']+),(?:"(?P<val2>(?:[^"]|\")+)"|(?P<val1>[^)]+))\)')
_condition_pattern: Pattern[str] = re.compile(r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b')
# _condition_pattern: Pattern[str] = re.compile(r'(?P<type>AND|OR) \((?P<key>[\w ]+),(?P<rel>[\w ' + '\'' + r']+),(?:"(?P<val2>(?:[^"]|\")+)"|(?P<val1>[^)]+))\)')
_condition_pattern: Pattern[str] = re.compile(
r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b')
def check_thunderbird_filter(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float:
"""
Args:
@@ -112,8 +118,8 @@ def check_thunderbird_filter(result: str, rules: Dict[str, List[Dict[str, str]]]
condition_str: str = _value_processor(l[11:-2])
logger.debug("FILTER CONDITION: %s", condition_str)
conditions: List[str] =\
_condition_pattern.findall(condition_str)
conditions: List[str] = \
_condition_pattern.findall(condition_str)
logger.debug("FILTER CONDITIONS: %s", repr(conditions))
filter_["condition"] = conditions
@@ -138,6 +144,7 @@ def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str
remove_deleted (bool): ignore deleted messages which has status code 0008 or 0009. default: True
remove_duplicate (bool): remove duplicate messages. default: True
"""
def normalize_msg(msg, options):
ignore_status = options.get('ignore_status', False)
ignore_keys = options.get('ignore_keys', False)
@@ -167,66 +174,3 @@ def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str
mail2 = read_thunderbird_folder_file(gold)
if mail1 != mail2: return .0
return 1.0
if __name__ == "__main__":
    # Manual smoke test for check_thunderbird_filter.  The commented-out
    # snippets below are exploratory AT-SPI / CSS-selector probes of the
    # Thunderbird accessibility tree, kept for reference while debugging
    # the table/card views; they are intentionally not executed.
    #import lxml.etree
    #from lxml.cssselect import CSSSelector
    #from lxml.etree import _Element
    #xml = "../../任务数据/Thunderbird/vertical-card-view.xml"
    #xml = "../../任务数据/Thunderbird/vertical-table-view.xml"
    #at: _Element = lxml.etree.parse(xml)
    #elements: List[_Element] = CSSSelector('application[name=Thunderbird] page-tab-list')(at)  # page tab tags
    #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]')(at)  # email tag page
    #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section:nth-child(3)')(at)  # email tag page
    #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>section[attr|class~="tree-table-header"]>table-row>column-header[name=Subject]>push-button', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at)  # table view, column header
    #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>tree>tree-item>section[name="Subject"]>section>section', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at)  # table view, column header
    #print(len(elements))
    #for elm in elements:
    #print(lxml.etree.tostring(elm, encoding="unicode", pretty_print=True))
    import datetime
    import os
    import sys

    # Root-logger setup mirroring the project's runner scripts: four sinks
    # (normal log, debug log, stdout, filtered debug log), all sharing one
    # ANSI-colored format string.  NOTE(review): `logging` itself is assumed
    # to be imported earlier in this file — confirm before running standalone.
    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Timestamp suffix so each manual run writes fresh log files under logs/.
    datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

    file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)))
    debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)))
    stdout_handler = logging.StreamHandler(sys.stdout)
    sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)))

    file_handler.setLevel(logging.INFO)
    debug_handler.setLevel(logging.DEBUG)
    stdout_handler.setLevel(logging.INFO)
    sdebug_handler.setLevel(logging.DEBUG)

    formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
    file_handler.setFormatter(formatter)
    debug_handler.setFormatter(formatter)
    stdout_handler.setFormatter(formatter)
    sdebug_handler.setFormatter(formatter)

    # Only stdout and the "sdebug" file are restricted to desktopenv records;
    # the other two sinks capture every logger.
    stdout_handler.addFilter(logging.Filter("desktopenv"))
    sdebug_handler.addFilter(logging.Filter("desktopenv"))

    logger.addHandler(file_handler)
    logger.addHandler(debug_handler)
    logger.addHandler(stdout_handler)
    logger.addHandler(sdebug_handler)

    # Drive the evaluator once against a sample msgFilterRules.dat: expects a
    # single enabled filter that moves discount-subject mail to Promotions.
    print( check_thunderbird_filter( "../../任务数据/Thunderbird/msgFilterRules.dat"
                                   , { "expect": [ { "enabled": "yes"
                                                   , "action": "Move to folder"
                                                   , "actionValue": "mailbox://nobody@Local%20Folders/Promotions"
                                                   , "condition": ["AND (subject,contains,discount)"]
                                                   }
                                                 ]
                                     }
                                   )
         )

View File

@@ -236,6 +236,9 @@ def check_html_background_image(src_path: str, rule: Dict = None) -> float:
Check if the background image is correctly set.
multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108
"""
if not src_path:
return 0.0
from bs4 import BeautifulSoup
with open(src_path, 'r') as f:
html_content = f.read()
@@ -252,6 +255,9 @@ def compare_result_files(src_path, tgt_path):
Compare whether the content of two files are the same.
multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85
"""
if not src_path or not tgt_path:
return 0.0
with open(src_path, 'r') as f:
src_content = f.read().strip()
with open(tgt_path, 'r') as f:
@@ -271,12 +277,3 @@ def compare_result_files(src_path, tgt_path):
if src_content == tgt_content:
return 1.0
return 0.0
if __name__ == "__main__":
    # Manual smoke test for check_html_background_image against a cached
    # task artifact; prints the evaluator score for a single rule.
    src_path = "../../../cache/bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108/index.html"
    rule = {
        # Bug fix: the key was written as "type:" (stray trailing colon),
        # so the evaluator would never see a "type" entry in the rule dict.
        "type": "value",
        "value": "anmi_sharper.png"
    }
    print(check_html_background_image(src_path, rule))

View File

@@ -63,7 +63,7 @@ def execute_command():
# Execute the command without any safety checks.
try:
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True)
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True, timeout=120)
return jsonify({
'status': 'success',
'output': result.stdout,
@@ -117,7 +117,7 @@ def launch_app():
def capture_screen_with_cursor():
# fixme: when running on virtual machines, the cursor is not captured, don't know why
file_path = os.path.join("screenshots", "screenshot.png")
file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png")
user_platform = platform.system()
# Ensure the screenshots directory exists
@@ -284,6 +284,15 @@ def _create_atspi_node(node: Accessible, depth: int = 0, flag: Optional[str] = N
text = text.replace("\ufffc", "").replace("\ufffd", "")
# }}} Text #
# Image {{{ #
try:
node.queryImage()
except NotImplementedError:
pass
else:
attribute_dict["image"] = "true"
# }}} Image #
# Selection {{{ #
try:
node.querySelection()

View File

@@ -0,0 +1,16 @@
# systemd unit that keeps the OSBench control server (/home/user/main.py)
# running inside the VM, restarting it automatically if it crashes.
[Unit]
Description=OSBench Server
# Give up after 4 failed restarts within 60 seconds.
StartLimitIntervalSec=60
StartLimitBurst=4
After=network.target auditd.service

[Service]
ExecStart=/usr/bin/python3 /home/user/main.py
User=user
WorkingDirectory=/home/user
Restart=on-failure
RestartSec=1
# Server drives GUI automation, so it needs access to the X display.
# This variant pins display :1 — presumably the VM's single session; verify.
Environment="DISPLAY=:1"

[Install]
WantedBy=graphical.target

View File

@@ -0,0 +1,16 @@
# Templated systemd unit for the OSBench control server: the instance name
# (the part after "@" when enabling, substituted for %i) selects the X
# display, e.g. osbench-server@1.service -> DISPLAY=1.
[Unit]
Description=OSBench Server
# Give up after 4 failed restarts within 60 seconds.
StartLimitIntervalSec=60
StartLimitBurst=4
After=network.target auditd.service

[Service]
ExecStart=/usr/bin/python3 /home/user/main.py
User=user
WorkingDirectory=/home/user
Restart=on-failure
RestartSec=1
# %i expands to the unit instance name; NOTE(review): a bare display like
# ":1" contains no "/" so it is a valid instance string — confirm callers
# pass the leading colon (DISPLAY=%i, not DISPLAY=:%i).
Environment="DISPLAY=%i"

[Install]
WantedBy=graphical.target

View File

@@ -10,10 +10,6 @@
"libreoffice_calc"
],
"evaluator": {
"func": "infeasible",
"expected": {
},
"result": {
}
"func": "infeasible"
}
}
}

View File

@@ -10,10 +10,6 @@
"libreoffice_calc"
],
"evaluator": {
"func": "infeasible",
"expected": {
},
"result": {
}
"func": "infeasible"
}
}
}

View File

@@ -63,6 +63,12 @@
"type": "vm_file",
"path": "/home/user/Desktop/saa-format-guide.pptx",
"dest": "saa-format-guide.pptx"
},
"expected": {
"type": "rule",
"rules": {
"color": "red"
}
}
}
}

View File

@@ -94,7 +94,7 @@
"result": {
"type": "googledrive_file",
"settings_file": "evaluation_examples/settings/googledrive/settings.yml",
"path": "environment_policy_report (draft).docx",
"path": ["environment_policy", "environment_policy_report (draft)"],
"dest": "environment_policy_report (draft).docx"
},
"expected": {

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=104pg3yochKyH2Uvlp3BdvKmHgYmSIESu&export=download&authuser=0&confirm=t&uuid=d1926366-4e54-4a44-8dcd-fc49ed6524d7&at=APZUnTXcBFV9kcacsA0toU83lMKJ:1706505549057d",
"url": "https://drive.usercontent.google.com/download?id=1gqqY56robX1tb4YPa3Yk1d72T_k-Rgz3&export=download&authuser=0&confirm=t",
"path": "/home/user/Desktop/15-MB-docx-file-download.docx"
}
]

View File

@@ -1,7 +1,7 @@
{
"id": "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"snapshot": "gimp",
"instruction": "Use `gdown` to download the image from \"https://drive.google.com/uc?export=download&id=1i8j5dGS57sA07jEuPNAlQW-sn5uqUnuK\", and then use GIMP to compress it to under 600KB. Resize if needed.",
"instruction": "Download the image from \"https://drive.google.com/uc?export=download&id=1i8j5dGS57sA07jEuPNAlQW-sn5uqUnuK\", and then use GIMP to compress it to under 600KB as \"compressed.jpeg\" on the Desktop. Resize if needed.",
"source": "",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "42f4d1c7-4521-4161-b646-0a8934e36081",
"snapshot": "gimp",
"instruction": "Configure VS Code to edit GIMP script-fu scripts effectively by installing lisp extension. Test by writing code to resizing the image as 128 * 128 as \"resized.png\"",
"instruction": "Configure VS Code to edit GIMP script-fu scripts effectively by installing lisp extension. Test by writing code to resize the image \"character.png\" to 128 * 128 as \"resized.png\".",
"source": "",
"config": [
{

View File

@@ -30,12 +30,12 @@
],
"evaluator": {
"func": "check_brightness_decrease_and_structure_sim",
"expected": {
"result": {
"type": "vm_file",
"path": "/home/user/Desktop/background.png",
"dest": "background.png"
},
"result": {
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=13if1UwZ5ay6ADAVW2jp3rcyvAEBse6MJ&export=download&authuser=0&confirm=t&uuid=2ea03068-1874-4240-baa1-f8bb2f917a99&at=APZUnTXq6dVlASg819jCaI1A-rm2:1710136385956",
"dest": "image_original.png"

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1e12nL_V7bffaLSocQ86EiGCdygzggWeu&export=download",
"url": "https://drive.usercontent.google.com/download?id=1epTcblcYh8j_wFtA-aiXPIF2Oo1IVw8A&export=download",
"path": "/home/user/Desktop/Dickinson_Slides.pptx"
}
]
@@ -36,7 +36,7 @@
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1Xl6tgQ0K5qA1BDA2fKTK2xFLzXwbtkZ6&export=download",
"path": "https://drive.usercontent.google.com/download?id=1vUvaQLJUtFgbZi7lSzl0y0TS_WecFczm&export=download",
"dest": "notes_gold.docx"
},
"options": {

View File

@@ -11,10 +11,6 @@
{
"url": "https://drive.google.com/uc?export=download&id=1bmSRNNh4JkF6izrKrmynUHarf0pFES50",
"path": "/home/user/Desktop/cola.png"
},
{
"url": "https://drive.google.com/uc?export=download&id=1MayrIPJWRK7cMEVe3TxYmgkAbVMrYcQA",
"path": "/home/user/Desktop/cropped_gold.png"
}
]
}
@@ -43,8 +39,8 @@
"dest": "cropped.png"
},
"expected": {
"type": "vm_file",
"path": "/home/user/Desktop/cropped_gold.png",
"type": "cloud_file",
"path": "https://drive.google.com/uc?export=download&id=1MayrIPJWRK7cMEVe3TxYmgkAbVMrYcQA",
"dest": "cropped_gold.png"
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "98e8e339-5f91-4ed2-b2b2-12647cb134f4",
"snapshot": "vs_code",
"instruction": "Merge the contents of all .txt files from your vscode project into a single document in Writer. No merging separator is needed. Ensure to set the overall font size of the document to 10.",
"instruction": "Merge the contents of all .txt files from your vscode project into a single document \"concat.docx\" on Desktop with libreoffice writer. No merging separator is needed. Ensure to set the overall font size of the document to 10.",
"source": "",
"config": [
{

View File

@@ -38,7 +38,7 @@
}
},
{
"type": "execute",
"type": "launch",
"parameters": {
"command": [
"nautilus",
@@ -109,4 +109,4 @@
]
}
}
}
}

View File

@@ -11,10 +11,6 @@
{
"url": "https://drive.google.com/uc?export=download&id=1CPGW_OZsfSWDdTU7CFrTjpzSAASyLy4w",
"path": "/home/user/Desktop/tilearray.png"
},
{
"url": "https://drive.google.com/uc?export=download&id=1aHwmnxL2CKEh_FhVpevY452-BQH2t5rG",
"path": "/home/user/Desktop/rearranged_gold.png"
}
]
}
@@ -43,8 +39,8 @@
"dest": "rearranged.png"
},
"expected": {
"type": "vm_file",
"path": "/home/user/Desktop/rearranged_gold.png",
"type": "cloud_file",
"path": "https://drive.google.com/uc?export=download&id=1aHwmnxL2CKEh_FhVpevY452-BQH2t5rG",
"dest": "rearranged_gold.png"
}
}

View File

@@ -1,13 +1,17 @@
{
"id": "e2392362-125e-4f76-a2ee-524b183a3412",
"snapshot": "chrome",
"instruction": "I recently started using the famous personal academic homepage template from academicpages.github.io to build my own personal homepage, and I have cloned it to my local ~/Code/Website folder. According to an online tutorial, I can configure my name and contact information in the _config.yaml file. However, I am not familiar with the YAML file format. Please help me find the sections related to the name and contact information in this file and change them to Test Account and Test@gmail.com.",
"instruction": "I recently started using the famous personal academic homepage template from academicpages.github.io to build my own personal homepage, and I have cloned it to my local ~/Code/Website folder. According to an online tutorial, I can configure my name and contact information in the _config.yaml file. However, I am not familiar with the YAML file format. Please help me find the sections related to the name and contact information in this file and change them to \"Test Account\" and \"Test@gmail.com\".",
"source": "authors",
"config": [
{
"type": "command",
"parameters": {
"command": ["mkdir", "-p", "/home/user/Code/Website"]
"command": [
"mkdir",
"-p",
"/home/user/Code/Website"
]
}
},
{
@@ -24,13 +28,22 @@
{
"type": "execute",
"parameters": {
"command": ["tar", "-xJvf", ".tmp.tar.xz", "-C", "/home/user/Code/Website/"]
"command": [
"tar",
"-xJvf",
".tmp.tar.xz",
"-C",
"/home/user/Code/Website/"
]
}
},
{
"type": "launch",
"parameters": {
"command": ["google-chrome", "--remote-debugging-port=1337"]
"command": [
"google-chrome",
"--remote-debugging-port=1337"
]
}
},
{
@@ -46,31 +59,59 @@
{
"type": "chrome_open_tabs",
"parameters": {
"urls_to_open": ["https://academicpages.github.io/"]
"urls_to_open": [
"https://academicpages.github.io/"
]
}
}
],
"trajectory": "trajectories/e2392362-125e-4f76-a2ee-524b183a3412",
"related_apps": ["chrome", "os", "vscode"],
"related_apps": [
"chrome",
"os",
"vscode"
],
"evaluator": {
"postconfig": [
{
"type": "execute",
"parameters": {
"command": [
"python",
"-c",
"import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5);"
]
}
}
],
"func": "check_json",
"options": {"is_yaml": true},
"options": {
"is_yaml": true
},
"expected": {
"type": "rule",
"rules": {
"expect": [
{
"key": ["name"],
"key": [
"name"
],
"method": "eq",
"ref": "Test Account"
},
{
"key": ["author", "name"],
"key": [
"author",
"name"
],
"method": "eq",
"ref": "Test Account"
},
{
"key": ["author", "email"],
"key": [
"author",
"email"
],
"method": "eq",
"ref": "Test@gmail.com"
}
@@ -83,4 +124,4 @@
"dest": "_config.yaml"
}
}
}
}

View File

@@ -1 +1 @@
{"access_token": "ya29.a0Ad52N3969wUkQepy6SBOSw9Gjg4-MNPfEUBD3OZpajVfs9wL4DbfImk-5XawHjBkTdCKKBqG5R9XIX6KvvUzQDfB2BwVwb0MfLfLJDLALia7MRdPn4j6GAES372u3bSqJNNPMwVZA9j-THb3o5svJiKcJgwcoFKeKC_xaCgYKAScSARISFQHGX2MioJPeGh_8OM6z1_BujwRe3Q0171", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-08T17:16:15Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N3969wUkQepy6SBOSw9Gjg4-MNPfEUBD3OZpajVfs9wL4DbfImk-5XawHjBkTdCKKBqG5R9XIX6KvvUzQDfB2BwVwb0MfLfLJDLALia7MRdPn4j6GAES372u3bSqJNNPMwVZA9j-THb3o5svJiKcJgwcoFKeKC_xaCgYKAScSARISFQHGX2MioJPeGh_8OM6z1_BujwRe3Q0171", "expires_in": 3599, "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
{"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-13T10:09:01Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}

View File

@@ -0,0 +1,398 @@
{
"chrome": [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
"06fe7178-4491-4589-810f-2e2bc9502122",
"e1e75309-3ddb-4d09-92ec-de869c928143",
"35253b65-1c19-4304-8aa4-6884b8218fc0",
"2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
"44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
"2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
"480bcfea-d68f-4aaa-a0a9-2589ef319381",
"af630914-714e-4a24-a7bb-f9af687d3b91",
"3720f614-37fd-4d04-8a6b-76f54f8c222d",
"99146c54-4f37-4ab8-9327-5f3291665e1e",
"12086550-11c0-466b-b367-1d9e75b3910e",
"6766f2b8-8a72-417f-a9e5-56fcaa735837",
"93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9",
"ae78f875-5b98-4907-bbb5-9c737fc68c03",
"3299584d-8f11-4457-bf4c-ce98f7600250",
"030eeff7-b492-4218-b312-701ec99ee0cc",
"9656a811-9b5b-4ddf-99c7-5117bcef0626",
"fc6d8143-9452-4171-9459-7f515143419a",
"a96b564e-dbe9-42c3-9ccf-b4498073438a",
"1704f00f-79e6-43a7-961b-cedd3724d5fd",
"f3b19d1e-2d48-44e9-b4e1-defcae1a0197",
"82bc8d6a-36eb-4d2d-8801-ef714fb1e55a",
"47543840-672a-467d-80df-8f7c3b9788c9",
"c1fa57f3-c3db-4596-8f09-020701085416",
"da46d875-6b82-4681-9284-653b0c7ae241",
"6c4c23a1-42a4-43cc-9db1-2f86ff3738cc",
"f79439ad-3ee8-4f99-a518-0eb60e5652b0",
"b7895e80-f4d1-4648-bee0-4eb45a6f1fa8",
"9f3f70fc-5afc-4958-a7b7-3bb4fcb01805",
"7f52cab9-535c-4835-ac8c-391ee64dc930",
"82279c77-8fc6-46f6-9622-3ba96f61b477",
"2888b4e6-5b47-4b57-8bf5-c73827890774",
"b4f95342-463e-4179-8c3f-193cd7241fb2",
"f5d96daf-83a8-4c86-9686-bada31fc66ab",
"121ba48f-9e17-48ce-9bc6-a4fb17a7ebba",
"368d9ba4-203c-40c1-9fa3-da2f1430ce63",
"59155008-fe71-45ec-8a8f-dc35497b6aa8",
"a728a36e-8bf1-4bb6-9a03-ef039a5233f0",
"b070486d-e161-459b-aa2b-ef442d973b92",
"0d8b7de3-e8de-4d86-b9fd-dd2dce58a217",
"9f935cce-0a9f-435f-8007-817732bfc0a5",
"f0b971a1-6831-4b9b-a50e-22a6e47f45ba",
"cabb3bae-cccb-41bd-9f5d-0f3a9fecd825"
],
"gimp": [
"7a4deb26-d57d-4ea9-9a73-630f66a7b568",
"554785e9-4523-4e7a-b8e1-8016f565f56a",
"77b8ab4d-994f-43ac-8930-8ca087d7c4b4",
"f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce",
"d52d6308-ec58-42b7-a2c9-de80e4837b2b",
"2a729ded-3296-423d-aec4-7dd55ed5fbb3",
"b148e375-fe0b-4bec-90e7-38632b0d73c2",
"a746add2-cab0-4740-ac36-c3769d9bfb46",
"7b7617bd-57cc-468e-9c91-40c4ec2bcb3d",
"d16c99dc-2a1e-46f2-b350-d97c86c85c15",
"06ca5602-62ca-47f6-ad4f-da151cde54cc",
"e2dd0213-26db-4349-abe5-d5667bfd725c",
"f723c744-e62c-4ae6-98d1-750d3cd7d79d",
"72f83cdc-bf76-4531-9a1b-eb893a13f8aa",
"7767eef2-56a3-4cea-8c9f-48c070c7d65b",
"734d6579-c07d-47a8-9ae2-13339795476b",
"e19bd559-633b-4b02-940f-d946248f088e",
"38f48d40-764e-4e77-a7cf-51dfce880291",
"fbb548ca-c2a6-4601-9204-e39a2efc507b",
"5ca86c6f-f317-49d8-b6a7-b527541caae8",
"62f7fd55-0687-4a43-b6e1-3eda16fc6252",
"8ea73f6f-9689-42ad-8c60-195bbf06a7ba",
"58d3eeeb-e9d0-499f-962e-fd0db2a744d8",
"2e6f678f-472d-4c55-99cc-8e7c5c402a71",
"045bf3ff-9077-4b86-b483-a1040a949cff",
"dbbf4b99-2253-4b10-9274-45f246af2466"
],
"libreoffice_calc": [
"357ef137-7eeb-4c80-a3bb-0951f26a8aff",
"42e0a640-4f19-4b28-973d-729602b5a4a7",
"51719eea-10bc-4246-a428-ac7c433dd4b3",
"1954cced-e748-45c4-9c26-9855b97fbc5e",
"2bd59342-0664-4ccb-ba87-79379096cc08",
"3aaa4e37-dc91-482e-99af-132a612d40f3",
"1273e544-688f-496b-8d89-3e0f40aa0606",
"12382c62-0cd1-4bf2-bdc8-1d20bf9b2371",
"f9584479-3d0d-4c79-affa-9ad7afdd8850",
"535364ea-05bd-46ea-9937-9f55c68507e8",
"7e429b8d-a3f0-4ed0-9b58-08957d00b127",
"4f07fbe9-70de-4927-a4d5-bb28bc12c52c",
"04d9aeaf-7bed-4024-bedb-e10e6f00eb7f",
"0bf05a7d-b28b-44d2-955a-50b41e24012a",
"6054afcb-5bab-4702-90a0-b259b5d3217c",
"abed40dc-063f-4598-8ba5-9fe749c0615d",
"37608790-6147-45d0-9f20-1137bb35703d",
"26a8440e-c166-4c50-aef4-bfb77314b46b",
"d681960f-7bc3-4286-9913-a8812ba3261a",
"035f41ba-6653-43ab-aa63-c86d449d62e5",
"7efeb4b1-3d19-4762-b163-63328d66303b",
"1de60575-bb6e-4c3d-9e6a-2fa699f9f197",
"aa3a8974-2e85-438b-b29e-a64df44deb4b",
"51b11269-2ca8-4b2a-9163-f21758420e78",
"1e8df695-bd1b-45b3-b557-e7d599cf7597",
"ecb0df7a-4e8d-4a03-b162-053391d3afaf",
"8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
"a01fbce3-2793-461f-ab86-43680ccbae25",
"0326d92d-d218-48a8-9ca1-981cd6d064c7",
"0a2e43bf-b26c-4631-a966-af9dfa12c9e5",
"4188d3a4-077d-46b7-9c86-23e1a036f6c1",
"347ef137-7eeb-4c80-a3bb-0951f26a8aff",
"eb03d19a-b88d-4de4-8a64-ca0ac66f426b",
"0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
"1d17d234-e39d-4ed7-b46f-4417922a4e7c",
"4e6fcf72-daf3-439f-a232-c434ce416af6",
"01b269ae-2111-4a07-81fd-3fcd711993b0",
"21df9241-f8d7-4509-b7f1-37e501a823f7",
"a9f325aa-8c05-4e4f-8341-9e4358565f4f",
"6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5",
"7a4e4bc8-922c-4c84-865c-25ba34136be1",
"4de54231-e4b5-49e3-b2ba-61a0bec721c0",
"30e3e107-1cfb-46ee-a755-2cd080d7ba6a",
"4172ea6e-6b77-4edb-a9cc-c0014bd1603b",
"1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
"3a7c8185-25c1-4941-bd7b-96e823c9f21f",
"21ab7b40-77c2-4ae6-8321-e00d3a086c73"
],
"libreoffice_impress": [
"5d901039-a89c-4bfb-967b-bf66f4df075e",
"550ce7e7-747b-495f-b122-acdc4d0b8e54",
"455d3c66-7dc6-4537-a39a-36d3e9119df7",
"af23762e-2bfd-4a1d-aada-20fa8de9ce07",
"c59742c0-4323-4b9d-8a02-723c251deaa0",
"ef9d12bd-bcee-4ba0-a40e-918400f43ddf",
"9ec204e4-f0a3-42f8-8458-b772a6797cab",
"0f84bef9-9790-432e-92b7-eece357603fb",
"ce88f674-ab7a-43da-9201-468d38539e4a",
"3b27600c-3668-4abd-8f84-7bcdebbccbdb",
"a097acff-6266-4291-9fbd-137af7ecd439",
"bf4e9888-f10f-47af-8dba-76413038b73c",
"21760ecb-8f62-40d2-8d85-0cee5725cb72",
"ac9bb6cb-1888-43ab-81e4-a98a547918cd",
"2cd43775-7085-45d8-89fa-9e35c0a915cf",
"358aa0a7-6677-453f-ae35-e440f004c31e",
"a669ef01-ded5-4099-9ea9-25e99b569840",
"73c99fb9-f828-43ce-b87a-01dc07faa224",
"15aece23-a215-4579-91b4-69eec72e18da",
"986fc832-6af2-417c-8845-9272b3a1528b",
"a434992a-89df-4577-925c-0c58b747f0f4",
"7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8",
"841b50aa-df53-47bd-a73a-22d3a9f73160",
"8979838c-54a5-4454-a2b8-3d135a1a5c8f",
"b8adbc24-cef2-4b15-99d5-ecbe7ff445eb",
"2b94c692-6abb-48ae-ab0b-b3e8a19cb340",
"9cf05d24-6bd9-4dae-8967-f67d88f5d38a",
"08aced46-45a2-48d7-993b-ed3fb5b32302",
"edb61b14-a854-4bf5-a075-c8075c11293a",
"c82632a4-56b6-4db4-9dd1-3820ee3388e4",
"39be0d19-634d-4475-8768-09c130f5425d",
"ac1b39ff-ee4d-4483-abce-c117e98942f0",
"f23acfd2-c485-4b7c-a1e7-d4303ddfe864",
"70bca0cc-c117-427e-b0be-4df7299ebeb6",
"af2d657a-e6b3-4c6a-9f67-9e3ed015974c",
"57667013-ea97-417c-9dce-2713091e6e2a",
"0a211154-fda0-48d0-9274-eaac4ce5486d",
"a53f80cd-4a90-4490-8310-097b011433f6",
"7ae48c60-f143-4119-b659-15b8f485eb9a",
"5cfb9197-e72b-454b-900e-c06b0c802b40",
"05dd4c1d-c489-4c85-8389-a7836c4f0567",
"5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1",
"4ed5abd0-8b5d-47bd-839f-cacfa15ca37a",
"e4ef0baf-4b52-4590-a47e-d4d464cca2d7",
"ed43c15f-00cb-4054-9c95-62c880865d68",
"3161d64e-3120-47b4-aaad-6a764a92493b",
"04578141-1d42-4146-b9cf-6fab4ce5fd74"
],
"libreoffice_writer": [
"0810415c-bde4-4443-9047-d5f70165a697",
"0a0faba3-5580-44df-965d-f562a99b291c",
"0b17a146-2934-46c7-8727-73ff6b6483e8",
"0e47de2a-32e0-456c-a366-8c607ef7a9d2",
"0e763496-b6bb-4508-a427-fad0b6c3e195",
"3ef2b351-8a84-4ff2-8724-d86eae9b842e",
"4bcb1253-a636-4df4-8cb0-a35c04dfef31",
"66399b0d-8fda-4618-95c4-bfc6191617e9",
"6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2",
"6ada715d-3aae-4a32-a6a7-429b2e43fb93",
"6f81754e-285d-4ce0-b59e-af7edb02d108",
"72b810ef-4156-4d09-8f08-a0cf57e7cefe",
"8472fece-c7dd-4241-8d65-9b3cd1a0b568",
"88fe4b2d-3040-4c70-9a70-546a47764b48",
"936321ce-5236-426a-9a20-e0e3c5dc536f",
"adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
"b21acd93-60fd-4127-8a43-2f5178f4a830",
"d53ff5ee-3b1a-431e-b2be-30ed2673079b",
"e246f6d8-78d7-44ac-b668-fcf47946cb50",
"e528b65e-1107-4b8c-8988-490e4fece599",
"ecc2413d-8a48-416e-a3a2-d30106ca36cb",
"f178a4a9-d090-4b56-bc4c-4b72a61a035d",
"bb8ccc78-479f-4a2f-a71e-d565e439436b"
],
"multi_apps": [
"2b9493d7-49b8-493a-a71b-56cd1f4d6908",
"2c9fc0de-3ee7-45e1-a5df-c86206ad78b5",
"2fe4b718-3bd7-46ec-bdce-b184f5653624",
"3680a5ee-6870-426a-a997-eba929a0d25c",
"46407397-a7d5-4c6b-92c6-dbe038b1457b",
"4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
"510f64c8-9bcc-4be1-8d30-638705850618",
"51f5801c-18b3-4f25-b0c3-02f85507a078",
"58565672-7bfe-48ab-b828-db349231de6b",
"78aed49a-a710-4321-a793-b611a7c5b56b",
"897e3b53-5d4d-444b-85cb-2cdc8a97d903",
"937087b6-f668-4ba6-9110-60682ee33441",
"a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb",
"b52b40a5-ad70-4c53-b5b0-5650a8387052",
"c867c42d-a52d-4a24-8ae3-f75d256b5618",
"d9b7c649-c975-4f53-88f5-940b29c47247",
"e135df7c-7687-4ac0-a5f0-76b74438b53e",
"ee9a3c83-f437-4879-8918-be5efbb9fac7",
"f7dfbef3-7697-431c-883a-db8583a4e4f9",
"f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
"6d72aad6-187a-4392-a4c4-ed87269c51cf",
"f918266a-b3e0-4914-865d-4faa564f1aef",
"da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
"bc2b57f3-686d-4ec9-87ce-edf850b7e442",
"74d5859f-ed66-4d3e-aa0e-93d7a592ce41",
"b5062e3e-641c-4e3a-907b-ac864d2e7652",
"00fa164e-2612-4439-992e-157d019a8436",
"acb0f96b-e27c-44d8-b55f-7cb76609dfcd",
"69acbb55-d945-4927-a87b-8480e1a5bb7e",
"48d05431-6cd5-4e76-82eb-12b60d823f7d",
"68a25bd4-59c7-4f4d-975e-da0c8509c848",
"eb303e01-261e-4972-8c07-c9b4e7a4922a",
"0c825995-5b70-4526-b663-113f4c999dd2",
"c7c1e4c3-9e92-4eba-a4b8-689953975ea4",
"d1acdb87-bb67-4f30-84aa-990e56a09c92",
"deec51c9-3b1e-4b9e-993c-4776f20e8bb2",
"8e116af7-7db7-4e35-a68b-b0939c066c78",
"337d318b-aa07-4f4f-b763-89d9a2dd013f",
"82e3c869-49f6-4305-a7ce-f3e64a0618e7",
"185f29bd-5da0-40a6-b69c-ba7f4e0324ef",
"869de13e-bef9-4b91-ba51-f6708c40b096",
"2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e",
"3a93cae4-ad3e-403e-8c12-65303b271818",
"1f18aa87-af6f-41ef-9853-cdb8f32ebdea",
"26150609-0da3-4a7d-8868-0faf9c5f01bb",
"9219480b-3aed-47fc-8bac-d2cffc5849f7",
"881deb30-9549-4583-a841-8270c65f2a17",
"7e287123-70ca-47b9-8521-47db09b69b14",
"e2392362-125e-4f76-a2ee-524b183a3412",
"5bc63fb9-276a-4439-a7c1-9dc76401737f",
"26660ad1-6ebb-4f59-8cba-a8432dfe8d38",
"a82b78bb-7fde-4cb3-94a4-035baf10bcf0",
"36037439-2044-4b50-b9d1-875b5a332143",
"716a6079-22da-47f1-ba73-c9d58f986a38",
"873cafdd-a581-47f6-8b33-b9696ddb7b05",
"a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a",
"6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a",
"da922383-bfa4-4cd3-bbad-6bebab3d7742",
"2373b66a-092d-44cb-bfd7-82e86e7a3b4d",
"81c425f5-78f3-4771-afd6-3d2973825947",
"bb83cab4-e5c7-42c7-a67b-e46068032b86",
"227d2f97-562b-4ccb-ae47-a5ec9e142fbb",
"b337d106-053f-4d37-8da0-7f9c4043a66b",
"20236825-b5df-46e7-89bf-62e1d640a897",
"8df7e444-8e06-4f93-8a1a-c5c974269d82",
"aad10cd7-9337-4b62-b704-a857848cedf2",
"02ce9a50-7af2-47ed-8596-af0c230501f8",
"4c26e3f3-3a14-4d86-b44a-d3cedebbb487",
"a503b07f-9119-456b-b75d-f5146737d24f",
"09a37c51-e625-49f4-a514-20a773797a8a",
"3e3fc409-bff3-4905-bf16-c968eee3f807",
"f5c13cdd-205c-4719-a562-348ae5cd1d91",
"5990457f-2adb-467b-a4af-5c857c92d762",
"415ef462-bed3-493a-ac36-ca8c6d23bf1b",
"7ff48d5b-2df2-49da-b500-a5150ffc7f18",
"9f3bb592-209d-43bc-bb47-d77d9df56504",
"dd60633f-2c72-42ba-8547-6f2c8cb0fdb0",
"ce2b64a2-ddc1-4f91-8c7d-a88be7121aac",
"3f05f3b9-29ba-4b6b-95aa-2204697ffc06",
"e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56",
"f8369178-fafe-40c2-adc4-b9b08a125456",
"778efd0a-153f-4842-9214-f05fc176b877",
"47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5",
"c2751594-0cd5-4088-be1b-b5f2f9ec97c4",
"788b3701-3ec9-4b67-b679-418bfa726c22",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"e8172110-ec08-421b-a6f5-842e6451911f",
"42f4d1c7-4521-4161-b646-0a8934e36081",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
"91190194-f406-4cd6-b3f9-c43fac942b22",
"7f35355e-02a6-45b5-b140-f0be698bcf85",
"98e8e339-5f91-4ed2-b2b2-12647cb134f4",
"0e5303d4-8820-42f6-b18d-daf7e633de21",
"df67aebb-fb3a-44fd-b75b-51b6012df509",
"5df7b33a-9f77-4101-823e-02f863e1c1ae",
"aceb0368-56b8-4073-b70e-3dc9aee184e0",
"22a4636f-8179-4357-8e87-d1743ece1f81",
"236833a3-5704-47fc-888c-4f298f09f799",
"67890eb6-6ce5-4c00-9e3d-fb4972699b06"
],
"os": [
"94d95f96-9699-4208-98ba-3c3119edf9c2",
"bedcedc4-4d72-425e-ad62-21960b11fe0d",
"43c2d64c-bab5-4dcb-a30c-b888321c319a",
"7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
"ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3",
"a462a795-fdc7-4b23-b689-e8b6df786b78",
"f9be0997-4b7c-45c5-b05c-4612b44a6118",
"28cc3b7e-b194-4bc9-8353-d04c0f4d56d2",
"5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
"e0df059f-28a6-4169-924f-b9623e7184cc",
"ddc75b62-7311-4af8-bfb3-859558542b36",
"b6781586-6346-41cd-935a-a6b1487918fc",
"b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa",
"3ce045a0-877b-42aa-8d2c-b4a863336ab8",
"fe41f596-a71b-4c2f-9b2f-9dcd40b568c3",
"a4d98375-215b-4a4d-aee9-3d4370fccc41",
"13584542-872b-42d8-b299-866967b5c3ef",
"23393935-50c7-4a86-aeea-2b78fd089c5c",
"5812b315-e7bd-4265-b51f-863c02174c28",
"c288e301-e626-4b98-a1ab-159dcb162af5",
"cc9d4f34-1ca0-4a1b-8ff2-09302696acb9",
"c56de254-a3ec-414e-81a6-83d2ce8c41fa",
"4783cc41-c03c-4e1b-89b4-50658f642bd5",
"5c1075ca-bb34-46a3-a7a0-029bd7463e79",
"5ced85fc-fa1a-4217-95fd-0fb530545ce2",
"37887e8c-da15-4192-923c-08fa390a176d",
"4127319a-8b79-4410-b58a-7a151e15f3d7",
"4d117223-a354-47fb-8b45-62ab1390a95f",
"6f56bf42-85b8-4fbb-8e06-6c44960184ba"
],
"thunderbird": [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
"12086550-11c0-466b-b367-1d9e75b3910e",
"06fe7178-4491-4589-810f-2e2bc9502122",
"6766f2b8-8a72-417f-a9e5-56fcaa735837",
"e1e75309-3ddb-4d09-92ec-de869c928143",
"3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5",
"35253b65-1c19-4304-8aa4-6884b8218fc0",
"d088f539-cab4-4f9a-ac92-9999fc3a656e",
"2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"480bcfea-d68f-4aaa-a0a9-2589ef319381",
"030eeff7-b492-4218-b312-701ec99ee0cc",
"94760984-3ff5-41ee-8347-cf1af709fea0",
"99146c54-4f37-4ab8-9327-5f3291665e1e",
"c9e7eaf2-b1a1-4efc-a982-721972fa9f02"
],
"vlc": [
"59f21cfb-0120-4326-b255-a5b827b38967",
"8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
"8f080098-ddb1-424c-b438-4e96e5e4786e",
"bba3381f-b5eb-4439-bd9e-80c22218d5a7",
"fba2c100-79e8-42df-ae74-b592418d54f4",
"efcf0d81-0835-4880-b2fd-d866e8bc2294",
"8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f",
"aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6",
"386dbd0e-0241-4a0a-b6a2-6704fba26b1c",
"9195653c-f4aa-453d-aa95-787f6ccfaae9",
"d06f0d4d-2cd5-4ede-8de9-598629438c6e",
"a5bbbcd5-b398-4c91-83d4-55e1e31bbb81",
"5ac2891a-eacd-4954-b339-98abba077adb",
"f3977615-2b45-4ac5-8bba-80c17dbe2a37",
"215dfd39-f493-4bc3-a027-8a97d72c61bf",
"cb130f0d-d36f-4302-9838-b3baf46139b6",
"7882ed6e-bece-4bf0-bada-c32dc1ddae72"
],
"vs_code": [
"0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
"53ad5833-3455-407b-bbc6-45b4c79ab8fb",
"eabc805a-bfcf-4460-b250-ac92135819f6",
"982d12a5-beab-424f-8d38-d2a48429e511",
"4e60007a-f5be-4bfc-9723-c39affa0a6d3",
"e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
"9439a27b-18ae-42d8-9778-5f68f891805e",
"ae506c68-352c-4094-9caa-ee9d42052317",
"ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
"930fdb3b-11a8-46fe-9bac-577332e2640e",
"276cc624-87ea-4f08-ab93-f770e3790175",
"9d425400-e9b2-4424-9a4b-d4c7abac4140",
"5e2d93d8-8ad0-4435-b150-1692aacaa994",
"6ed0a554-cbee-4b44-84ea-fd6c042f4fe1",
"ec71221e-ac43-46f9-89b8-ee7d80f7e1c5",
"70745df8-f2f5-42bd-8074-fbc10334fcc5",
"57242fad-77ca-454f-b71b-f187181a9f23",
"c6bf789c-ba3a-4209-971d-b63abf0ab733",
"0512bb38-d531-4acf-9e7e-0add90816068",
"847a96b6-df94-4927-97e6-8cc9ea66ced7",
"7aeae0e2-70ee-4705-821d-1bba5d5b2ddd",
"dcbe20e8-647f-4f1d-8696-f1c5bbb570e3",
"7c4cc09e-7a92-40dd-8338-b2286535c4ed",
"971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6"
]
}

View File

@@ -0,0 +1,102 @@
{
"chrome": [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
],
"gimp": [
"7a4deb26-d57d-4ea9-9a73-630f66a7b568",
"554785e9-4523-4e7a-b8e1-8016f565f56a"
],
"libreoffice_calc": [
"357ef137-7eeb-4c80-a3bb-0951f26a8aff",
"42e0a640-4f19-4b28-973d-729602b5a4a7"
],
"libreoffice_impress": [
"5d901039-a89c-4bfb-967b-bf66f4df075e",
"550ce7e7-747b-495f-b122-acdc4d0b8e54"
],
"libreoffice_writer": [
"0810415c-bde4-4443-9047-d5f70165a697",
"0a0faba3-5580-44df-965d-f562a99b291c"
],
"multi_apps": [
"2b9493d7-49b8-493a-a71b-56cd1f4d6908",
"46407397-a7d5-4c6b-92c6-dbe038b1457b",
"4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
"510f64c8-9bcc-4be1-8d30-638705850618",
"897e3b53-5d4d-444b-85cb-2cdc8a97d903",
"c867c42d-a52d-4a24-8ae3-f75d256b5618",
"e135df7c-7687-4ac0-a5f0-76b74438b53e",
"f7dfbef3-7697-431c-883a-db8583a4e4f9",
"6d72aad6-187a-4392-a4c4-ed87269c51cf",
"f918266a-b3e0-4914-865d-4faa564f1aef",
"da52d699-e8d2-4dc5-9191-a2199e0b6a9b",
"74d5859f-ed66-4d3e-aa0e-93d7a592ce41",
"b5062e3e-641c-4e3a-907b-ac864d2e7652",
"48d05431-6cd5-4e76-82eb-12b60d823f7d",
"eb303e01-261e-4972-8c07-c9b4e7a4922a",
"d1acdb87-bb67-4f30-84aa-990e56a09c92",
"deec51c9-3b1e-4b9e-993c-4776f20e8bb2",
"8e116af7-7db7-4e35-a68b-b0939c066c78",
"185f29bd-5da0-40a6-b69c-ba7f4e0324ef",
"2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e",
"3a93cae4-ad3e-403e-8c12-65303b271818",
"1f18aa87-af6f-41ef-9853-cdb8f32ebdea",
"26150609-0da3-4a7d-8868-0faf9c5f01bb",
"7e287123-70ca-47b9-8521-47db09b69b14",
"e2392362-125e-4f76-a2ee-524b183a3412",
"26660ad1-6ebb-4f59-8cba-a8432dfe8d38",
"a82b78bb-7fde-4cb3-94a4-035baf10bcf0",
"36037439-2044-4b50-b9d1-875b5a332143",
"716a6079-22da-47f1-ba73-c9d58f986a38",
"a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a",
"6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a",
"da922383-bfa4-4cd3-bbad-6bebab3d7742",
"2373b66a-092d-44cb-bfd7-82e86e7a3b4d",
"81c425f5-78f3-4771-afd6-3d2973825947",
"227d2f97-562b-4ccb-ae47-a5ec9e142fbb",
"20236825-b5df-46e7-89bf-62e1d640a897",
"02ce9a50-7af2-47ed-8596-af0c230501f8",
"4c26e3f3-3a14-4d86-b44a-d3cedebbb487",
"09a37c51-e625-49f4-a514-20a773797a8a",
"3e3fc409-bff3-4905-bf16-c968eee3f807",
"415ef462-bed3-493a-ac36-ca8c6d23bf1b",
"9f3bb592-209d-43bc-bb47-d77d9df56504",
"dd60633f-2c72-42ba-8547-6f2c8cb0fdb0",
"3f05f3b9-29ba-4b6b-95aa-2204697ffc06",
"f8369178-fafe-40c2-adc4-b9b08a125456",
"778efd0a-153f-4842-9214-f05fc176b877",
"47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5",
"c2751594-0cd5-4088-be1b-b5f2f9ec97c4",
"48c46dc7-fe04-4505-ade7-723cba1aa6f6",
"42d25c08-fb87-4927-8b65-93631280a26f",
"bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108",
"3c8f201a-009d-4bbe-8b65-a6f8b35bb57f",
"d68204bf-11c1-4b13-b48b-d303c73d4bf6",
"91190194-f406-4cd6-b3f9-c43fac942b22",
"7f35355e-02a6-45b5-b140-f0be698bcf85",
"98e8e339-5f91-4ed2-b2b2-12647cb134f4",
"df67aebb-fb3a-44fd-b75b-51b6012df509",
"5df7b33a-9f77-4101-823e-02f863e1c1ae",
"22a4636f-8179-4357-8e87-d1743ece1f81",
"236833a3-5704-47fc-888c-4f298f09f799"
],
"os": [
"5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
"5812b315-e7bd-4265-b51f-863c02174c28",
"43c2d64c-bab5-4dcb-a30c-b888321c319a",
"7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82"
],
"thunderbird": [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3"
],
"vlc": [
"59f21cfb-0120-4326-b255-a5b827b38967",
"8f080098-ddb1-424c-b438-4e96e5e4786e"
],
"vs_code": [
"0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
"53ad5833-3455-407b-bbc6-45b4c79ab8fb"
]
}

View File

@@ -1,432 +0,0 @@
import datetime
import json
import logging
import os
import sys
import func_timeout
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# Logger Configs {{{ #
# Module-level logging setup: the root logger fans out to four sinks, all
# stamped with the process start time so concurrent runs do not collide.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
# Timestamp shared by all four log files of this run.
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
# NOTE(review): assumes a "logs" directory already exists -- FileHandler will
# raise otherwise; confirm the caller creates it.
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
# ANSI color codes in the format string; readable in a terminal, raw escapes
# in the log files.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
# Only records from loggers under the "desktopenv" namespace reach stdout and
# the sdebug file; the plain/debug files receive everything.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Rebind to the experiment-specific child logger used throughout this script.
logger = logging.getLogger("desktopenv.experiment")

# Host-side path to the VMware .vmx file of the Ubuntu guest image.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    """Run one benchmark example inside the VM-backed desktop environment.

    Drives ``agent`` against a fresh ``DesktopEnv`` built from ``example`` for
    at most ``max_steps`` prediction rounds, saving one screenshot plus one
    JSON-lines trajectory record per executed action under
    ``example_trajectory_dir``, then evaluates the final state and appends the
    score to the trajectory file.

    Args:
        example: Task configuration dict handed to DesktopEnv as task_config.
        agent: Object exposing ``action_space`` and ``predict(observation)``
            returning a list of actions.
        max_steps: Maximum number of agent prediction rounds.
        example_trajectory_dir: Output directory for screenshots,
            trajectory.json, and (when recording) recording.mp4.
        recording: When True, ask the env controller to screen-record the run.
    """
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    done = False
    step_num = 0

    if recording:
        # send a request to the server to start recording
        env.controller.start_recording()

    while not done and step_num < max_steps:
        actions = agent.predict(observation)
        step_num += 1
        for action in actions:
            # Capture the timestamp before executing the action
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_num, action)
            observation, reward, done, info = env.step(action)
            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            logger.info("Info: %s", info)
            # Save screenshot and trajectory information
            # NOTE(review): observation['screenshot'] appears to be a local
            # file path produced by the env -- confirm against DesktopEnv.
            with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
                with open(observation['screenshot'], "rb") as __f:
                    screenshot = __f.read()
                _f.write(screenshot)
            # Append one JSON-lines record per executed action.
            with open(trajectory_recording_path, "a") as f:
                f.write(json.dumps({
                    "step_num": step_num,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break

    def stop_recording():
        # Best-effort: failures while retrieving the recording are reported
        # but do not abort evaluation.
        try:
            env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
        except Exception as e:
            print(f"An error occurred while stopping the recording: {e}")

    # Bound the recording retrieval to 30s so a hung transfer cannot stall
    # the whole run.
    try:
        func_timeout.func_timeout(30, stop_recording)
    except func_timeout.exceptions.FunctionTimedOut:
        logger.info("Recording timed out.")

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # Final JSON-lines record carries the evaluation score.
    with open(trajectory_recording_path, "a") as f:
        f.write(json.dumps({
            "result": result
        }))
        f.write("\n")

    # env.close()
    logger.info("Environment closed.")
def main(example_class, example_id, gpt4_model="gpt-4-0125-preview"):
    """Load one evaluation example and run it with a GPT-4 agent on the
    a11y-tree observation space.

    Args:
        example_class: Example category / app domain (subdirectory name under
            evaluation_examples/examples).
        example_id: UUID of the example JSON file to load.
        gpt4_model: OpenAI model name handed to GPT4v_Agent.
    """
    action_space = "pyautogui"
    gemini_model = "gemini-pro-vision"

    logger.info("Running example %s/%s", example_class, example_id)
    logger.info("Using model %s", gpt4_model)
    # logger.info("Using model %s", gemini_model)

    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
        example = json.load(f)
    # Pin the VM snapshot used for this batch of runs.
    example["snapshot"] = "exp_v5"

    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], max_tokens=1000,
                        action_space=action_space, exp="a11y_tree")

    # api_key = os.environ.get("GENAI_API_KEY")
    # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space, exp="a11y_tree")

    root_trajectory_dir = "exp_trajectory"
    # Layout: exp_trajectory/a11y_tree/<class>/<model>/<example_id>/
    example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id)
    # example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Candidate example ids per app domain.  Most per-domain driver loops are
    # kept commented out so batches can be toggled on per run; only the
    # vs_code and multi_apps loops are active in this version of the script.
    os_list = [
        "94d95f96-9699-4208-98ba-3c3119edf9c2",
        "bedcedc4-4d72-425e-ad62-21960b11fe0d",
        "43c2d64c-bab5-4dcb-a30c-b888321c319a",
        "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
        "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3",
        "f9be0997-4b7c-45c5-b05c-4612b44a6118",
        "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2",
        "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
        "e0df059f-28a6-4169-924f-b9623e7184cc",
        "ddc75b62-7311-4af8-bfb3-859558542b36",
        "b6781586-6346-41cd-935a-a6b1487918fc",
        "3ce045a0-877b-42aa-8d2c-b4a863336ab8",
        "a4d98375-215b-4a4d-aee9-3d4370fccc41",
        "13584542-872b-42d8-b299-866967b5c3ef",
        "23393935-50c7-4a86-aeea-2b78fd089c5c"
    ]
    # for example_id in os_list:
    #     try:
    #         main("os", example_id, gpt4_model="gpt-3.5-turbo-16k")
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue

    # NOTE(review): the first entry is duplicated.
    vlc_list = [
        "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
        "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
        "8f080098-ddb1-424c-b438-4e96e5e4786e",
        "bba3381f-b5eb-4439-bd9e-80c22218d5a7",
        "fba2c100-79e8-42df-ae74-b592418d54f4",
        "efcf0d81-0835-4880-b2fd-d866e8bc2294",
        "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f",
        "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6",
        "386dbd0e-0241-4a0a-b6a2-6704fba26b1c",
        "9195653c-f4aa-453d-aa95-787f6ccfaae9",
        "d06f0d4d-2cd5-4ede-8de9-598629438c6e",
        "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81",
        "f3977615-2b45-4ac5-8bba-80c17dbe2a37",
        "215dfd39-f493-4bc3-a027-8a97d72c61bf"
    ]
    # NOTE(review): chrome_list is redefined (identically) further below;
    # this first definition is shadowed before any loop uses it.
    chrome_list = [
        "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        "06fe7178-4491-4589-810f-2e2bc9502122",
        "e1e75309-3ddb-4d09-92ec-de869c928143",
        "35253b65-1c19-4304-8aa4-6884b8218fc0",
        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
        "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
        "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "af630914-714e-4a24-a7bb-f9af687d3b91"
    ]
    calc_list = [
        "eb03d19a-b88d-4de4-8a64-ca0ac66f426b",
        "0bf05a7d-b28b-44d2-955a-50b41e24012a",
        "7a4e4bc8-922c-4c84-865c-25ba34136be1",
        "2bd59342-0664-4ccb-ba87-79379096cc08",
        "ecb0df7a-4e8d-4a03-b162-053391d3afaf",
        "7efeb4b1-3d19-4762-b163-63328d66303b",
        "4e6fcf72-daf3-439f-a232-c434ce416af6",
        "6054afcb-5bab-4702-90a0-b259b5d3217c",
        "abed40dc-063f-4598-8ba5-9fe749c0615d",
        "01b269ae-2111-4a07-81fd-3fcd711993b0",
        "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
        "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
        "4188d3a4-077d-46b7-9c86-23e1a036f6c1",
        "51b11269-2ca8-4b2a-9163-f21758420e78",
        "7e429b8d-a3f0-4ed0-9b58-08957d00b127",
        "347ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5",
        "3aaa4e37-dc91-482e-99af-132a612d40f3",
        "37608790-6147-45d0-9f20-1137bb35703d",
        "f9584479-3d0d-4c79-affa-9ad7afdd8850",
        "d681960f-7bc3-4286-9913-a8812ba3261a",
        "21df9241-f8d7-4509-b7f1-37e501a823f7",
        "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
        "357ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "aa3a8974-2e85-438b-b29e-a64df44deb4b",
        "a01fbce3-2793-461f-ab86-43680ccbae25",
        "4f07fbe9-70de-4927-a4d5-bb28bc12c52c",
    ]
    # for example_id in calc_list:
    #     main("libreoffice_calc", example_id)

    impress_list = [
        "5d901039-a89c-4bfb-967b-bf66f4df075e",
        "550ce7e7-747b-495f-b122-acdc4d0b8e54",
        "455d3c66-7dc6-4537-a39a-36d3e9119df7",
        "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
        "c59742c0-4323-4b9d-8a02-723c251deaa0",
        "ef9d12bd-bcee-4ba0-a40e-918400f43ddf",
        "9ec204e4-f0a3-42f8-8458-b772a6797cab",
        "0f84bef9-9790-432e-92b7-eece357603fb",
        "ce88f674-ab7a-43da-9201-468d38539e4a",
        "3b27600c-3668-4abd-8f84-7bcdebbccbdb",
        "a097acff-6266-4291-9fbd-137af7ecd439",
        "bf4e9888-f10f-47af-8dba-76413038b73c",
        "21760ecb-8f62-40d2-8d85-0cee5725cb72"
    ]
    # for example_id in impress_list:
    #     main("libreoffice_impress", example_id)

    # NOTE(review): thunderbird_list is redefined (identically) further below.
    thunderbird_list = [
        # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        "12086550-11c0-466b-b367-1d9e75b3910e",
        "06fe7178-4491-4589-810f-2e2bc9502122",
        "6766f2b8-8a72-417f-a9e5-56fcaa735837",
        "e1e75309-3ddb-4d09-92ec-de869c928143",
        "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5",
        "35253b65-1c19-4304-8aa4-6884b8218fc0",
        "d088f539-cab4-4f9a-ac92-9999fc3a656e",
        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "030eeff7-b492-4218-b312-701ec99ee0cc",
        "94760984-3ff5-41ee-8347-cf1af709fea0",
        "99146c54-4f37-4ab8-9327-5f3291665e1e",
        "c9e7eaf2-b1a1-4efc-a982-721972fa9f02"
    ]
    # for example_id in thunderbird_list:
    #     main("thunderbird", example_id)

    gimp_list = [
        "7a4deb26-d57d-4ea9-9a73-630f66a7b568",
        "554785e9-4523-4e7a-b8e1-8016f565f56a",
        "77b8ab4d-994f-43ac-8930-8ca087d7c4b4",
        "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce",
        "d52d6308-ec58-42b7-a2c9-de80e4837b2b",
        "2a729ded-3296-423d-aec4-7dd55ed5fbb3",
        "b148e375-fe0b-4bec-90e7-38632b0d73c2",
        "a746add2-cab0-4740-ac36-c3769d9bfb46",
        "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d",
        "d16c99dc-2a1e-46f2-b350-d97c86c85c15",
        "06ca5602-62ca-47f6-ad4f-da151cde54cc",
        "e2dd0213-26db-4349-abe5-d5667bfd725c",
        "f723c744-e62c-4ae6-98d1-750d3cd7d79d",
        "72f83cdc-bf76-4531-9a1b-eb893a13f8aa",
        "7767eef2-56a3-4cea-8c9f-48c070c7d65b",
        "734d6579-c07d-47a8-9ae2-13339795476b"
    ]
    # for example_id in gimp_list:
    #     try:
    #         main("gimp", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue

    # NOTE(review): vs_code_list is redefined below with every entry commented
    # out, so the active loop further down iterates an EMPTY list -- presumably
    # intentional run-toggling, but worth confirming.
    vs_code_list = [
        "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
        "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
        "eabc805a-bfcf-4460-b250-ac92135819f6",
        "982d12a5-beab-424f-8d38-d2a48429e511",
        "4e60007a-f5be-4bfc-9723-c39affa0a6d3",
        "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
        "9439a27b-18ae-42d8-9778-5f68f891805e",
        "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
        "930fdb3b-11a8-46fe-9bac-577332e2640e",
        "276cc624-87ea-4f08-ab93-f770e3790175",
        "9d425400-e9b2-4424-9a4b-d4c7abac4140"
    ]
    # for example_id in vs_code_list:
    #     try:
    #         main("vs_code", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue

    # NOTE(review): mid-script import; conventionally belongs at the top of
    # the file with the other imports.
    from tqdm import tqdm

    # for example_id in tqdm(vlc_list):
    #     try:
    #         main("vlc", example_id, gpt4_model="gpt-3.5-turbo-16k")
    #     except Exception as e:
    #         print(f"An error occurred while running the example: {e}")
    #         continue

    chrome_list = [
        "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        "06fe7178-4491-4589-810f-2e2bc9502122",
        "e1e75309-3ddb-4d09-92ec-de869c928143",
        "35253b65-1c19-4304-8aa4-6884b8218fc0",
        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
        "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
        "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "af630914-714e-4a24-a7bb-f9af687d3b91"
    ]
    # for example_id in tqdm(chrome_list):
    #     try:
    #         main("chrome", example_id, gpt4_model="gpt-3.5-turbo-16k")
    #     except Exception as e:
    #         print(f"An error occurred while running the example: {e}")
    #         continue

    # All entries commented out: the loop below is effectively a no-op.
    vs_code_list = [
        # "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
        # "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
        # "eabc805a-bfcf-4460-b250-ac92135819f6",
        # "982d12a5-beab-424f-8d38-d2a48429e511",
        # "4e60007a-f5be-4bfc-9723-c39affa0a6d3",
        # "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
        # "9439a27b-18ae-42d8-9778-5f68f891805e",
        # "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
        # "930fdb3b-11a8-46fe-9bac-577332e2640e",
        # "276cc624-87ea-4f08-ab93-f770e3790175",
        # "9d425400-e9b2-4424-9a4b-d4c7abac4140"
    ]
    for example_id in tqdm(vs_code_list):
        try:
            main("vs_code", example_id, gpt4_model="gpt-3.5-turbo-16k")
        except Exception as e:
            print(f"An error occurred while running the example: {e}")
            continue

    thunderbird_list = [
        # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        "12086550-11c0-466b-b367-1d9e75b3910e",
        "06fe7178-4491-4589-810f-2e2bc9502122",
        "6766f2b8-8a72-417f-a9e5-56fcaa735837",
        "e1e75309-3ddb-4d09-92ec-de869c928143",
        "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5",
        "35253b65-1c19-4304-8aa4-6884b8218fc0",
        "d088f539-cab4-4f9a-ac92-9999fc3a656e",
        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "030eeff7-b492-4218-b312-701ec99ee0cc",
        "94760984-3ff5-41ee-8347-cf1af709fea0",
        "99146c54-4f37-4ab8-9327-5f3291665e1e",
        "c9e7eaf2-b1a1-4efc-a982-721972fa9f02"
    ]
    # for example_id in tqdm(thunderbird_list):
    #     try:
    #         main("thunderbird", example_id, gpt4_model="gpt-3.5-turbo-16k")
    #     except Exception as e:
    #         print(f"An error occurred while running the example: {e}")
    #         continue

    # Active batch: uncommented ids only.
    multiple_list = [
        # "f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
        # "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
        "2fe4b718-3bd7-46ec-bdce-b184f5653624",
        "3680a5ee-6870-426a-a997-eba929a0d25c",
        # "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
        # "b52b40a5-ad70-4c53-b5b0-5650a8387052",
        # "46407397-a7d5-4c6b-92c6-dbe038b1457b",
        # "2b9493d7-49b8-493a-a71b-56cd1f4d6908",
        # "51f5801c-18b3-4f25-b0c3-02f85507a078",
        "58565672-7bfe-48ab-b828-db349231de6b",
        # "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5",
        # "510f64c8-9bcc-4be1-8d30-638705850618",
        # "937087b6-f668-4ba6-9110-60682ee33441",
        # "ee9a3c83-f437-4879-8918-be5efbb9fac7",
        # "3680a5ee-6870-426a-a997-eba929a0d25c",
        # "e135df7c-7687-4ac0-a5f0-76b74438b53e",
        "ee9a3c83-f437-4879-8918-be5efbb9fac7",
        # "58565672-7bfe-48ab-b828-db349231de6b",
        # "2fe4b718-3bd7-46ec-bdce-b184f5653624"
    ]
    for example_id in multiple_list:
        try:
            main("multi_apps", example_id, gpt4_model="gpt-3.5-turbo-16k")
        except Exception as e:
            logger.error("An error occurred while running the example: %s", e)
            continue

View File

@@ -1,306 +0,0 @@
import datetime
import json
import logging
import os
import sys
import time
import func_timeout
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# from mm_agents.gemini_pro_agent import GeminiPro_Agent
# Logger Configs {{{ #
# Root-logger setup: four sinks (plain + debug files, stdout, desktopenv-only
# debug file), all stamped with the process start time.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
# NOTE(review): assumes a "logs" directory already exists -- verify.
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
# ANSI-colored format; escapes appear raw in the log files.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
# Restrict stdout and the sdebug file to the "desktopenv" logger namespace.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Experiment-specific child logger used below.
logger = logging.getLogger("desktopenv.experiment")

# Host-side path to the VMware .vmx file of the Ubuntu guest image.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    """Run one benchmark example inside the VM-backed desktop environment.

    Drives ``agent`` against a fresh ``DesktopEnv`` built from ``example`` for
    at most ``max_steps`` prediction rounds, saving one screenshot plus one
    JSON-lines trajectory record per executed action under
    ``example_trajectory_dir``, then evaluates the final state and appends the
    score to the trajectory file.

    Args:
        example: Task configuration dict handed to DesktopEnv as task_config.
        agent: Object exposing ``action_space`` and ``predict(observation)``
            returning a list of actions.
        max_steps: Maximum number of agent prediction rounds.
        example_trajectory_dir: Output directory for screenshots,
            trajectory.json, and (when recording) recording.mp4.
        recording: When True, ask the env controller to screen-record the run.
    """
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    done = False
    step_num = 0

    if recording:
        # send a request to the server to start recording
        env.controller.start_recording()

    while not done and step_num < max_steps:
        actions = agent.predict(observation)
        step_num += 1
        for action in actions:
            # Capture the timestamp before executing the action
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_num, action)
            observation, reward, done, info = env.step(action)
            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            logger.info("Info: %s", info)
            # Save screenshot and trajectory information
            # NOTE(review): observation['screenshot'] appears to be a local
            # file path produced by the env -- confirm against DesktopEnv.
            with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
                with open(observation['screenshot'], "rb") as __f:
                    screenshot = __f.read()
                _f.write(screenshot)
            # Append one JSON-lines record per executed action.
            with open(trajectory_recording_path, "a") as f:
                f.write(json.dumps({
                    "step_num": step_num,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break

    def stop_recording():
        # Best-effort: failures while retrieving the recording are reported
        # but do not abort evaluation.
        try:
            env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
        except Exception as e:
            print(f"An error occurred while stopping the recording: {e}")

    # Bound the recording retrieval to 30s so a hung transfer cannot stall
    # the whole run.
    try:
        func_timeout.func_timeout(30, stop_recording)
    except func_timeout.exceptions.FunctionTimedOut:
        logger.info("Recording timed out.")

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # Final JSON-lines record carries the evaluation score.
    with open(trajectory_recording_path, "a") as f:
        f.write(json.dumps({
            "result": result
        }))
        f.write("\n")

    # env.close()
    logger.info("Environment closed.")
def main(example_class, example_id, gpt4_model = "gpt-4-vision-preview"):
    """Load one evaluation example and run it with a GPT-4V agent on the
    screenshot observation space.

    Args:
        example_class: Example category / app domain (subdirectory name under
            evaluation_examples/examples).
        example_id: UUID of the example JSON file to load.
        gpt4_model: OpenAI model name handed to GPT4v_Agent.
    """
    action_space = "pyautogui"
    gemini_model = "gemini-pro-vision"

    logger.info("Running example %s/%s", example_class, example_id)
    logger.info("Using model %s", gpt4_model)
    # logger.info("Using model %s", gemini_model)

    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
        example = json.load(f)
    # Pin the VM snapshot used for this batch of runs.
    example["snapshot"] = "exp_v5"

    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space,
                        exp="screenshot")
    #
    # api_key = os.environ.get("GENAI_API_KEY")
    # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot")

    root_trajectory_dir = "exp_trajectory"
    # Layout: exp_trajectory/screenshot/<class>/<model>/<example_id>/
    example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gpt4_model, example_id)
    # example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gemini_model, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Candidate example ids per app domain.  In this version of the script
    # every driver loop is commented out, so running it only builds the lists
    # and exits; batches were toggled on by uncommenting the relevant loop.
    chrome_list = [
        # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        # "06fe7178-4491-4589-810f-2e2bc9502122",
        # "e1e75309-3ddb-4d09-92ec-de869c928143",
        # "35253b65-1c19-4304-8aa4-6884b8218fc0",
        # "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        # "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
        # "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
        # "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "af630914-714e-4a24-a7bb-f9af687d3b91"
    ]
    calc_list = [
        "a9f325aa-8c05-4e4f-8341-9e4358565f4f",
        "ecb0df7a-4e8d-4a03-b162-053391d3afaf",
        "7efeb4b1-3d19-4762-b163-63328d66303b",
        "4e6fcf72-daf3-439f-a232-c434ce416af6",
        "6054afcb-5bab-4702-90a0-b259b5d3217c",
        "abed40dc-063f-4598-8ba5-9fe749c0615d",
        "01b269ae-2111-4a07-81fd-3fcd711993b0",
        "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
        "af2b02f7-acee-4be4-8b66-499fab394915",
        "da1d63b8-fa12-417b-ba18-f748e5f770f3",
        "636380ea-d5f6-4474-b6ca-b2ed578a20f1",
        "5ba77536-05c5-4aae-a9ff-6e298d094c3e",
        "4bc4eaf4-ca5e-4db2-8138-8d4e65af7c0b",
        "672a1b02-c62f-4ae2-acf0-37f5fb3052b0",
        "648fe544-16ba-44af-a587-12ccbe280ea6",
        "8985d1e4-5b99-4711-add4-88949ebb2308",
        "9e606842-2e27-43bf-b1d1-b43289c9589b",
        "fcb6e45b-25c4-4087-9483-03d714f473a9",
        "68c0c5b7-96f3-4e87-92a7-6c1b967fd2d2",
        "fff629ea-046e-4793-8eec-1a5a15c3eb35",
        "5c9a206c-bb00-4fb6-bb46-ee675c187df5",
        "e975ae74-79bd-4672-8d1c-dc841a85781d",
        "34a6938a-58da-4897-8639-9b90d6db5391",
        "b5a22759-b4eb-4bf2-aeed-ad14e8615f19",
        "2f9913a1-51ed-4db6-bfe0-7e1c95b3139e",
        "2558031e-401d-4579-8e00-3ecf540fb492",
        "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
        "4188d3a4-077d-46b7-9c86-23e1a036f6c1",
        "51b11269-2ca8-4b2a-9163-f21758420e78",
        "7e429b8d-a3f0-4ed0-9b58-08957d00b127",
        "347ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5",
        "3aaa4e37-dc91-482e-99af-132a612d40f3",
        "37608790-6147-45d0-9f20-1137bb35703d",
        "f9584479-3d0d-4c79-affa-9ad7afdd8850",
        "d681960f-7bc3-4286-9913-a8812ba3261a",
        "21df9241-f8d7-4509-b7f1-37e501a823f7",
        "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
        "357ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "aa3a8974-2e85-438b-b29e-a64df44deb4b",
        "a01fbce3-2793-461f-ab86-43680ccbae25",
        "4f07fbe9-70de-4927-a4d5-bb28bc12c52c"
    ]
    # for example_id in calc_list:
    #     main("libreoffice_calc", example_id)

    impress_list = [
        # "5d901039-a89c-4bfb-967b-bf66f4df075e",
        # "550ce7e7-747b-495f-b122-acdc4d0b8e54",
        # "455d3c66-7dc6-4537-a39a-36d3e9119df7",
        # "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
        # "c59742c0-4323-4b9d-8a02-723c251deaa0",
        # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf",
        # "9ec204e4-f0a3-42f8-8458-b772a6797cab",
        # "0f84bef9-9790-432e-92b7-eece357603fb",
        # "ce88f674-ab7a-43da-9201-468d38539e4a",
        # "3b27600c-3668-4abd-8f84-7bcdebbccbdb",
        # "a097acff-6266-4291-9fbd-137af7ecd439",
        # "bf4e9888-f10f-47af-8dba-76413038b73c",
        "21760ecb-8f62-40d2-8d85-0cee5725cb72"
    ]
    # for example_id in impress_list:
    #     main("libreoffice_impress", example_id)

    # gimp_list = [
    #     "7a4deb26-d57d-4ea9-9a73-630f66a7b568",
    #     "554785e9-4523-4e7a-b8e1-8016f565f56a",
    #     "77b8ab4d-994f-43ac-8930-8ca087d7c4b4",
    #     "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce",
    #     "d52d6308-ec58-42b7-a2c9-de80e4837b2b",
    #     "2a729ded-3296-423d-aec4-7dd55ed5fbb3",
    #     "b148e375-fe0b-4bec-90e7-38632b0d73c2",
    #     "a746add2-cab0-4740-ac36-c3769d9bfb46",
    #     "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d",
    #     "d16c99dc-2a1e-46f2-b350-d97c86c85c15",
    #     "06ca5602-62ca-47f6-ad4f-da151cde54cc",
    #     "e2dd0213-26db-4349-abe5-d5667bfd725c",
    #     "f723c744-e62c-4ae6-98d1-750d3cd7d79d",
    #     "72f83cdc-bf76-4531-9a1b-eb893a13f8aa",
    #     "7767eef2-56a3-4cea-8c9f-48c070c7d65b",
    #     "734d6579-c07d-47a8-9ae2-13339795476b"
    # ]
    #
    # for example_id in gimp_list:
    #     try:
    #         main("gimp", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    #
    vs_code_list = [
        # "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
        # "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
        # "eabc805a-bfcf-4460-b250-ac92135819f6",
        # "982d12a5-beab-424f-8d38-d2a48429e511",
        # "4e60007a-f5be-4bfc-9723-c39affa0a6d3",
        # "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
        # "9439a27b-18ae-42d8-9778-5f68f891805e",
        "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
        "930fdb3b-11a8-46fe-9bac-577332e2640e",
        "276cc624-87ea-4f08-ab93-f770e3790175",
        "9d425400-e9b2-4424-9a4b-d4c7abac4140"
    ]
    # for example_id in vs_code_list:
    #     try:
    #         main("vs_code", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue

    # multiple_list = [
    #     "f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
    #     "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
    #     "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
    #     "b52b40a5-ad70-4c53-b5b0-5650a8387052",
    #     "46407397-a7d5-4c6b-92c6-dbe038b1457b",
    #     "2b9493d7-49b8-493a-a71b-56cd1f4d6908",
    #     "51f5801c-18b3-4f25-b0c3-02f85507a078",
    #     "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5",
    #     "510f64c8-9bcc-4be1-8d30-638705850618",
    #     "937087b6-f668-4ba6-9110-60682ee33441",
    #     "ee9a3c83-f437-4879-8918-be5efbb9fac7",
    #     "3680a5ee-6870-426a-a997-eba929a0d25c",
    #     "e135df7c-7687-4ac0-a5f0-76b74438b53e",
    #     "58565672-7bfe-48ab-b828-db349231de6b",
    #     "2fe4b718-3bd7-46ec-bdce-b184f5653624"
    # ]
    #
    # for example_id in multiple_list:
    #     try:
    #         main("multi_apps", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue

View File

@@ -1,361 +0,0 @@
import datetime
import json
import logging
import os
import sys
import func_timeout
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# Logger Configs {{{ #
# Root-logger setup: four sinks (plain + debug files, stdout, desktopenv-only
# debug file), all stamped with the process start time.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
# NOTE(review): assumes a "logs" directory already exists -- verify.
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
# ANSI-colored format; escapes appear raw in the log files.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
# Restrict stdout and the sdebug file to the "desktopenv" logger namespace.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Experiment-specific child logger used below.
logger = logging.getLogger("desktopenv.experiment")

# Host-side path to the VMware .vmx file of the Ubuntu guest image.
# NOTE(review): this variant points at a second VM ("Ubuntu2") unlike the
# sibling scripts -- confirm that is intentional.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu2\Ubuntu2.vmx"
# PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    """Run one benchmark example inside the VM-backed desktop environment.

    Drives ``agent`` against a fresh ``DesktopEnv`` built from ``example`` for
    at most ``max_steps`` prediction rounds, saving one screenshot plus one
    JSON-lines trajectory record per executed action under
    ``example_trajectory_dir``, then evaluates the final state and appends the
    score to the trajectory file.

    Args:
        example: Task configuration dict handed to DesktopEnv as task_config.
        agent: Object exposing ``action_space`` and ``predict(observation)``
            returning a list of actions.
        max_steps: Maximum number of agent prediction rounds.
        example_trajectory_dir: Output directory for screenshots,
            trajectory.json, and (when recording) recording.mp4.
        recording: When True, ask the env controller to screen-record the run.
    """
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # reset the environment to certain snapshot
    observation = env.reset()
    done = False
    step_num = 0

    if recording:
        # send a request to the server to start recording
        env.controller.start_recording()

    while not done and step_num < max_steps:
        actions = agent.predict(observation)
        step_num += 1
        for action in actions:
            # Capture the timestamp before executing the action
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_num, action)
            observation, reward, done, info = env.step(action)
            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            logger.info("Info: %s", info)
            # Save screenshot and trajectory information
            # NOTE(review): observation['screenshot'] appears to be a local
            # file path produced by the env -- confirm against DesktopEnv.
            with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
                with open(observation['screenshot'], "rb") as __f:
                    screenshot = __f.read()
                _f.write(screenshot)
            # Append one JSON-lines record per executed action.
            with open(trajectory_recording_path, "a") as f:
                f.write(json.dumps({
                    "step_num": step_num,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break

    def stop_recording():
        # Best-effort: failures while retrieving the recording are reported
        # but do not abort evaluation.
        try:
            env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
        except Exception as e:
            print(f"An error occurred while stopping the recording: {e}")

    # Bound the recording retrieval to 30s so a hung transfer cannot stall
    # the whole run.
    try:
        func_timeout.func_timeout(30, stop_recording)
    except func_timeout.exceptions.FunctionTimedOut:
        logger.info("Recording timed out.")

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # Final JSON-lines record carries the evaluation score.
    with open(trajectory_recording_path, "a") as f:
        f.write(json.dumps({
            "result": result
        }))
        f.write("\n")

    # env.close()
    logger.info("Environment closed.")
def main(example_class, example_id, gpt4_model="gpt-4-vision-preview"):
    """Run one evaluation example with a GPT-4V agent in "both" mode.

    Args:
        example_class: Category folder of the example (e.g. ``"chrome"``).
        example_id: UUID filename (without extension) of the example JSON.
        gpt4_model: OpenAI vision model name used to drive the agent.
    """
    action_space = "pyautogui"
    gemini_model = "gemini-pro-vision"  # kept for the disabled Gemini path below
    logger.info("Running example %s/%s", example_class, example_id)
    logger.info("Using model %s", gpt4_model)
    # logger.info("Using model %s", gemini_model)
    example_path = f"evaluation_examples/examples/{example_class}/{example_id}.json"
    with open(example_path, "r", encoding="utf-8") as f:
        example = json.load(f)
    # Pin the VM snapshot that this run should start from.
    example["snapshot"] = "exp_v5"
    agent = GPT4v_Agent(
        api_key=os.environ.get("OPENAI_API_KEY"),
        model=gpt4_model,
        instruction=example['instruction'],
        action_space=action_space,
        exp="both",
    )
    # api_key = os.environ.get("GENAI_API_KEY")
    # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space, exp="both")
    # Trajectories are grouped as exp_trajectory/<mode>/<class>/<model>/<id>.
    example_trajectory_dir = os.path.join("exp_trajectory", "both", example_class, gpt4_model, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)
    run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
    # Example IDs grouped per application domain. Only the libreoffice_writer
    # batch at the bottom is currently executed; the other loops are kept
    # commented out so individual suites can be re-enabled quickly.
    os_list = [
        "94d95f96-9699-4208-98ba-3c3119edf9c2",
        "bedcedc4-4d72-425e-ad62-21960b11fe0d",
        "43c2d64c-bab5-4dcb-a30c-b888321c319a",
        "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
        "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3",
        "f9be0997-4b7c-45c5-b05c-4612b44a6118",
        "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2",
        "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57",
        "e0df059f-28a6-4169-924f-b9623e7184cc",
        "ddc75b62-7311-4af8-bfb3-859558542b36",
        "b6781586-6346-41cd-935a-a6b1487918fc",
        "3ce045a0-877b-42aa-8d2c-b4a863336ab8",
        "a4d98375-215b-4a4d-aee9-3d4370fccc41",
        "13584542-872b-42d8-b299-866967b5c3ef",
        "23393935-50c7-4a86-aeea-2b78fd089c5c"
    ]
    # for example_id in os_list:
    #     try:
    #         main("os", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    # NOTE(review): "347ef137-..." and "357ef137-..." below differ by one
    # digit but share the same tail -- confirm both IDs really exist.
    calc_list = [
        "a9f325aa-8c05-4e4f-8341-9e4358565f4f",
        "ecb0df7a-4e8d-4a03-b162-053391d3afaf",
        "7efeb4b1-3d19-4762-b163-63328d66303b",
        "4e6fcf72-daf3-439f-a232-c434ce416af6",
        "6054afcb-5bab-4702-90a0-b259b5d3217c",
        "abed40dc-063f-4598-8ba5-9fe749c0615d",
        "01b269ae-2111-4a07-81fd-3fcd711993b0",
        "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14",
        "af2b02f7-acee-4be4-8b66-499fab394915",
        "da1d63b8-fa12-417b-ba18-f748e5f770f3",
        "636380ea-d5f6-4474-b6ca-b2ed578a20f1",
        "5ba77536-05c5-4aae-a9ff-6e298d094c3e",
        "4bc4eaf4-ca5e-4db2-8138-8d4e65af7c0b",
        "672a1b02-c62f-4ae2-acf0-37f5fb3052b0",
        "648fe544-16ba-44af-a587-12ccbe280ea6",
        "8985d1e4-5b99-4711-add4-88949ebb2308",
        "9e606842-2e27-43bf-b1d1-b43289c9589b",
        "fcb6e45b-25c4-4087-9483-03d714f473a9",
        "68c0c5b7-96f3-4e87-92a7-6c1b967fd2d2",
        "fff629ea-046e-4793-8eec-1a5a15c3eb35",
        "5c9a206c-bb00-4fb6-bb46-ee675c187df5",
        "e975ae74-79bd-4672-8d1c-dc841a85781d",
        "34a6938a-58da-4897-8639-9b90d6db5391",
        "b5a22759-b4eb-4bf2-aeed-ad14e8615f19",
        "2f9913a1-51ed-4db6-bfe0-7e1c95b3139e",
        "2558031e-401d-4579-8e00-3ecf540fb492",
        "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
        "4188d3a4-077d-46b7-9c86-23e1a036f6c1",
        "51b11269-2ca8-4b2a-9163-f21758420e78",
        "7e429b8d-a3f0-4ed0-9b58-08957d00b127",
        "347ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5",
        "3aaa4e37-dc91-482e-99af-132a612d40f3",
        "37608790-6147-45d0-9f20-1137bb35703d",
        "f9584479-3d0d-4c79-affa-9ad7afdd8850",
        "d681960f-7bc3-4286-9913-a8812ba3261a",
        "21df9241-f8d7-4509-b7f1-37e501a823f7",
        "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17",
        "357ef137-7eeb-4c80-a3bb-0951f26a8aff",
        "aa3a8974-2e85-438b-b29e-a64df44deb4b",
        "a01fbce3-2793-461f-ab86-43680ccbae25",
        "4f07fbe9-70de-4927-a4d5-bb28bc12c52c"
    ]
    # for example_id in calc_list:
    #     try:
    #         main("libreoffice_calc", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    impress_list = [
        "5d901039-a89c-4bfb-967b-bf66f4df075e",
        "550ce7e7-747b-495f-b122-acdc4d0b8e54",
        "455d3c66-7dc6-4537-a39a-36d3e9119df7",
        "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
        "c59742c0-4323-4b9d-8a02-723c251deaa0",
        "ef9d12bd-bcee-4ba0-a40e-918400f43ddf",
        "9ec204e4-f0a3-42f8-8458-b772a6797cab",
        "0f84bef9-9790-432e-92b7-eece357603fb",
        "ce88f674-ab7a-43da-9201-468d38539e4a",
        "3b27600c-3668-4abd-8f84-7bcdebbccbdb",
        "a097acff-6266-4291-9fbd-137af7ecd439",
        "bf4e9888-f10f-47af-8dba-76413038b73c",
        "21760ecb-8f62-40d2-8d85-0cee5725cb72"
    ]
    # for example_id in impress_list:
    #     try:
    #         main("libreoffice_impress", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    vs_code_list = [
        "0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
        "53ad5833-3455-407b-bbc6-45b4c79ab8fb",
        "eabc805a-bfcf-4460-b250-ac92135819f6",
        "982d12a5-beab-424f-8d38-d2a48429e511",
        "4e60007a-f5be-4bfc-9723-c39affa0a6d3",
        "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
        "9439a27b-18ae-42d8-9778-5f68f891805e",
        "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
        "930fdb3b-11a8-46fe-9bac-577332e2640e",
        "276cc624-87ea-4f08-ab93-f770e3790175",
        "9d425400-e9b2-4424-9a4b-d4c7abac4140"
    ]
    # for example_id in vs_code_list:
    #     try:
    #         main("vs_code", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    multiple_list = [
        "f8cfa149-d1c1-4215-8dac-4a0932bad3c2",
        "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
        "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc",
        "b52b40a5-ad70-4c53-b5b0-5650a8387052",
        "46407397-a7d5-4c6b-92c6-dbe038b1457b",
        "2b9493d7-49b8-493a-a71b-56cd1f4d6908",
        "51f5801c-18b3-4f25-b0c3-02f85507a078",
        "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5",
        "510f64c8-9bcc-4be1-8d30-638705850618",
        "937087b6-f668-4ba6-9110-60682ee33441",
        "ee9a3c83-f437-4879-8918-be5efbb9fac7",
        "3680a5ee-6870-426a-a997-eba929a0d25c",
        "e135df7c-7687-4ac0-a5f0-76b74438b53e",
        "58565672-7bfe-48ab-b828-db349231de6b",
        "2fe4b718-3bd7-46ec-bdce-b184f5653624"
    ]
    # for example_id in multiple_list:
    #     try:
    #         main("multi_apps", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    chrome_list = [
        # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
        "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
        "06fe7178-4491-4589-810f-2e2bc9502122",
        "e1e75309-3ddb-4d09-92ec-de869c928143",
        "35253b65-1c19-4304-8aa4-6884b8218fc0",
        "2ad9387a-65d8-4e33-ad5b-7580065a27ca",
        "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
        "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
        "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
        "480bcfea-d68f-4aaa-a0a9-2589ef319381",
        "af630914-714e-4a24-a7bb-f9af687d3b91"
    ]
    # for example_id in chrome_list:
    #     try:
    #         main("chrome", example_id)
    #     except Exception as e:
    #         logger.error("An error occurred while running the example: %s", e)
    #         continue
    writer_list = [
        "6ada715d-3aae-4a32-a6a7-429b2e43fb93",
        "ecc2413d-8a48-416e-a3a2-d30106ca36cb",
        "0e47de2a-32e0-456c-a366-8c607ef7a9d2",
        "4bcb1253-a636-4df4-8cb0-a35c04dfef31",
        "0810415c-bde4-4443-9047-d5f70165a697",
        "e528b65e-1107-4b8c-8988-490e4fece599",
        "66399b0d-8fda-4618-95c4-bfc6191617e9",
        "936321ce-5236-426a-9a20-e0e3c5dc536f",
        "3ef2b351-8a84-4ff2-8724-d86eae9b842e",
        "0b17a146-2934-46c7-8727-73ff6b6483e8",
        "0e763496-b6bb-4508-a427-fad0b6c3e195",
        "f178a4a9-d090-4b56-bc4c-4b72a61a035d",
        "adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
        "0a0faba3-5580-44df-965d-f562a99b291c",
        "e246f6d8-78d7-44ac-b668-fcf47946cb50",
        "8472fece-c7dd-4241-8d65-9b3cd1a0b568",
        "88fe4b2d-3040-4c70-9a70-546a47764b48",
        "d53ff5ee-3b1a-431e-b2be-30ed2673079b",
        "72b810ef-4156-4d09-8f08-a0cf57e7cefe",
        "6f81754e-285d-4ce0-b59e-af7edb02d108",
        "b21acd93-60fd-4127-8a43-2f5178f4a830"
    ]
    # Active suite: run every writer example; failures are logged and skipped
    # so one broken example does not abort the whole batch.
    for example_id in writer_list:
        try:
            main("libreoffice_writer", example_id)
        except Exception as e:
            logger.error("An error occurred while running the example: %s", e)
            continue

View File

@@ -1,155 +0,0 @@
import ctypes
import datetime
import json
import logging
import os
import sys
import func_timeout
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# Logger Configs {{{ #
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #
logger = logging.getLogger("desktopenv.experiment")
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
env = DesktopEnv(
path_to_vm=PATH_TO_VM,
action_space=agent.action_space,
task_config=example
)
# reset the environment to certain snapshot
observation = env.reset()
done = False
step_num = 0
if recording:
# send a request to the server to start recording
env.controller.start_recording()
while not done and step_num < max_steps:
actions = agent.predict(observation)
step_num += 1
for action in actions:
# Capture the timestamp before executing the action
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
logger.info("Step %d: %s", step_num, action)
observation, reward, done, info = env.step(action)
logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done)
logger.info("Info: %s", info)
# Save screenshot and trajectory information
with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
with open(observation['screenshot'], "rb") as __f:
screenshot = __f.read()
_f.write(screenshot)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"step_num": step_num,
"action_timestamp": action_timestamp,
"action": action,
"reward": reward,
"done": done,
"info": info,
"screenshot_file": f"step_{step_num}_{action_timestamp}.png"
}))
f.write("\n")
if done:
logger.info("The episode is done.")
break
def stop_recording():
try:
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
except Exception as e:
print(f"An error occurred while stopping the recording: {e}")
try:
func_timeout.func_timeout(30, stop_recording)
except func_timeout.exceptions.FunctionTimedOut:
logger.info("Recording timed out.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
def main(example_class, example_id):
action_space = "pyautogui"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_v5"
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
action_space=action_space, exp="seeact")
# api_key = os.environ.get("GENAI_API_KEY")
# agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space)
root_trajectory_dir = "exp_trajectory"
example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gpt4_model, example_id)
# example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gemini_model, example_id)
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
xx_list = [
]
for example_id in xx_list:
main("xx", example_id)

View File

@@ -1,261 +0,0 @@
#import ctypes
import datetime
import json
import logging
import os
import sys
import func_timeout
from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent
# Logger Configs {{{ #
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)
formatter = logging.Formatter(
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #
logger = logging.getLogger("desktopenv.experiment")
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
env = DesktopEnv(
path_to_vm=PATH_TO_VM,
action_space=agent.action_space,
task_config=example
)
# reset the environment to certain snapshot
observation = env.reset()
done = False
step_num = 0
if recording:
# send a request to the server to start recording
env.controller.start_recording()
while not done and step_num < max_steps:
actions = agent.predict(observation)
step_num += 1
for action in actions:
# Capture the timestamp before executing the action
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
logger.info("Step %d: %s", step_num, action)
observation, reward, done, info = env.step(action)
logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done)
logger.info("Info: %s", info)
# Save screenshot and trajectory information
with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
with open(observation['screenshot'], "rb") as __f:
screenshot = __f.read()
_f.write(screenshot)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"step_num": step_num,
"action_timestamp": action_timestamp,
"action": action,
"reward": reward,
"done": done,
"info": info,
"screenshot_file": f"step_{step_num}_{action_timestamp}.png"
}))
f.write("\n")
if done:
logger.info("The episode is done.")
break
def stop_recording():
try:
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
except Exception as e:
print(f"An error occurred while stopping the recording: {e}")
try:
func_timeout.func_timeout(30, stop_recording)
except func_timeout.exceptions.FunctionTimedOut:
logger.info("Recording timed out.")
result = env.evaluate()
logger.info("Result: %.2f", result)
with open(trajectory_recording_path, "a") as f:
f.write(json.dumps({
"result": result
}))
f.write("\n")
# env.close()
logger.info("Environment closed.")
def main(example_class, example_id):
action_space = "pyautogui"
gpt4_model = "gpt-4-vision-preview"
gemini_model = "gemini-pro-vision"
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
example = json.load(f)
example["snapshot"] = "exp_v5"
logger.info("TASK: %s/%s", example_class, example_id)
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, max_tokens=1000, instruction=example['instruction'],
action_space=action_space, exp="som")
# api_key = os.environ.get("GENAI_API_KEY")
# agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space)
root_trajectory_dir = "exp_trajectory"
example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gpt4_model, example_id)
# example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gemini_model, example_id)
os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 15, example_trajectory_dir)
if __name__ == '__main__':
from tqdm import tqdm
# impress_list = [
# # "5d901039-a89c-4bfb-967b-bf66f4df075e",
# "550ce7e7-747b-495f-b122-acdc4d0b8e54",
# "455d3c66-7dc6-4537-a39a-36d3e9119df7",
# "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
# "c59742c0-4323-4b9d-8a02-723c251deaa0",
# "ef9d12bd-bcee-4ba0-a40e-918400f43ddf",
# "9ec204e4-f0a3-42f8-8458-b772a6797cab",
# "0f84bef9-9790-432e-92b7-eece357603fb",
# "ce88f674-ab7a-43da-9201-468d38539e4a",
# "3b27600c-3668-4abd-8f84-7bcdebbccbdb",
# "a097acff-6266-4291-9fbd-137af7ecd439",
# "bf4e9888-f10f-47af-8dba-76413038b73c",
# "21760ecb-8f62-40d2-8d85-0cee5725cb72"
# ]
# for example_id in impress_list:
# main("libreoffice_impress", example_id)
vlc_list = [
"8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
"8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89",
"8f080098-ddb1-424c-b438-4e96e5e4786e",
"bba3381f-b5eb-4439-bd9e-80c22218d5a7",
"fba2c100-79e8-42df-ae74-b592418d54f4",
"efcf0d81-0835-4880-b2fd-d866e8bc2294",
"8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f",
"aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6",
"386dbd0e-0241-4a0a-b6a2-6704fba26b1c",
"9195653c-f4aa-453d-aa95-787f6ccfaae9",
"d06f0d4d-2cd5-4ede-8de9-598629438c6e",
"a5bbbcd5-b398-4c91-83d4-55e1e31bbb81",
"f3977615-2b45-4ac5-8bba-80c17dbe2a37",
"215dfd39-f493-4bc3-a027-8a97d72c61bf"
]
# for example_id in tqdm(vlc_list):
# try:
# main("vlc", example_id)
# except Exception as e:
# print(f"An error occurred while running the example: {e}")
# continue
chrome_list = [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
"06fe7178-4491-4589-810f-2e2bc9502122",
"e1e75309-3ddb-4d09-92ec-de869c928143",
"35253b65-1c19-4304-8aa4-6884b8218fc0",
"2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"7a5a7856-f1b6-42a4-ade9-1ca81ca0f263",
"44ee5668-ecd5-4366-a6ce-c1c9b8d4e938",
"2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3",
"480bcfea-d68f-4aaa-a0a9-2589ef319381",
"af630914-714e-4a24-a7bb-f9af687d3b91"
]
for example_id in tqdm(chrome_list):
try:
main("chrome", example_id)
except Exception as e:
print(f"An error occurred while running the example: {e}")
continue
vs_code_list = [
"0ed39f63-6049-43d4-ba4d-5fa2fe04a951",
"53ad5833-3455-407b-bbc6-45b4c79ab8fb",
"eabc805a-bfcf-4460-b250-ac92135819f6",
"982d12a5-beab-424f-8d38-d2a48429e511",
"4e60007a-f5be-4bfc-9723-c39affa0a6d3",
"e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2",
"9439a27b-18ae-42d8-9778-5f68f891805e",
"ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae",
"930fdb3b-11a8-46fe-9bac-577332e2640e",
"276cc624-87ea-4f08-ab93-f770e3790175",
"9d425400-e9b2-4424-9a4b-d4c7abac4140"
]
for example_id in tqdm(vs_code_list):
try:
main("vs_code", example_id)
except Exception as e:
print(f"An error occurred while running the example: {e}")
continue
thunderbird_list = [
"bb5e4c0d-f964-439c-97b6-bdb9747de3f4",
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3",
"12086550-11c0-466b-b367-1d9e75b3910e",
"06fe7178-4491-4589-810f-2e2bc9502122",
"6766f2b8-8a72-417f-a9e5-56fcaa735837",
"e1e75309-3ddb-4d09-92ec-de869c928143",
"3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5",
"35253b65-1c19-4304-8aa4-6884b8218fc0",
"d088f539-cab4-4f9a-ac92-9999fc3a656e",
"2ad9387a-65d8-4e33-ad5b-7580065a27ca",
"480bcfea-d68f-4aaa-a0a9-2589ef319381",
"030eeff7-b492-4218-b312-701ec99ee0cc",
"94760984-3ff5-41ee-8347-cf1af709fea0",
"99146c54-4f37-4ab8-9327-5f3291665e1e",
"c9e7eaf2-b1a1-4efc-a982-721972fa9f02"
]
for example_id in tqdm(thunderbird_list):
try:
main("thunderbird", example_id)
except Exception as e:
print(f"An error occurred while running the example: {e}")
continue

72
lib_run_single.py Normal file
View File

@@ -0,0 +1,72 @@
import datetime
import json
import logging
import os
# import wandb
from wrapt_timeout_decorator import *
# Module-scoped logger for the experiment runner.
logger = logging.getLogger("desktopenv.experiment")
# Open the JSON file
with open("./settings.json", "r") as file:
    # Load the JSON data from the file
    data = json.load(file)
# Per-example wall-clock limit (seconds) consumed by the @timeout decorator
# on run_single_example below.
# NOTE(review): this file read happens at import time and raises if
# settings.json is absent from the CWD -- confirm callers always run from
# the repository root.
time_limit = data["time_limit"]
# Abort the whole example if it exceeds time_limit seconds
# (use_signals=False so the timeout also works off the main thread).
@timeout(time_limit, use_signals=False)
def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    """Run one benchmark example, logging screenshots and a JSONL trajectory.

    Args:
        agent: Agent exposing ``reset()`` and ``predict(instruction, obs)``.
        env: Desktop environment exposing ``reset``/``step``/``evaluate`` and
            a ``controller`` for screen recording.
        example: Task configuration dict for ``env.reset``.
        max_steps: Maximum number of agent prediction rounds.
        instruction: Natural-language task instruction passed to the agent.
        args: Namespace providing ``sleep_after_execution``.
        example_result_dir: Directory receiving screenshots, ``traj.jsonl``,
            ``result.txt`` and ``recording.mp4``.
        scores: Mutable list; the final evaluation score is appended to it.
    """
    agent.reset()
    obs = env.reset(task_config=example)
    done = False
    step_idx = 0
    env.controller.start_recording()
    # str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"])
    while not done and step_idx < max_steps:
        response, actions = agent.predict(
            instruction,
            obs
        )
        for action in actions:
            # Capture the timestamp before executing the action
            # so the screenshot filename matches the trajectory entry.
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)
            obs, reward, done, info = env.step(action, args.sleep_after_execution)
            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            # Save screenshot and trajectory information
            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
                      "wb") as _f:
                with open(obs['screenshot'], "rb") as __f:
                    screenshot = __f.read()
                _f.write(screenshot)
            # get a11tree and save to wandb
            # NOTE(review): the fetched tree is only consumed by the
            # commented-out wandb logging below; currently unused at runtime.
            thisrun_a11tree = env.controller.get_accessibility_tree()
            # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"),
            #                    thisrun_a11tree,
            #                    response, action, action_timestamp, done)
            # run.log({"Reward": reward})
            # Append one JSON object per action to the trajectory log.
            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                f.write(json.dumps({
                    "step_num": step_idx + 1,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break
        step_idx += 1
    # run.log({"str_trajectory": str_table})
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
    # run.log({"Result": result})

36
main.py
View File

@@ -47,38 +47,38 @@ def human_agent():
Runs the Gym environment with human input.
"""
parser = argparse.ArgumentParser()
parser.add_argument('-p', '--path', type=str, required=True, help="Path to the virtual machine .vmx file.")
parser.add_argument('-s', '--snapshot', type=str, help="Name of the snapshot to restore.")
parser.add_argument('-p', '--path', type=str, default=r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu3\Ubuntu3.vmx", help="Path to the virtual machine .vmx file.")
parser.add_argument('-s', '--snapshot', type=str, default='init_state', help="Name of the snapshot to restore.")
parser.add_argument('-e', '--example', type=str, help="Path to the example json file.")
args = parser.parse_args(sys.argv[1:])
example_path = args.example if args.example is not None and os.path.exists(args.example) else \
'evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json'
with open(example_path, "r") as f:
'evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json'
with open(example_path, "r", encoding="utf-8") as f:
example = json.load(f)
# change to your customized snapshot
if args.snapshot is not None: example["snapshot"] = args.snapshot
if args.snapshot is not None:
example['snapshot'] = args.snapshot
assert os.path.exists(args.path), "The specified path to the .vmx file does not exist."
env = DesktopEnv(
path_to_vm=args.path,
action_space="computer_13",
task_config=example
snapshot_name=args.snapshot,
action_space="computer_13"
)
# reset the environment to certain snapshot
observation = env.reset()
logger.info('\x1b[32m[TASK INSTRUCTION]: \x1b[32;3m%s\x1b[0m', example["instruction"])
observation = env.reset(task_config=example)
done = False
logger.info('\x1b[32m[TASK INSTRUCTION]: \x1b[32;3m%s\x1b[0m', example["instruction"])
trajectory = [
# {
# "action_type": "MOVE_TO",
# "parameters": {
# "x": 754,
# "y": 1057
# }
# },
# {"action_type": "CLICK", "parameters": {"button": "right", "num_clicks": 1}}
{
"action_type": "MOVE_TO", #
"parameters": {
"x": 754,
"y": 1057
}
},
{"action_type": "CLICK", "parameters": {"button": "right", "num_clicks": 1}}
]
for i in range(len(trajectory)):

View File

@@ -26,7 +26,7 @@ def find_leaf_nodes(xlm_file_str):
state_ns = "uri:deskat:state.at-spi.gnome.org"
component_ns = "uri:deskat:component.at-spi.gnome.org"
def judge_node(node: ET, platform="ubuntu") -> bool:
def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
keeps: bool = node.tag.startswith("document")\
or node.tag.endswith("item")\
or node.tag.endswith("button")\
@@ -55,23 +55,25 @@ def judge_node(node: ET, platform="ubuntu") -> bool:
or platform=="windows"\
and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
)\
and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
)\
and (node.get("name", "") != "" or node.text is not None and len(node.text)>0)
and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
)\
and ( node.get("name", "") != "" or node.text is not None and len(node.text)>0\
or check_image and node.get("image", "false")=="true"
)
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
return keeps
def filter_nodes(root: ET, platform="ubuntu"):
def filter_nodes(root: ET, platform="ubuntu", check_image=False):
filtered_nodes = []
for node in root.iter():
if judge_node(node, platform):
if judge_node(node, platform, check_image):
filtered_nodes.append(node)
#print(ET.tostring(node, encoding="unicode"))
@@ -155,12 +157,12 @@ def print_nodes_with_indent(nodes, indent=0):
if __name__ == '__main__':
import json
with open('4.json', 'r', encoding='utf-8') as f:
xml_file_str = json.load(f)["AT"]
with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
xml_file_str = f.read()
filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
print(len(filtered_nodes))
masks = draw_bounding_boxes( filtered_nodes, '4.png'
, '4.a.png'
masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
, 'selection_sorted(imaged).ai.png'
)
# print(masks)

View File

@@ -5,10 +5,10 @@ import os
import re
import time
import uuid
import xml.etree.ElementTree as ET
from http import HTTPStatus
from io import BytesIO
from typing import Dict, List
import xml.etree.ElementTree as ET
import backoff
import dashscope
@@ -16,20 +16,13 @@ import google.generativeai as genai
import openai
import requests
from PIL import Image
from openai import (
APIConnectionError,
APIError,
RateLimitError
)
from google.api_core.exceptions import InvalidArgument
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import filter_nodes, draw_bounding_boxes
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
SYS_PROMPT_IN_A11Y_OUT_CODE, SYS_PROMPT_IN_A11Y_OUT_ACTION, \
SYS_PROMPT_IN_BOTH_OUT_CODE, SYS_PROMPT_IN_BOTH_OUT_ACTION, \
SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT
import logging
SYS_PROMPT_IN_SOM_OUT_TAG
logger = logging.getLogger("desktopenv.agent")
@@ -41,10 +34,10 @@ def encode_image(image_path):
def linearize_accessibility_tree(accessibility_tree):
#leaf_nodes = find_leaf_nodes(accessibility_tree)
# leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
linearized_accessibility_tree = "tag\tname\ttext\tposition\tsize\n"
linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
@@ -72,7 +65,8 @@ def tag_screenshot(screenshot, accessibility_tree):
uuid_str = str(uuid.uuid4())
os.makedirs("tmp/images", exist_ok=True)
tagged_screenshot_file_path = os.path.join("tmp/images", uuid_str + ".png")
nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
# nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
# Make tag screenshot
marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
@@ -168,79 +162,66 @@ def parse_code_from_som_string(input_string, masks):
return actions
class GPT4v_Agent:
class PromptAgent:
def __init__(
self,
api_key,
instruction,
model="gpt-4-vision-preview",
max_tokens=500,
max_tokens=1500,
top_p=0.9,
temperature=0.5,
action_space="computer_13",
exp="screenshot_a11y_tree"
# exp can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som", "seeact"]
observation_type="screenshot_a11y_tree",
# observation_type can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"]
max_trajectory_length=3
):
self.instruction = instruction
self.model = model
self.max_tokens = max_tokens
self.top_p = top_p
self.temperature = temperature
self.action_space = action_space
self.exp = exp
self.max_trajectory_length = 3
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
self.observation_type = observation_type
self.max_trajectory_length = max_trajectory_length
self.thoughts = []
self.actions = []
self.observations = []
if exp == "screenshot":
if observation_type == "screenshot":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "a11y_tree":
elif observation_type == "a11y_tree":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_A11Y_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "both":
elif observation_type == "screenshot_a11y_tree":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_BOTH_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "som":
elif observation_type == "som":
if action_space == "computer_13":
raise ValueError("Invalid action space: " + action_space)
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_SOM_A11Y_OUT_TAG
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "seeact":
if action_space == "computer_13":
raise ValueError("Invalid action space: " + action_space)
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_SEEACT
self.system_message = SYS_PROMPT_IN_SOM_OUT_TAG
else:
raise ValueError("Invalid action space: " + action_space)
else:
raise ValueError("Invalid experiment type: " + exp)
raise ValueError("Invalid experiment type: " + observation_type)
self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(
self.instruction)
def predict(self, obs: Dict) -> List:
def predict(self, instruction: str, obs: Dict) -> List:
"""
Predict the next action(s) based on the current observation.
"""
system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)
# Prepare the payload for the API call
messages = []
@@ -251,7 +232,7 @@ class GPT4v_Agent:
"content": [
{
"type": "text",
"text": self.system_message
"text": system_message
},
]
})
@@ -272,7 +253,7 @@ class GPT4v_Agent:
for previous_obs, previous_action, previous_thought in zip(_observations, _actions, _thoughts):
# {{{1
if self.exp == "both":
if self.observation_type == "screenshot_a11y_tree":
_screenshot = previous_obs["screenshot"]
_linearized_accessibility_tree = previous_obs["accessibility_tree"]
logger.debug("LINEAR AT: %s", _linearized_accessibility_tree)
@@ -294,18 +275,15 @@ class GPT4v_Agent:
}
]
})
elif self.exp in ["som", "seeact"]:
elif self.observation_type in ["som"]:
_screenshot = previous_obs["screenshot"]
_linearized_accessibility_tree = previous_obs["accessibility_tree"]
logger.debug("LINEAR AT: %s", _linearized_accessibility_tree)
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
_linearized_accessibility_tree)
"text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
},
{
"type": "image_url",
@@ -316,7 +294,7 @@ class GPT4v_Agent:
}
]
})
elif self.exp == "screenshot":
elif self.observation_type == "screenshot":
_screenshot = previous_obs["screenshot"]
messages.append({
@@ -335,7 +313,7 @@ class GPT4v_Agent:
}
]
})
elif self.exp == "a11y_tree":
elif self.observation_type == "a11y_tree":
_linearized_accessibility_tree = previous_obs["accessibility_tree"]
messages.append({
@@ -349,7 +327,7 @@ class GPT4v_Agent:
]
})
else:
raise ValueError("Invalid experiment type: " + self.exp) # 1}}}
raise ValueError("Invalid observation_type type: " + self.observation_type) # 1}}}
messages.append({
"role": "assistant",
@@ -362,11 +340,11 @@ class GPT4v_Agent:
})
# {{{1
if self.exp in ["screenshot", "both"]:
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
base64_image = encode_image(obs["screenshot"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
if self.exp == "both":
if self.observation_type == "screenshot_a11y_tree":
self.observations.append({
"screenshot": base64_image,
"accessibility_tree": linearized_accessibility_tree
@@ -383,7 +361,7 @@ class GPT4v_Agent:
{
"type": "text",
"text": "Given the screenshot as below. What's the next step that you will do to help with the task?"
if self.exp == "screenshot"
if self.observation_type == "screenshot"
else "Given the screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
@@ -396,7 +374,7 @@ class GPT4v_Agent:
}
]
})
elif self.exp == "a11y_tree":
elif self.observation_type == "a11y_tree":
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
self.observations.append({
@@ -414,15 +392,13 @@ class GPT4v_Agent:
}
]
})
elif self.exp == "som":
elif self.observation_type == "som":
# Add som to the screenshot
masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
self.observations.append({
"screenshot": base64_image,
"accessibility_tree": linearized_accessibility_tree
"screenshot": base64_image
})
messages.append({
@@ -430,35 +406,7 @@ class GPT4v_Agent:
"content": [
{
"type": "text",
"text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/png;base64,{base64_image}",
"detail": "high"
}
}
]
})
elif self.exp == "seeact":
# Add som to the screenshot
masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
self.observations.append({
"screenshot": base64_image,
"accessibility_tree": linearized_accessibility_tree
})
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": ACTION_DESCRIPTION_PROMPT_SEEACT.format(linearized_accessibility_tree)
"text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
},
{
"type": "image_url",
@@ -470,141 +418,244 @@ class GPT4v_Agent:
]
})
else:
raise ValueError("Invalid experiment type: " + self.exp) # 1}}}
with open("messages.json", "w") as f:
f.write(json.dumps(messages, indent=4))
raise ValueError("Invalid observation_type type: " + self.observation_type) # 1}}}
# with open("messages.json", "w") as f:
# f.write(json.dumps(messages, indent=4))
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
"max_tokens": self.max_tokens,
"top_p": self.top_p,
"temperature": self.temperature
})
logger.debug("RESPONSE: %s", response)
if self.exp == "seeact":
messages.append({
"role": "assistant",
"content": [
{
"type": "text",
"text": response
}
]
})
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "{}\n\nWhat's the next step that you will do to help with the task?".format(
ACTION_GROUNDING_PROMPT_SEEACT)
}
]
})
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
})
print(response)
logger.info("RESPONSE: %s", response)
try:
actions = self.parse_actions(response, masks)
self.thoughts.append(response)
except Exception as e:
except ValueError as e:
print("Failed to parse action from response", e)
actions = None
self.thoughts.append("")
return actions
return response, actions
@backoff.on_exception(
backoff.expo,
(APIError, RateLimitError, APIConnectionError),
max_tries=10
# here you should add more model exceptions as you want,
# but you are forbidden to add "Exception", that is, a common type of exception
# because we want to catch this kind of Exception in the outside to ensure each example won't exceed the time limit
(openai.RateLimitError,
openai.BadRequestError,
openai.InternalServerError,
InvalidArgument),
max_tries=5
)
def call_llm(self, payload):
if self.model.startswith("gpt"):
headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}"
}
logger.info("Generating content with GPT model: %s", self.model)
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
headers=headers,
json=payload
)
if response.status_code != 200:
if response.json()['error']['code'] == "context_length_exceeded":
print("Context length exceeded. Retrying with a smaller context.")
payload["messages"] = payload["messages"][-1:]
logger.error("Context length exceeded. Retrying with a smaller context.")
payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:]
retry_response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
headers=headers,
json=payload
)
if retry_response.status_code != 200:
print("Failed to call LLM: " + retry_response.text)
logger.error(
"Failed to call LLM even after attempt on shortening the history: " + retry_response.text)
return ""
print("Failed to call LLM: " + response.text)
logger.error("Failed to call LLM: " + response.text)
time.sleep(5)
return ""
else:
return response.json()['choices'][0]['message']['content']
elif self.model.startswith("mistral"):
print("call mistral")
elif self.model.startswith("claude"):
messages = payload["messages"]
max_tokens = payload["max_tokens"]
top_p = payload["top_p"]
temperature = payload["temperature"]
claude_messages = []
for i, message in enumerate(messages):
claude_message = {
"role": message["role"],
"content": []
}
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
for part in message["content"]:
if part['type'] == "image_url":
image_source = {}
image_source["type"] = "base64"
image_source["media_type"] = "image/png"
image_source["data"] = part['image_url']['url'].replace("data:image/png;base64,", "")
claude_message['content'].append({"type": "image", "source": image_source})
if part['type'] == "text":
claude_message['content'].append({"type": "text", "text": part['text']})
claude_messages.append(claude_message)
# the claude not support system message in our endpoint, so we concatenate it at the first user message
if claude_messages[0]['role'] == "system":
claude_system_message_item = claude_messages[0]['content'][0]
claude_messages[1]['content'].insert(0, claude_system_message_item)
claude_messages.pop(0)
# headers = {
# "x-api-key": os.environ["ANTHROPIC_API_KEY"],
# "anthropic-version": "2023-06-01",
# "content-type": "application/json"
# }
headers = {
"Accept": "application / json",
"Authorization": "Bearer " + os.environ["ANTHROPIC_API_KEY"],
"User-Agent": "Apifox/1.0.0 (https://apifox.com)",
"Content-Type": "application/json"
}
payload = {
"model": self.model,
"max_tokens": max_tokens,
"messages": claude_messages,
"temperature": temperature,
"top_p": top_p
}
response = requests.post(
# "https://chat.claude.com/v1/chat/completions",
"https://api.aigcbest.top/v1/chat/completions",
headers=headers,
json=payload
)
if response.status_code != 200:
logger.error("Failed to call LLM: " + response.text)
time.sleep(5)
return ""
# else:
# return response.json()['content'][0]['text']
else:
return response.json()['choices'][0]['message']['content']
elif self.model.startswith("mistral"):
print("Call mistral")
messages = payload["messages"]
max_tokens = payload["max_tokens"]
top_p = payload["top_p"]
temperature = payload["temperature"]
misrtal_messages = []
for i, message in enumerate(messages):
mistral_message = {
"role": message["role"],
"content": []
"content": ""
}
for part in message["content"]:
mistral_message['content'] = part['text'] if part['type'] == "text" else None
mistral_message['content'] = part['text'] if part['type'] == "text" else ""
misrtal_messages.append(mistral_message)
# the mistral not support system message in our endpoint, so we concatenate it at the first user message
if misrtal_messages[0]['role'] == "system":
misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
misrtal_messages.pop(0)
# openai.api_base = "http://localhost:8000/v1"
# openai.api_key = "test"
# response = openai.ChatCompletion.create(
# messages=misrtal_messages,
# model="Mixtral-8x7B-Instruct-v0.1"
# )
from openai import OpenAI
TOGETHER_API_KEY = "d011650e7537797148fb6170ec1e0be7ae75160375686fae02277136078e90d2"
client = OpenAI(api_key=TOGETHER_API_KEY,
client = OpenAI(api_key=os.environ["TOGETHER_API_KEY"],
base_url='https://api.together.xyz',
)
logger.info("Generating content with Mistral model: %s", self.model)
response = client.chat.completions.create(
messages=misrtal_messages,
model="mistralai/Mixtral-8x7B-Instruct-v0.1",
max_tokens=1024
model=self.model,
max_tokens=max_tokens
)
try:
# return response['choices'][0]['message']['content']
return response.choices[0].message.content
except Exception as e:
print("Failed to call LLM: " + str(e))
return ""
elif self.model.startswith("THUDM"):
# THUDM/cogagent-chat-hf
print("Call CogAgent")
messages = payload["messages"]
max_tokens = payload["max_tokens"]
top_p = payload["top_p"]
temperature = payload["temperature"]
cog_messages = []
for i, message in enumerate(messages):
cog_message = {
"role": message["role"],
"content": []
}
for part in message["content"]:
if part['type'] == "image_url":
cog_message['content'].append(
{"type": "image_url", "image_url": {"url": part['image_url']['url']}})
if part['type'] == "text":
cog_message['content'].append({"type": "text", "text": part['text']})
cog_messages.append(cog_message)
# the cogagent not support system message in our endpoint, so we concatenate it at the first user message
if cog_messages[0]['role'] == "system":
cog_system_message_item = cog_messages[0]['content'][0]
cog_messages[1]['content'].insert(0, cog_system_message_item)
cog_messages.pop(0)
payload = {
"model": self.model,
"max_tokens": max_tokens,
"messages": cog_messages
}
base_url = "http://127.0.0.1:8000"
response = requests.post(f"{base_url}/v1/chat/completions", json=payload, stream=False)
if response.status_code == 200:
decoded_line = response.json()
content = decoded_line.get("choices", [{}])[0].get("message", "").get("content", "")
return content
else:
print("Failed to call LLM: ", response.status_code)
return ""
elif self.model.startswith("gemini"):
def encoded_img_to_pil_img(data_str):
base64_str = data_str.replace("data:image/png;base64,", "")
@@ -615,6 +666,8 @@ class GPT4v_Agent:
messages = payload["messages"]
max_tokens = payload["max_tokens"]
top_p = payload["top_p"]
temperature = payload["temperature"]
gemini_messages = []
for i, message in enumerate(messages):
@@ -645,24 +698,45 @@ class GPT4v_Agent:
gemini_messages[1]['parts'][0] = gemini_messages[0]['parts'][0] + "\n" + gemini_messages[1]['parts'][0]
gemini_messages.pop(0)
print(gemini_messages)
# since the gemini-pro-vision donnot support multi-turn message
if self.model == "gemini-pro-vision":
message_history_str = ""
for message in gemini_messages:
message_history_str += "<|" + message['role'] + "|>\n" + message['parts'][0] + "\n"
gemini_messages = [{"role": "user", "parts": [message_history_str, gemini_messages[-1]['parts'][1]]}]
# gemini_messages[-1]['parts'][1].save("output.png", "PNG")
# print(gemini_messages)
api_key = os.environ.get("GENAI_API_KEY")
assert api_key is not None, "Please set the GENAI_API_KEY environment variable"
genai.configure(api_key=api_key)
logger.info("Generating content with Gemini model: %s", self.model)
response = genai.GenerativeModel(self.model).generate_content(
gemini_messages,
generation_config={
"max_output_tokens": max_tokens
"candidate_count": 1,
"max_output_tokens": max_tokens,
"top_p": top_p,
"temperature": temperature
},
safety_settings={
"harassment": "block_none",
"hate": "block_none",
"sex": "block_none",
"danger": "block_none"
}
)
try:
return response.text
except Exception as e:
logger.error("Meet exception when calling Gemini API, " + str(e))
return ""
elif self.model.startswith("qwen"):
messages = payload["messages"]
max_tokens = payload["max_tokens"]
top_p = payload["top_p"]
temperature = payload["temperature"]
qwen_messages = []
@@ -673,13 +747,16 @@ class GPT4v_Agent:
}
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
for part in message["content"]:
qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None
qwen_message['content'].append({"image": part['image_url']['url']}) if part[
'type'] == "image_url" else None
qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None
qwen_messages.append(qwen_message)
response = dashscope.MultiModalConversation.call(model='qwen-vl-plus',
messages=messages)
response = dashscope.MultiModalConversation.call(
model='qwen-vl-plus',
messages=messages, # todo: add the hyperparameters
)
# The response status_code is HTTPStatus.OK indicate success,
# otherwise indicate request is failed, you can get error code
# and message from code and message.
@@ -698,7 +775,7 @@ class GPT4v_Agent:
def parse_actions(self, response: str, masks=None):
if self.exp in ["screenshot", "a11y_tree", "both"]:
if self.observation_type in ["screenshot", "a11y_tree", "screenshot_a11y_tree"]:
# parse from the response
if self.action_space == "computer_13":
actions = parse_actions_from_string(response)
@@ -710,7 +787,7 @@ class GPT4v_Agent:
self.actions.append(actions)
return actions
elif self.exp in ["som", "seeact"]:
elif self.observation_type in ["som"]:
# parse from the response
if self.action_space == "computer_13":
raise ValueError("Invalid action space: " + self.action_space)
@@ -722,3 +799,8 @@ class GPT4v_Agent:
self.actions.append(actions)
return actions
def reset(self):
self.thoughts = []
self.actions = []
self.observations = []

View File

@@ -1,401 +0,0 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"
# Resume Logistic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: False
# Logging and Debug
WANDB: False
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false
# Speed up training
FP16: false
PORT: '36873'
# misc
LOADER:
JOINT: False
KEY_DATASET: 'coco'
##################
# Task settings
##################
VERBOSE: true
MODEL:
NAME: seem_model_v1
HEAD: xdecoder_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
TEXT:
ARCH: vlpencoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 77 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: focal
PRETRAINED: ''
LOAD_PRETRAINED: false
FOCAL:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [2, 2, 18, 2]
FOCAL_LEVELS: [4, 4, 4, 4]
FOCAL_WINDOWS: [3, 3, 3, 3]
DROP_PATH_RATE: 0.3
MLP_RATIO: 4.0
DROP_RATE: 0.0
PATCH_NORM: True
USE_CONV_EMBED: True
SCALING_MODULATOR: True
USE_CHECKPOINT: False
USE_POSTLN: true
USE_POSTLN_IN_MODULATION: false
USE_LAYERSCALE: True
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
OUT_INDICES: [0, 1, 2, 3]
ENCODER:
NAME: transformer_encoder_fpn
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 512
MASK_DIM: 512
NORM: "GN"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
DECODER:
NAME: seem_v1
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK:
ENABLED: True
DETECTION: False
SPATIAL:
ENABLED: True
MAX_ITER: 1
GROUNDING:
ENABLED: True
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
RETRIEVAL:
ENABLED: False
LVIS:
ENABLED: True
THRES: 0.7
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.5
SIM_THRES: 0.95
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
GCLASS_WEIGHT: 0.4
GMASK_WEIGHT: 1.0
GDICE_WEIGHT: 1.0
SCLASS_WEIGHT: 0.4
SMASK_WEIGHT: 1.0
SDICE_WEIGHT: 1.0
OCLASS_WEIGHT: 0.4
OMASK_WEIGHT: 1.0
ODICE_WEIGHT: 1.0
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BBOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
CAPTION_WEIGHT: 2.0
COST_SPATIAL:
CLASS_WEIGHT: 5.0
MASK_WEIGHT: 2.0
DICE_WEIGHT: 2.0
HIDDEN_DIM: 512
NUM_OBJECT_QUERIES: 101
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
MAX_SPATIAL_LEN: [512, 512, 512, 512]
# ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TOP_GROUNDING_LAYERS: 10
TOP_CAPTION_LAYERS: 10
TOP_SPATIAL_LAYERS: 10
TOP_OPENIMAGE_LAYERS: 10
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
# Spatial sampler
STROKE_SAMPLER:
MAX_CANDIDATE: 1
CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
DILATION: 3
CIRCLE:
NUM_STROKES: 5
STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
STROKE_PROB: [0.33, 0.33, 0.33]
SCRIBBLE:
NUM_STROKES: 5
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
STROKE_PROB: [0.5, 0.5]
POINT:
NUM_POINTS: 20
POLYGON:
MAX_POINTS: 9
EVAL:
MODE: 'best' # best/random/best_random
NEGATIVE: False
MAX_ITER: 20
IOU_ITER: 1
GROUNDING: False
# Multi-modal Architecture, order matters
ATTENTION_ARCH:
VARIABLE:
queries: ['object', 'grounding', 'spatial']
tokens: ['grounding', 'spatial']
memories: ['spatial']
SELF_ATTENTION:
queries:
object: ['queries_object']
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
tokens:
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['tokens_spatial']
memories:
spatial: ['memories_spatial']
CROSS_ATTENTION:
queries:
object: True
grounding: True
spatial: True
memories:
spatial: True
tokens:
grounding: False
spatial: False
MASKING: ['tokens_spatial', 'tokens_grounding']
DUPLICATION:
queries:
grounding: 'queries_object'
spatial: 'queries_object'
SPATIAL_MEMORIES: 32
QUERY_NUMBER: 3
DATASETS:
TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",]
# TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well
# TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 4
BATCH_SIZE_PER_GPU: 4
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TRAIN_SAMPLING: 'choice'
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
# Validation dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: 640
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: (640, 640)
SINGLE_CATEGORY_MAX_AREA: 1.0
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
SBD:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
VOC:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
DAVIS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
VOS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
SPATIAL: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR: 0.0001
STEPS: [0.88889, 0.96296]
MAX_ITER: 1
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
FIX_PARAM:
backbone: True
lang_encoder: True
pixel_decoder: True
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 5.0 # 0.01
NORM_TYPE: 2.0
MAX_NUM_EPOCHS: 50

View File

@@ -1,524 +0,0 @@
# ------------------------------------------------------------------------
# Semantic SAM
# Copyright (c) MicroSoft, Inc. and its affiliates.
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
# ------------------------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: interactive_mask_dino
HEAD: general_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: noencoder # no language encoder for training only sa-1b data
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 384
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [ 2, 2, 18, 2 ]
NUM_HEADS: [ 6, 12, 24, 48 ]
WINDOW_SIZE: 12
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 1
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: interactive_mask_dino
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
PART: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
IOU_WEIGHT: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 0
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: False
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: False
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: False
O365: False
SAM: True
PASCAL: False
RE_POINT: True
NUM_INTERACTIVE_TOKENS: 6
MAX_NUM_INSTANCE: 60
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
BOX_INTERACTIVE: False
CLASSIFICATION_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["sam_train"]
# interactive segmentation evaluation.
TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
# TEST: ["sam_minival"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
#ADE20K:
# INPUT:
# MIN_SIZE_TRAIN: 640
# MIN_SIZE_TRAIN_SAMPLING: "choice"
# MIN_SIZE_TEST: 640
# MAX_SIZE_TRAIN: 2560
# MAX_SIZE_TEST: 2560
# MASK_FORMAT: "polygon"
# CROP:
# ENABLED: True
# TYPE: "absolute"
# SIZE: (640, 640)
# SINGLE_CATEGORY_MAX_AREA: 1.0
# COLOR_AUG_SSD: True
# SIZE_DIVISIBILITY: 640 # used in dataset mapper
# DATASET_MAPPER_NAME: "mask_former_panoptic"
# FORMAT: "RGB"
# DATASET:
# DATASET: 'ade'
# TEST:
# BATCH_SIZE_TOTAL: 8
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True

View File

@@ -0,0 +1,405 @@
import os
import gc
import time
import base64
from contextlib import asynccontextmanager
from typing import List, Literal, Union, Tuple, Optional
import torch
import uvicorn
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from loguru import logger
from pydantic import BaseModel, Field
from sse_starlette.sse import EventSourceResponse
from transformers import AutoModelForCausalLM, LlamaTokenizer, PreTrainedModel, PreTrainedTokenizer, \
TextIteratorStreamer
from PIL import Image
from io import BytesIO
# Model/tokenizer locations are overridable via environment variables.
MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/cogvlm-chat-hf')
TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", 'lmsys/vicuna-7b-v1.5')
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
# Decide whether to load the model 4-bit quantized: forced on via the
# QUANT_ENABLED env var, otherwise enabled automatically when the visible GPU
# has less than 40 GB of memory.
if os.environ.get('QUANT_ENABLED'):
    QUANT_ENABLED = True
elif torch.cuda.is_available():
    with torch.cuda.device(DEVICE):
        __, total_bytes = torch.cuda.mem_get_info()
        total_gb = total_bytes / (1 << 30)
        QUANT_ENABLED = total_gb < 40
else:
    # Fix: the original called torch.cuda.mem_get_info() even on CPU-only
    # hosts, which raises at import time; quantization is moot without a GPU.
    QUANT_ENABLED = False
@asynccontextmanager
async def lifespan(app: FastAPI):
    """
    FastAPI lifespan hook: run the application, then release cached CUDA
    memory on shutdown so GPU allocations are returned promptly.
    """
    yield
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
# Wire in the lifespan handler so GPU memory is released on shutdown.
app = FastAPI(lifespan=lifespan)
# Fully permissive CORS: any origin, method, or header may call this API.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class ModelCard(BaseModel):
    """
    Metadata describing one served model (id, owner, creation time), shaped
    like an entry of OpenAI's /v1/models response.
    """
    id: str
    object: str = "model"
    # Creation timestamp defaults to "now" (seconds since epoch).
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None
class ModelList(BaseModel):
    """Response envelope for the /v1/models endpoint."""
    object: str = "list"
    data: List[ModelCard] = []
class ImageUrl(BaseModel):
    """Image reference; only base64 `data:image/...` URIs are decoded downstream."""
    url: str
class TextContent(BaseModel):
    """A plain-text part of a multi-part chat message."""
    type: Literal["text"]
    text: str
class ImageUrlContent(BaseModel):
    """An image part of a multi-part chat message."""
    type: Literal["image_url"]
    image_url: ImageUrl
# One part of a multi-part message: plain text or an image URL
# (discriminated by the `type` field).
ContentItem = Union[TextContent, ImageUrlContent]
class ChatMessageInput(BaseModel):
    """Incoming chat message; content is either a plain string or typed parts."""
    role: Literal["user", "assistant", "system"]
    content: Union[str, List[ContentItem]]
    name: Optional[str] = None
class ChatMessageResponse(BaseModel):
    """A single assistant message returned in a non-streaming completion."""
    role: Literal["assistant"]
    # Was annotated `str = None`, which misstates the type of the default;
    # make the optionality explicit (backward-compatible for callers).
    content: Optional[str] = None
    name: Optional[str] = None
class DeltaMessage(BaseModel):
    """Incremental piece of a message, used in streaming chunks."""
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None
class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions (OpenAI-compatible subset)."""
    model: str
    messages: List[ChatMessageInput]
    temperature: Optional[float] = 0.8
    top_p: Optional[float] = 0.8
    max_tokens: Optional[int] = None  # server falls back to 1024 when unset
    stream: Optional[bool] = False
    # Additional parameters
    repetition_penalty: Optional[float] = 1.0
class ChatCompletionResponseChoice(BaseModel):
    """One completed choice in a non-streaming response."""
    index: int
    message: ChatMessageResponse
class ChatCompletionResponseStreamChoice(BaseModel):
    """One incremental choice in a streaming chunk."""
    index: int
    delta: DeltaMessage
class UsageInfo(BaseModel):
    """Token accounting attached to a completion response."""
    prompt_tokens: int = 0
    total_tokens: int = 0
    completion_tokens: Optional[int] = 0
class ChatCompletionResponse(BaseModel):
    """Envelope for both full completions and streaming chunks."""
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    usage: Optional[UsageInfo] = None
@app.get("/v1/models", response_model=ModelList)
async def list_models():
    """
    List the models this server exposes.

    Returns a single-entry list so OpenAI-compatible clients can discover
    the deployed model id.
    """
    # Replace the id when deploying a different checkpoint (e.g. cogagent-chat-18b).
    return ModelList(data=[ModelCard(id="cogvlm-chat-17b")])
@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    """
    Serve one chat completion, either streamed as SSE chunks or as a single
    full response, depending on `request.stream`.
    """
    global model, tokenizer
    # Reject empty conversations and ones that already end with the model's turn.
    if not request.messages or request.messages[-1].role == "assistant":
        raise HTTPException(status_code=400, detail="Invalid request")
    gen_params = dict(
        messages=request.messages,
        temperature=request.temperature,
        top_p=request.top_p,
        max_tokens=request.max_tokens or 1024,
        echo=False,
        stream=request.stream,
    )
    if request.stream:
        # Streaming path: hand the async generator to the SSE response.
        return EventSourceResponse(predict(request.model, gen_params), media_type="text/event-stream")
    # Non-streaming path: run generation to completion.
    response = generate_cogvlm(model, tokenizer, gen_params)
    message = ChatMessageResponse(
        role="assistant",
        content=response["text"],
    )
    logger.debug(f"==== message ====\n{message}")
    # Accumulate token usage reported by the generator.
    usage = UsageInfo()
    task_usage = UsageInfo.model_validate(response["usage"])
    for usage_field, usage_amount in task_usage.model_dump().items():
        setattr(usage, usage_field, getattr(usage, usage_field) + usage_amount)
    choice = ChatCompletionResponseChoice(index=0, message=message)
    return ChatCompletionResponse(model=request.model, choices=[choice], object="chat.completion", usage=usage)
async def predict(model_id: str, params: dict):
    """
    Stream a completion as serialized chat.completion.chunk payloads.

    Yields one opening chunk announcing the assistant role, one chunk per
    newly generated text fragment, and a final empty-delta chunk marking the
    end of the stream.
    """
    global model, tokenizer

    def chunk_json(delta: DeltaMessage) -> str:
        # Package a delta into one serialized chat.completion.chunk payload.
        payload = ChatCompletionResponse(
            model=model_id,
            choices=[ChatCompletionResponseStreamChoice(index=0, delta=delta)],
            object="chat.completion.chunk",
        )
        return "{}".format(payload.model_dump_json(exclude_unset=True))

    # Opening chunk: announce the assistant role before any content.
    yield chunk_json(DeltaMessage(role="assistant"))
    sent_so_far = ""
    for update in generate_stream_cogvlm(model, tokenizer, params):
        full_text = update["text"]
        # The generator yields the full text each time; forward only the suffix.
        fragment = full_text[len(sent_so_far):]
        sent_so_far = full_text
        yield chunk_json(DeltaMessage(content=fragment, role="assistant"))
    # Closing chunk with an empty delta marks the end of the stream.
    yield chunk_json(DeltaMessage())
def generate_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    """
    Generate a complete (non-streaming) response with the CogVLM model.

    Drains the streaming generator and returns its final item, which carries
    the full generated text plus token-usage accounting.
    """
    # Fix: initialize so an empty stream returns None instead of raising
    # UnboundLocalError on `response`.
    response = None
    for response in generate_stream_cogvlm(model, tokenizer, params):
        pass
    return response
def process_history_and_images(messages: List[ChatMessageInput]) -> Tuple[
    Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]:
    """
    Split an OpenAI-style message list into CogVLM inputs.

    Args:
        messages(List[ChatMessageInput]): the conversation so far.
    return: A tuple of three elements:
        - The trailing user query as a string.
        - The prior conversation as (user_text, assistant_text) pairs.
        - Every base64 JPEG/PNG image found, decoded to RGB PIL images.
    """
    history: List[Tuple[str, str]] = []
    images: List[Image.Image] = []
    query = ''
    last_index = len(messages) - 1
    for idx, msg in enumerate(messages):
        content = msg.content
        if isinstance(content, list):
            # Multi-part content: gather the text pieces and decode any images.
            text = ' '.join(part.text for part in content if isinstance(part, TextContent))
            for part in content:
                if not isinstance(part, ImageUrlContent):
                    continue
                url = part.image_url.url
                for prefix in ("data:image/jpeg;base64,", "data:image/png;base64,"):
                    if url.startswith(prefix):
                        raw = base64.b64decode(url.split(prefix)[1])
                        images.append(Image.open(BytesIO(raw)).convert('RGB'))
                        break
        else:
            text = content
        role = msg.role
        if role == 'user':
            if idx == last_index:
                # The final user message is the query the model must answer.
                query = text
            else:
                history.append((text, ''))
        elif role == 'assistant':
            if history:
                if history[-1][1] != '':
                    assert False, f"the last query is answered. answer again. {history[-1][0]}, {history[-1][1]}, {text}"
                # Attach the reply to the most recent unanswered user turn.
                history[-1] = (history[-1][0], text)
            else:
                assert False, f"assistant reply before user"
        else:
            assert False, f"unrecognized role: {role}"
    return query, history, images
@torch.inference_mode()
def generate_stream_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict):
    """
    Generate a response with CogVLM, yielding the accumulated text as it grows.

    Each yielded dict has the full ``text`` so far plus a ``usage`` dict;
    the final yield carries the complete generation.
    """
    # Sampling parameters with OpenAI-style defaults.
    messages = params["messages"]
    temperature = float(params.get("temperature", 1.0))
    repetition_penalty = float(params.get("repetition_penalty", 1.0))
    top_p = float(params.get("top_p", 1.0))
    max_new_tokens = int(params.get("max_tokens", 256))
    query, history, image_list = process_history_and_images(messages)
    logger.debug(f"==== request ====\n{query}")
    # NOTE(review): only the most recent image is passed to the model;
    # earlier images in the conversation are ignored here.
    input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history,
                                                        images=[image_list[-1]])
    # `torch_type` is a module-level global assigned in the __main__ block —
    # this function assumes the script entry point has run. TODO confirm.
    inputs = {
        'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE),
        'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE),
        'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE),
        'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]],
    }
    if 'cross_images' in input_by_model and input_by_model['cross_images']:
        inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]]
    input_echo_len = len(inputs["input_ids"][0])
    streamer = TextIteratorStreamer(
        tokenizer=tokenizer,
        timeout=60.0,
        skip_prompt=True,
        skip_special_tokens=True
    )
    gen_kwargs = {
        "repetition_penalty": repetition_penalty,
        "max_new_tokens": max_new_tokens,
        # temperature <= 1e-5 is treated as greedy decoding.
        "do_sample": True if temperature > 1e-5 else False,
        "top_p": top_p if temperature > 1e-5 else 0,
        'streamer': streamer,
    }
    if temperature > 1e-5:
        gen_kwargs["temperature"] = temperature
    # NOTE(review): total_len is never updated below, so the reported
    # completion_tokens/total_tokens are not meaningful.
    total_len = 0
    generated_text = ""
    with torch.no_grad():
        # generate() runs to completion here; the streamer's unbounded queue
        # buffers the text, and the loop below drains it afterwards.
        model.generate(**inputs, **gen_kwargs)
        for next_text in streamer:
            generated_text += next_text
            yield {
                "text": generated_text,
                "usage": {
                    "prompt_tokens": input_echo_len,
                    "completion_tokens": total_len - input_echo_len,
                    "total_tokens": total_len,
                },
            }
    # Final yield with the complete generation.
    ret = {
        "text": generated_text,
        "usage": {
            "prompt_tokens": input_echo_len,
            "completion_tokens": total_len - input_echo_len,
            "total_tokens": total_len,
        },
    }
    yield ret
    gc.collect()
    torch.cuda.empty_cache()
if __name__ == "__main__":
    # The tokenizer (vicuna/LLaMA) is loaded separately from the model weights.
    tokenizer = LlamaTokenizer.from_pretrained(
        TOKENIZER_PATH,
        trust_remote_code=True)
    # Prefer bfloat16 on GPUs with compute capability >= 8.0; otherwise float16.
    if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8:
        torch_type = torch.bfloat16
    else:
        torch_type = torch.float16
    print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE))
    if 'cuda' in DEVICE:
        if QUANT_ENABLED:
            # 4-bit quantized load for GPUs with limited memory.
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_PATH,
                load_in_4bit=True,
                trust_remote_code=True,
                torch_dtype=torch_type,
                low_cpu_mem_usage=True
            ).eval()
        else:
            model = AutoModelForCausalLM.from_pretrained(
                MODEL_PATH,
                load_in_4bit=False,
                trust_remote_code=True,
                torch_dtype=torch_type,
                low_cpu_mem_usage=True
            ).to(DEVICE).eval()
    else:
        # CPU fallback: full-precision float32.
        model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval()
    uvicorn.run(app, host='0.0.0.0', port=8000, workers=1)

View File

@@ -0,0 +1,7 @@
## Deploy CogAgent as a server
```
python CogAgent.py
```
The CogAgent LLM will then be served at http://127.0.0.1:8000.

View File

@@ -1,13 +0,0 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn_func import MSDeformAttnFunction

View File

@@ -1,72 +0,0 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
# The compiled CUDA extension is mandatory: fail fast with build
# instructions when the op has not been compiled.
try:
    import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError as e:
    info_string = (
        "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
        "\t`cd mask2former/modeling/pixel_decoder/ops`\n"
        "\t`sh make.sh`\n"
    )
    raise ModuleNotFoundError(info_string)
class MSDeformAttnFunction(Function):
    """autograd Function wrapping the compiled multi-scale deformable attention op."""
    @staticmethod
    def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
        # Stash the im2col step so backward can pass the same value to the op.
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
        ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
        return output
    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = \
            MSDA.ms_deform_attn_backward(
                value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
        # No gradients for spatial shapes, level start indices, or im2col_step.
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
    # for debug and test only,
    # need to use cuda version instead
    """
    Pure-PyTorch reference for multi-scale deformable attention.

    value: (bs, sum(h*w), heads, head_dim) flattened multi-level features.
    value_spatial_shapes: list of (h, w) per level.
    sampling_locations: (bs, queries, heads, levels, points, 2) in [0, 1].
    attention_weights: (bs, queries, heads, levels, points).
    Returns (bs, queries, heads*head_dim).
    """
    bs, _, n_heads, head_dim = value.shape
    _, n_queries, _, n_levels, n_points, _ = sampling_locations.shape
    # Split the flattened values back into one chunk per feature level.
    per_level_values = value.split([h * w for h, w in value_spatial_shapes], dim=1)
    # grid_sample expects coordinates in [-1, 1]; locations arrive in [0, 1].
    grids = 2 * sampling_locations - 1
    sampled = []
    for level, (h, w) in enumerate(value_spatial_shapes):
        # (bs, h*w, heads, dim) -> (bs*heads, dim, h, w)
        level_value = per_level_values[level].flatten(2).transpose(1, 2).reshape(bs * n_heads, head_dim, h, w)
        # (bs, queries, heads, points, 2) -> (bs*heads, queries, points, 2)
        level_grid = grids[:, :, :, level].transpose(1, 2).flatten(0, 1)
        # (bs*heads, dim, queries, points)
        sampled.append(F.grid_sample(level_value, level_grid,
                                     mode='bilinear', padding_mode='zeros', align_corners=False))
    # (bs, queries, heads, levels, points) -> (bs*heads, 1, queries, levels*points)
    weights = attention_weights.transpose(1, 2).reshape(bs * n_heads, 1, n_queries, n_levels * n_points)
    # Weighted sum over every (level, point) sample, then merge the head dim.
    out = (torch.stack(sampled, dim=-2).flatten(-2) * weights).sum(-1).view(bs, n_heads * head_dim, n_queries)
    return out.transpose(1, 2).contiguous()

Some files were not shown because too many files have changed in this diff Show More