diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 0000000..cf0e7fc --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,19 @@ +{ + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "Python Debugger: Current File with Arguments", + "type": "debugpy", + "request": "launch", + "program": "${file}", + "console": "integratedTerminal", + "args": [ + "--path_to_vm", "/Users/lxc/Virtual Machines.localized/DesktopEnv-Ubuntu 64-bit Arm.vmwarevm/DesktopEnv-Ubuntu 64-bit Arm.vmx" + // "--example_time_limit", "60" + ] + } + ] +} \ No newline at end of file diff --git a/README.md b/README.md index 8eb867f..6262044 100644 --- a/README.md +++ b/README.md @@ -21,10 +21,12 @@ Please refer to [guidance](https://docs.google.com/document/d/1KBdeZwmZs2Vi_Wsnngb3Wf1-RiwMMpXTftwMqP2Ztak/edit#heading=h.uh0x0tkl7fuw) 2. Install the environment package, download the examples and the virtual machine image. +For x86_64 Linux or Windows, you can install the environment package, download the examples and the virtual machine image, then start the VM and take an initial `init_state` snapshot by running the following commands: ```bash pip install desktop-env gdown xxxx -gdown xxxx +vmrun -T ws start "Ubuntu/Ubuntu.vmx" nogui +vmrun -T ws snapshot "Ubuntu/Ubuntu.vmx" "init_state" ``` ## Quick Start diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 60a4bb4..4159cde 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -263,16 +263,19 @@ class PythonController: """ Ends recording the screen. """ - response = requests.post(self.http_server + "/end_recording") - if response.status_code == 200: - logger.info("Recording stopped successfully") - with open(dest, 'wb') as f: - for chunk in response.iter_content(chunk_size=8192): - if chunk: - f.write(chunk) - else: - logger.error("Failed to stop recording. Status code: %d", response.status_code) - return None + try: + response = requests.post(self.http_server + "/end_recording") + if response.status_code == 200: + logger.info("Recording stopped successfully") + with open(dest, 'wb') as f: + for chunk in response.iter_content(chunk_size=8192): + if chunk: + f.write(chunk) + else: + logger.error("Failed to stop recording. 
Status code: %d", response.status_code) + return None + except Exception as e: + logger.error("An error occurred while trying to download the recording: %s", e) # Additional info def get_vm_platform(self): diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 2454ce6..fee3f37 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -53,8 +53,8 @@ class DesktopEnv(gym.Env): def __init__( self, path_to_vm: str, + snapshot_name: str = "init_state", action_space: str = "computer_13", - task_config: Dict[str, Any] = None, tmp_dir: str = "tmp", cache_dir: str = "cache", screen_size: Tuple[int] = (1920, 1080), @@ -64,15 +64,6 @@ class DesktopEnv(gym.Env): Args: path_to_vm (str): path to .vmx file action_space (str): "computer_13" | "pyautogui" - - task_config (Dict[str, Any]): manages task configs integratedly, - including - * base snapshot - * task id (uuid) - * instruction - * setup config - * evaluator config - tmp_dir (str): temporary directory to store trajectory stuffs like the extracted screenshots cache_dir (str): cache directory to cache task-related stuffs like @@ -81,23 +72,20 @@ class DesktopEnv(gym.Env): # Initialize environment variables self.path_to_vm = os.path.abspath(os.path.expandvars(os.path.expanduser(path_to_vm))) + self.snapshot_name = snapshot_name self.tmp_dir_base: str = tmp_dir self.cache_dir_base: str = cache_dir - self.vm_screen_size = screen_size + self.vm_screen_size = screen_size # todo: add the logic to get the screen size from the VM self.headless = headless os.makedirs(self.tmp_dir_base, exist_ok=True) - # task-aware stuffs - # todo: handling the logic of snapshot directory - self._set_task_info(task_config) - # Initialize emulator and controller logger.info("Initializing...") self._start_emulator() self.vm_ip = self._get_vm_ip() self.controller = PythonController(vm_ip=self.vm_ip) - self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir) + self.setup_controller = SetupController(vm_ip=self.vm_ip, cache_dir=self.cache_dir_base) # Meta info of the VM, move to the reset() function self.vm_platform: str = "" # self.controller.get_vm_platform() @@ -147,7 +135,7 @@ class DesktopEnv(gym.Env): raise Exception("Failed to get VM IP address!") def _save_state(self): - _execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path]) + _execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_name]) def _get_screenshot(self): # random_uuid = str(uuid.uuid4()) @@ -167,7 +155,6 @@ class DesktopEnv(gym.Env): return screenshot_image_path def _set_task_info(self, task_config: Dict[str, Any]): - self.snapshot_path = task_config["snapshot"] self.task_id: str = task_config["id"] self.cache_dir: str = os.path.join(self.cache_dir_base, self.task_id) os.makedirs(self.cache_dir, exist_ok=True) @@ -187,7 +174,7 @@ class DesktopEnv(gym.Env): if isinstance(self.evaluator["func"], list) \ else getattr(metrics, self.evaluator["func"]) self.metric_conj: str = self.evaluator.get("conj", "and") # take conjunction of multiple metrics - if "result" in self.evaluator: + if "result" in self.evaluator and len(self.evaluator["result"])>0: self.result_getter: Getter = [getattr(getters, "get_{:}".format(res["type"])) for res in self.evaluator["result"]] \ if isinstance(self.evaluator["result"], list) \ @@ -197,7 +184,7 @@ class DesktopEnv(gym.Env): if isinstance(self.metric, list) \ else None - if "expected" in self.evaluator: + if "expected" in self.evaluator and 
len(self.evaluator["expected"])>0: self.expected_getter: Getter = [getattr(getters, "get_{:}".format(exp["type"])) if exp else None for exp in self.evaluator["expected"]] \ if isinstance(self.evaluator["expected"], list) \ @@ -239,8 +226,8 @@ class DesktopEnv(gym.Env): ) os.makedirs(os.path.join(self.tmp_dir, "screenshots")) - logger.info("Reverting to snapshot to {}...".format(self.snapshot_path)) - _execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path]) + logger.info("Reverting to snapshot to {}...".format(self.snapshot_name)) + _execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_name]) time.sleep(5) print(self.vm_screen_size) diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index c3f7f85..61bb025 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -114,7 +114,8 @@ from .slides import ( ) from .table import ( compare_table, - compare_csv + compare_csv, + compare_conference_city_in_order ) from .thunderbird import ( check_thunderbird_prefs, @@ -148,7 +149,6 @@ from .vscode import ( check_html_background_image, compare_zip_files ) -from .calc import compare_conference_city_in_order from .others import compare_epub, check_mp3_meta def infeasible(): diff --git a/desktop_env/evaluators/metrics/basic_os.py b/desktop_env/evaluators/metrics/basic_os.py index aac1f7c..05e51ff 100644 --- a/desktop_env/evaluators/metrics/basic_os.py +++ b/desktop_env/evaluators/metrics/basic_os.py @@ -1,6 +1,3 @@ -import subprocess - - def check_gnome_favorite_apps(apps_str: str, rule): # parse the string like "['thunderbird.desktop', 'vim.desktop', 'google-chrome.desktop']" # to a list of strings @@ -57,6 +54,7 @@ def check_moved_jpgs(directory_list, rule): else: return 0 + def is_in_vm_clickboard(config, terminal_output): print("terminal_output: ") print(terminal_output) @@ -67,4 +65,4 @@ def is_in_vm_clickboard(config, terminal_output): if not isinstance(expected_results, list): return 1 if expected_results in terminal_output else 0 else: - return 1 if all(result in terminal_output for result in expected_results) else 0 \ No newline at end of file + return 1 if all(result in terminal_output for result in expected_results) else 0 diff --git a/desktop_env/evaluators/metrics/calc.py b/desktop_env/evaluators/metrics/calc.py deleted file mode 100644 index 0ff0744..0000000 --- a/desktop_env/evaluators/metrics/calc.py +++ /dev/null @@ -1,41 +0,0 @@ -import logging -from typing import List - -import openpyxl - -logger = logging.getLogger("desktopenv.metrics.calc") - - -def compare_conference_city_in_order(actual_city_list_path, expected_city): - expected_city_list = expected_city["expected"] - wb = openpyxl.load_workbook(actual_city_list_path) - sheet = wb.active - actual_city_list = [] - for row in sheet["C2:C22"]: - for cell in row: - actual_city_list.append(cell.value) - # expected_city is the city that we want to compare with the actual city list - # must in order index - # debug - try: - for i in range(len(actual_city_list)): - if isinstance(expected_city_list[i], str): - if expected_city_list[i] not in actual_city_list[i]: - logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") - print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") - return 0. 
- - - elif isinstance(expected_city_list[i], List): - if not any(possible_str in actual_city_list[i] for possible_str in expected_city_list[i]): - logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") - print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") - return 0. - - else: - raise TypeError("Expected city should be a string or a list of strings") - - except: - return 0. - - return 1. diff --git a/desktop_env/evaluators/metrics/demo.py b/desktop_env/evaluators/metrics/demo.py deleted file mode 100644 index 4c5b55d..0000000 --- a/desktop_env/evaluators/metrics/demo.py +++ /dev/null @@ -1,28 +0,0 @@ -import fitz # PyMuPDF - -def extract_answers_from_pdf(pdf_file): - # 打开PDF文件 - doc = fitz.open(pdf_file) - answers = [] - - # 遍历每一页 - for page in doc: - # 提取当前页的文本 - text = page.get_text() - # 分割文本为行 - lines = text.split('\n') - for line in lines: - if line.strip(): # 排除空白行 - # 分割等号,提取答案 - parts = line.split('=') - if len(parts) > 1: - answer = parts[-1].strip() # 取等号后的部分为答案 - answers.append(answer) - - return answers - -# 假设你的文件名是'math_problems.pdf' -pdf_file = '/Users/lxc/Desktop/calculus.pdf' -answers = extract_answers_from_pdf(pdf_file) -for i, answer in enumerate(answers, 1): - print(f"题目{i}的答案是: {answer}") diff --git a/desktop_env/evaluators/metrics/libreoffice.py b/desktop_env/evaluators/metrics/libreoffice.py index 441d932..1870c34 100644 --- a/desktop_env/evaluators/metrics/libreoffice.py +++ b/desktop_env/evaluators/metrics/libreoffice.py @@ -26,13 +26,3 @@ def check_libre_locale(config_file: str, rules: Dict[str, List[str]]) -> float: for ptn in rules["locale_set"] ) ) - - -if __name__ == "__main__": - path1 = "../../任务数据/LibreOffice Calc/registrymodifications.ru.xcu" - print(check_libre_locale(path1, {"locale_set": ["ru-*", "de-*", "fr-*" - , "pt-*", "es-*", "it-*" - ] - } - ) - ) diff --git a/desktop_env/evaluators/metrics/others.py b/desktop_env/evaluators/metrics/others.py index 706bcf3..ebb5994 100644 --- a/desktop_env/evaluators/metrics/others.py +++ b/desktop_env/evaluators/metrics/others.py @@ -1,20 +1,20 @@ -import zipfile -import os.path +import logging import os +import os.path +import zipfile +from typing import List, Dict +from typing import Union, TypeVar import lxml.html from lxml.html import HtmlElement -from typing import List, Dict -from typing import Union, TypeVar from mutagen.easyid3 import EasyID3 from .general import diff_text_file from .utils import _match_value_to_rule -import logging - logger = logging.getLogger("desktopenv.metric.others") + def process_epub(filename: str) -> List[str]: file_list: List[str] = [] @@ -23,7 +23,7 @@ def process_epub(filename: str) -> List[str]: try: with zipfile.ZipFile(filename, "r") as z_f: - with z_f.open("toc.ncx") as in_f\ + with z_f.open("toc.ncx") as in_f \ , open(os.path.join(base_dir, "toc.ncx"), "w") as out_f: contents: str = in_f.read().decode() contents = contents.splitlines() @@ -31,7 +31,7 @@ def process_epub(filename: str) -> List[str]: if "navPoint" not in l: out_f.write(l + "\n") file_list.append(os.path.join(base_dir, "toc.ncx")) - with z_f.open("content.opf") as in_f\ + with z_f.open("content.opf") as in_f \ , open(os.path.join(base_dir, "content.opf"), "w") as out_f: contents: str = in_f.read().decode() contents = contents.splitlines() @@ -41,14 +41,14 @@ def process_epub(filename: str) -> List[str]: file_list.append(os.path.join(base_dir, "content.opf")) for f_n in z_f.namelist(): if f_n.endswith(".html"): - with z_f.open(f_n) as in_f\ + 
with z_f.open(f_n) as in_f \ , open(os.path.join(base_dir, f_n), "w") as out_f: html: HtmlElement = lxml.html.fromstring( - ''.join( filter( lambda ch: ch!="\n" and ch!="\r" - , in_f.read().decode() - ) - ).encode() - ) + ''.join(filter(lambda ch: ch != "\n" and ch != "\r" + , in_f.read().decode() + ) + ).encode() + ) out_f.write(lxml.html.tostring(html, pretty_print=True, encoding="unicode")) file_list.append(os.path.join(base_dir, f_n)) logger.debug("%s: %s", filename, file_list) @@ -56,6 +56,7 @@ def process_epub(filename: str) -> List[str]: except zipfile.BadZipFile: return [] + def compare_epub(result: str, expected: str) -> float: if result is None: return 0. @@ -69,8 +70,10 @@ def compare_epub(result: str, expected: str) -> float: metric *= current_metric return metric + V = TypeVar("Value") + def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bool: # checks using _match_value_to_rule if result is None: @@ -85,44 +88,3 @@ def check_mp3_meta(result: str, meta: Dict[str, Dict[str, Union[str, V]]]) -> bo logger.debug("%s.%s: %s", result, k, value) metric = metric and _match_value_to_rule(value, r) return float(metric) - -if __name__ == "__main__": - import datetime - import sys - - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) - - datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - - file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str))) - debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str))) - stdout_handler = logging.StreamHandler(sys.stdout) - sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str))) - - file_handler.setLevel(logging.INFO) - debug_handler.setLevel(logging.DEBUG) - stdout_handler.setLevel(logging.INFO) - sdebug_handler.setLevel(logging.DEBUG) - - formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") - file_handler.setFormatter(formatter) - debug_handler.setFormatter(formatter) - stdout_handler.setFormatter(formatter) - sdebug_handler.setFormatter(formatter) - - logger.addHandler(file_handler) - logger.addHandler(debug_handler) - logger.addHandler(stdout_handler) - logger.addHandler(sdebug_handler) - - metric = check_mp3_meta( "snapshots/test/cache/3f05f3b9-29ba-4b6b-95aa-2204697ffc06/Cheng Xiang - Missing You - gt.mp3" - , { "title": { "method": "eq" - , "ref": "Missing You" - } - , "artist": { "method": "eq" - , "ref": "Cheng Xiang" - } - } - ) - print(metric) diff --git a/desktop_env/evaluators/metrics/pdf.py b/desktop_env/evaluators/metrics/pdf.py index d6644d1..ef5b384 100644 --- a/desktop_env/evaluators/metrics/pdf.py +++ b/desktop_env/evaluators/metrics/pdf.py @@ -2,6 +2,7 @@ import operator from typing import Any from typing import Dict +import fitz # PyMuPDF from pypdf import PdfReader @@ -11,3 +12,20 @@ def check_pdf_pages(pdf_file: str, rules: Dict[str, Any]) -> float: reader = PdfReader(pdf_file) nb_pages: int = len(reader.pages) return float(getattr(operator, rules["relation"])(nb_pages, rules["ref_value"])) + + +def extract_answers_from_pdf(pdf_file): + doc = fitz.open(pdf_file) + answers = [] + + for page in doc: + text = page.get_text() + lines = text.split('\n') + for line in lines: + if line.strip(): + parts = line.split('=') + if len(parts) > 1: + answer = parts[-1].strip() + answers.append(answer) + + return answers diff --git 
a/desktop_env/evaluators/metrics/slides.py b/desktop_env/evaluators/metrics/slides.py index 76217e1..1c2e04f 100644 --- a/desktop_env/evaluators/metrics/slides.py +++ b/desktop_env/evaluators/metrics/slides.py @@ -165,23 +165,24 @@ def compare_pptx_files(file1_path, file2_path, **options): # compare the content of each slide for slide1, slide2 in zip(prs1.slides, prs2.slides): slide_idx += 1 + def get_slide_background_color(slide): background = slide.background if background.fill.background(): return background.fill.fore_color.rgb else: return None - + if get_slide_background_color(slide1) != get_slide_background_color(slide2) and examine_background_color: return 0 - + def get_slide_notes(slide): notes_slide = slide.notes_slide if notes_slide: return notes_slide.notes_text_frame.text else: return None - + if get_slide_notes(slide1).strip() != get_slide_notes(slide2).strip() and examine_note: return 0 # check if the shapes are the same @@ -192,14 +193,14 @@ def compare_pptx_files(file1_path, file2_path, **options): return 0 elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: return 0 - + if examine_table_bottom_position: if slide_idx == 3 and shape1.shape_type == 19 and shape2.shape_type == 19: if shape1.top <= shape2.top or shape1.top < 3600000: return 0 elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: return 0 - + if examine_right_position: if slide_idx == 2 and not hasattr(shape1, "text") and not hasattr(shape2, "text"): if shape1.left <= shape2.left or shape1.left < 4320000: @@ -207,28 +208,31 @@ def compare_pptx_files(file1_path, file2_path, **options): if examine_top_position: if slide_idx == 2 and shape1.shape_type == 13 and shape2.shape_type == 13: - if shape1.top >= shape2.top or shape1.top > 1980000: - return 0 + if shape1.top >= shape2.top or shape1.top > 1980000: + return 0 elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: return 0 - + if examine_shape_for_shift_size: if shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: - if not (hasattr(shape1, "text") and hasattr(shape2, "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."): + if not (hasattr(shape1, "text") and hasattr(shape2, + "text") and shape1.text == shape2.text and shape1.text == "Elaborate on what you want to discuss."): return 0 - - if (shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape: + + if ( + shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height) and examine_shape: return 0 - + if examine_image_size: if shape1.shape_type == 13 and shape2.shape_type == 13: if shape1.width != shape2.width or shape1.height != shape2.height: return 0 elif shape1.left != shape2.left or shape1.top != shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: return 0 - + if examine_modify_height: - if not hasattr(shape1, "text") and not hasattr(shape2, "text") or shape1.shape_type == 5 and shape2.shape_type == 5: + if not hasattr(shape1, "text") and not hasattr(shape2, + "text") or shape1.shape_type == 5 and shape2.shape_type == 5: if shape1.height != shape2.height: return 0 elif shape1.left != shape2.left or shape1.top 
!= shape2.top or shape1.width != shape2.width or shape1.height != shape2.height: @@ -236,13 +240,13 @@ def compare_pptx_files(file1_path, file2_path, **options): if hasattr(shape1, "text") and hasattr(shape2, "text"): if shape1.text.strip() != shape2.text.strip() and examine_text: - return 0 - - # check if the paragraphs are the same + return 0 + + # check if the paragraphs are the same for para1, para2 in zip(shape1.text_frame.paragraphs, shape2.text_frame.paragraphs): if para1.alignment != para2.alignment and examine_alignment: return 0 - + # check if the runs are the same if para1.text != para2.text and examine_text: return 0 @@ -253,7 +257,7 @@ def compare_pptx_files(file1_path, file2_path, **options): for run1, run2 in zip(para1.runs, para2.runs): # check if the font properties are the same - if run1.font.name != run2.font.name and examine_font_name: + if run1.font.name != run2.font.name and examine_font_name: return 0 if run1.font.size != run2.font.size and examine_font_size: @@ -305,10 +309,9 @@ def compare_pptx_files(file1_path, file2_path, **options): return bullets - if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets(run2.part.blob.decode('utf-8')): + if examine_bullets and _extract_bullets(run1.part.blob.decode('utf-8')) != _extract_bullets( + run2.part.blob.decode('utf-8')): return 0 - - # fixme: Actually there are more properties to be compared, we can add them later via parsing the xml data @@ -524,15 +527,3 @@ def check_auto_saving_time(pptx_file, rules): logger.error(f"Error parsing XML: {e}") except FileNotFoundError: logger.error(f"File not found: {pptx_file}") - - -if __name__ == '__main__': - # print(compare_pptx_files( - # r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx", - # r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\550ce7e7-747b-495f-b122-acdc4d0b8e54\New_Club_Spring_2018_Training_Gold.pptx")) - # print(evaluate_presentation_fill_to_rgb_distance(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\3b27600c-3668-4abd-8f84-7bcdebbccbdb\lec17-gui-events.pptx", {"rgb": (0, 0, 255)})) - # print(check_auto_saving_time(r"C:\Users\tianbaox\Desktop\DesktopEnv\cache\2cd43775-7085-45d8-89fa-9e35c0a915cf\registrymodifications.xcu", {"minutes": 3})) - print(compare_pptx_files( - r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6_Gold.pptx", - r"D:\NJU\HKUNLP\Desktop-Env\DesktopEnv\cache\08aced46-45a2-48d7-993b-ed3fb5b32302\22_6.pptx", - examine_shape=False)) diff --git a/desktop_env/evaluators/metrics/table.py b/desktop_env/evaluators/metrics/table.py index dadd138..9e888c7 100644 --- a/desktop_env/evaluators/metrics/table.py +++ b/desktop_env/evaluators/metrics/table.py @@ -11,15 +11,15 @@ import openpyxl import pandas as pd from openpyxl import Workbook from openpyxl.cell.cell import Cell -from openpyxl.worksheet.cell_range import MultiCellRange from openpyxl.utils import get_column_letter +from openpyxl.worksheet.cell_range import MultiCellRange from openpyxl.worksheet.datavalidation import DataValidation from openpyxl.worksheet.worksheet import Worksheet +from rapidfuzz import fuzz from desktop_env.evaluators.metrics.utils import _match_value_to_rule, _read_cell_style, read_cell_value from desktop_env.evaluators.metrics.utils import load_charts, load_sparklines, load_rows_or_cols, load_xlsx_styles \ , load_filters, load_pivot_tables -from rapidfuzz import fuzz # from openpyxl.utils import coordinate_to_tuple @@ -165,7 +165,7 @@ def 
compare_table(result: str, expected: str = None, **options) -> float: logger.debug("Sheet1: \n%s", str(sheet1)) logger.debug("Sheet2: \n%s", str(sheet2)) try: - logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1==sheet2)) + logger.debug("Sheet1 =v= Sheet2: \n%s", str(sheet1 == sheet2)) except: logger.debug("Sheet1 =/v= Sheet2") logger.debug("Assertion: %s =v= %s - %s", r["sheet_idx0"], r["sheet_idx1"], metric) @@ -231,14 +231,14 @@ def compare_table(result: str, expected: str = None, **options) -> float: value1 = value1.lower() value2 = value2.lower() - if rl["type"]=="includes": + if rl["type"] == "includes": metric: bool = value2 in value1 - elif rl["type"]=="included_by": + elif rl["type"] == "included_by": metric: bool = value1 in value2 - elif rl["type"]=="fuzzy_match": + elif rl["type"] == "fuzzy_match": metric: bool = fuzz.ratio(value1, value2) >= rl.get("threshold", 85.) - elif rl["type"]=="exact_match": - metric: bool = value1==value2 + elif rl["type"] == "exact_match": + metric: bool = value1 == value2 total_metric = total_metric and metric metric: bool = total_metric @@ -409,7 +409,7 @@ def compare_table(result: str, expected: str = None, **options) -> float: filters1: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r) filters2: Dict[str, Any] = load_filters(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r) - metric: bool = filters1==filters2 + metric: bool = filters1 == filters2 logger.debug("Assertion: %s[filter] == %s[filter] - %s", r["sheet_idx0"], r["sheet_idx1"], metric) # }}} Compare Filters # @@ -421,7 +421,7 @@ def compare_table(result: str, expected: str = None, **options) -> float: pivots1: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx0"], xlworkbookr, xlworkbooke), **r) pivots2: Dict[str, Any] = load_pivot_tables(*parse_idx(r["sheet_idx1"], xlworkbookr, xlworkbooke), **r) - metric: bool = pivots1==pivots2 + metric: bool = pivots1 == pivots2 logger.debug("Assertion: %s[pivot]==%s[pivot] - %s", r["sheet_idx0"], r["sheet_idx1"], metric) # }}} Compare Pivot Tables # @@ -482,81 +482,36 @@ def compare_csv(result: str, expected: str, **options) -> float: return float(metric) -if __name__ == '__main__': - import datetime - import sys +def compare_conference_city_in_order(actual_city_list_path, expected_city): + expected_city_list = expected_city["expected"] + wb = openpyxl.load_workbook(actual_city_list_path) + sheet = wb.active + actual_city_list = [] + for row in sheet["C2:C22"]: + for cell in row: + actual_city_list.append(cell.value) + # expected_city is the city that we want to compare with the actual city list + # must in order index + # debug + try: + for i in range(len(actual_city_list)): + if isinstance(expected_city_list[i], str): + if expected_city_list[i] not in actual_city_list[i]: + logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") + print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") + return 0. - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) - datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + elif isinstance(expected_city_list[i], List): + if not any(possible_str in actual_city_list[i] for possible_str in expected_city_list[i]): + logger.debug(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") + print(f"Expected city {expected_city_list[i]}; Actual city {actual_city_list[i]}") + return 0. 
- file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str))) - debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str))) - stdout_handler = logging.StreamHandler(sys.stdout) - sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str))) + else: + raise TypeError("Expected city should be a string or a list of strings") - file_handler.setLevel(logging.INFO) - debug_handler.setLevel(logging.DEBUG) - stdout_handler.setLevel(logging.INFO) - sdebug_handler.setLevel(logging.DEBUG) + except: + return 0. - formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") - file_handler.setFormatter(formatter) - debug_handler.setFormatter(formatter) - stdout_handler.setFormatter(formatter) - sdebug_handler.setFormatter(formatter) - - stdout_handler.addFilter(logging.Filter("desktopenv")) - sdebug_handler.addFilter(logging.Filter("desktopenv")) - - logger.addHandler(file_handler) - logger.addHandler(debug_handler) - logger.addHandler(stdout_handler) - logger.addHandler(sdebug_handler) - - path1 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday.xlsx" - path2 = "snapshots/test/cache/4e6fcf72-daf3-439f-a232-c434ce416af6/Employee_Age_By_Birthday_gold.xlsx" - rules = [ { "type": "sheet_data" - , "sheet_idx0": 0 - , "sheet_idx1": "EI0" - } - ] - print(compare_table(path1, path2 - , rules=rules - ) - ) - print(compare_table(path2, path2 - , rules=rules - ) - ) - - # Row Properties - # path1 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA.xlsx" - # path2 = "../../任务数据/LibreOffice Calc/Date_Budget_Variance_HideNA_gold.xlsx" - # workbook: Workbook = openpyxl.load_workbook(filename=path1) - # worksheet: Worksheet = workbook.active - # for r_no, dms in worksheet.column_dimensions.items(): - # print(r_no, type(r_no), type(dms), dms.hidden) - - # Conditional Formats - # import formulas - # path1 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days.xlsx" - # path2 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold.xlsx" - # path3 = "../../任务数据/LibreOffice Calc/Calendar_Highlight_Weekend_Days_gold_test.xlsx" - # workbook: Workbook = openpyxl.load_workbook(filename=path2) - # worksheet: Worksheet = workbook.active - # print(worksheet.conditional_formatting) - # for itm in worksheet.conditional_formatting: - # print(itm.cells) - # for r in itm.rules: - # print( r.type, r.formula, r.dxf.font.color.rgb - # , r.dxf.fill.fgColor.rgb, r.dxf.fill.bgColor.rgb - # ) - # condition = formulas.Parser().ast("=" + r.formula[0])[1].compile() - ##print(r.type, r.operator, r.dxfId, r.dxf) - # for r in itm.cells: - # for c in r.cells: - # value = worksheet.cell(row=c[0], column=c[1]).value - # print(value, condition(str(value))) + return 1. 
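Note on the relocated metric: with `compare_conference_city_in_order` now living in `table.py` next to the other spreadsheet metrics (and re-exported from `metrics/__init__.py` instead of the removed `calc` module), a minimal sketch of how the metric is driven may help reviewers sanity-check the move. The workbook name and the city values below are hypothetical; the function itself reads column C, rows 2-22, of the active sheet and compares each cell, in order, against an expected entry that is either a substring or a list of acceptable alternatives.

```python
# Hypothetical workbook and city names, just to exercise the relocated metric.
import openpyxl

from desktop_env.evaluators.metrics.table import compare_conference_city_in_order

# Column C, rows 2-22, holds the "actual" conference cities (21 rows in total).
wb = openpyxl.Workbook()
ws = wb.active
actual_cities = ["Vienna, Austria", "Baltimore, USA"] + ["Online"] * 19
for row, city in enumerate(actual_cities, start=2):
    ws.cell(row=row, column=3, value=city)
wb.save("conference_cities.xlsx")

# Each expected entry is a substring or a list of acceptable alternatives,
# matched index-by-index against the actual cities.
expected = {"expected": ["Vienna", ["Baltimore", "Online"]] + ["Online"] * 19}

print(compare_conference_city_in_order("conference_cities.xlsx", expected))  # -> 1.0
```

An expected list shorter than the 21 rows trips the bare `except` and scores 0.0, so the expected list has to cover every row being checked.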
diff --git a/desktop_env/evaluators/metrics/thunderbird.py b/desktop_env/evaluators/metrics/thunderbird.py index 90b8892..5b7aaa0 100644 --- a/desktop_env/evaluators/metrics/thunderbird.py +++ b/desktop_env/evaluators/metrics/thunderbird.py @@ -1,17 +1,19 @@ +import json +import logging +import re from typing import List, Pattern, Dict, Match from typing import Union, Any, TypeVar, Callable -import re -import json from .utils import _match_record from .utils import _match_value_to_rule as _match_pref -import logging logger = logging.getLogger("desktopenv.metric.thunderbird") V = TypeVar("Value") _pref_pattern: Pattern[str] = re.compile(r'^user_pref\("(?P<key>(?:[^"]|\\")+)\", (?P<val>.+)\);$'); + + def check_thunderbird_prefs(result: str, rule: Dict[str, Dict[str, Dict[str, Any]]]): """ Args: @@ -51,10 +53,10 @@ continue key: str = match_.group("key") - #value: str = match_.group("val") - #if value in {"true", "false"}: - #value = value.title() - #value: V = eval(value) + # value: str = match_.group("val") + # if value in {"true", "false"}: + # value = value.title() + # value: V = eval(value) value = json.loads(match_.group("val")) if key in expect_rules: logger.debug("K: %s, V: %s", key, repr(value)) @@ -64,9 +66,13 @@ return float(all(expect_metrics.values()) and unexpect_metric) + _value_processor: Callable[[str], str] = lambda val: val.replace("\\\"", "\"").replace("\\\\", "\\") -#_condition_pattern: Pattern[str] = re.compile(r'(?PAND|OR) \((?P[\w ]+),(?P[\w ' + '\'' + r']+),(?:"(?P(?:[^"]|\")+)"|(?P[^)]+))\)') -_condition_pattern: Pattern[str] = re.compile(r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b') +# _condition_pattern: Pattern[str] = re.compile(r'(?PAND|OR) \((?P[\w ]+),(?P[\w ' + '\'' + r']+),(?:"(?P(?:[^"]|\")+)"|(?P[^)]+))\)') +_condition_pattern: Pattern[str] = re.compile( + r'\b(?:AND|OR) \((?:[\w ]+),(?:[\w ' + '\'' + r']+),(?:"(?:(?:[^"]|\")+)"|(?:[^)]+))\)|\bALL\b') + + def check_thunderbird_filter(result: str, rules: Dict[str, List[Dict[str, str]]]) -> float: """ Args: @@ -112,8 +118,8 @@ condition_str: str = _value_processor(l[11:-2]) logger.debug("FILTER CONDITION: %s", condition_str) - conditions: List[str] =\ - _condition_pattern.findall(condition_str) + conditions: List[str] = \ + _condition_pattern.findall(condition_str) logger.debug("FILTER CONDITIONS: %s", repr(conditions)) filter_["condition"] = conditions @@ -138,6 +144,7 @@ remove_deleted (bool): ignore deleted messages which has status code 0008 or 0009. default: True remove_duplicate (bool): remove duplicate messages. 
default: True """ + def normalize_msg(msg, options): ignore_status = options.get('ignore_status', False) ignore_keys = options.get('ignore_keys', False) @@ -167,66 +174,3 @@ def check_thunderbird_folder(result: Union[str, List[str]], reference: Union[str mail2 = read_thunderbird_folder_file(gold) if mail1 != mail2: return .0 return 1.0 - - -if __name__ == "__main__": - #import lxml.etree - #from lxml.cssselect import CSSSelector - #from lxml.etree import _Element - - #xml = "../../任务数据/Thunderbird/vertical-card-view.xml" - #xml = "../../任务数据/Thunderbird/vertical-table-view.xml" - #at: _Element = lxml.etree.parse(xml) - - #elements: List[_Element] = CSSSelector('application[name=Thunderbird] page-tab-list')(at) # page tab tags - #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]')(at) # email tag page - #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section:nth-child(3)')(at) # email tag page - #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>section[attr|class~="tree-table-header"]>table-row>column-header[name=Subject]>push-button', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at) # table view, column header - #elements: List[_Element] = CSSSelector('application[name=Thunderbird] panel>scroll-pane>internal-frame>panel[name$="anonym-x2024@outlook.com"]>section[attr|id=threadPane]>section[attr|id="threadTree"]>table[attr|class="tree-table"]>tree>tree-item>section[name="Subject"]>section>section', namespaces={"attr": "uri:deskat:attributes.at-spi.gnome.org"})(at) # table view, column header - #print(len(elements)) - #for elm in elements: - #print(lxml.etree.tostring(elm, encoding="unicode", pretty_print=True)) - - import datetime - import os - import sys - - logger = logging.getLogger() - logger.setLevel(logging.DEBUG) - - datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - - file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str))) - debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str))) - stdout_handler = logging.StreamHandler(sys.stdout) - sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str))) - - file_handler.setLevel(logging.INFO) - debug_handler.setLevel(logging.DEBUG) - stdout_handler.setLevel(logging.INFO) - sdebug_handler.setLevel(logging.DEBUG) - - formatter = logging.Formatter(fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") - file_handler.setFormatter(formatter) - debug_handler.setFormatter(formatter) - stdout_handler.setFormatter(formatter) - sdebug_handler.setFormatter(formatter) - - stdout_handler.addFilter(logging.Filter("desktopenv")) - sdebug_handler.addFilter(logging.Filter("desktopenv")) - - logger.addHandler(file_handler) - logger.addHandler(debug_handler) - logger.addHandler(stdout_handler) - logger.addHandler(sdebug_handler) - - print( check_thunderbird_filter( "../../任务数据/Thunderbird/msgFilterRules.dat" - , { "expect": [ { "enabled": "yes" - , "action": "Move to folder" - , "actionValue": "mailbox://nobody@Local%20Folders/Promotions" - , "condition": ["AND 
(subject,contains,discount)"] - } - ] - } - ) - ) diff --git a/desktop_env/evaluators/metrics/vscode.py b/desktop_env/evaluators/metrics/vscode.py index f06623a..d207aae 100644 --- a/desktop_env/evaluators/metrics/vscode.py +++ b/desktop_env/evaluators/metrics/vscode.py @@ -236,6 +236,9 @@ def check_html_background_image(src_path: str, rule: Dict = None) -> float: Check if the background image is correctly set. multi-app:bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108 """ + if not src_path: + return 0.0 + from bs4 import BeautifulSoup with open(src_path, 'r') as f: html_content = f.read() @@ -252,6 +255,9 @@ def compare_result_files(src_path, tgt_path): Compare whether the content of two files are the same. multi-app:7f35355e-02a6-45b5-b140-f0be698bcf85 """ + if not src_path or not tgt_path: + return 0.0 + with open(src_path, 'r') as f: src_content = f.read().strip() with open(tgt_path, 'r') as f: @@ -271,12 +277,3 @@ def compare_result_files(src_path, tgt_path): if src_content == tgt_content: return 1.0 return 0.0 - - -if __name__ == "__main__": - src_path = "../../../cache/bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108/index.html" - rule = { - "type:": "value", - "value": "anmi_sharper.png" - } - print(check_html_background_image(src_path, rule)) diff --git a/desktop_env/server/main.py b/desktop_env/server/main.py index f691724..d53232e 100644 --- a/desktop_env/server/main.py +++ b/desktop_env/server/main.py @@ -63,7 +63,7 @@ def execute_command(): # Execute the command without any safety checks. try: - result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True) + result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell, text=True, timeout=120) return jsonify({ 'status': 'success', 'output': result.stdout, @@ -117,7 +117,7 @@ def launch_app(): def capture_screen_with_cursor(): # fixme: when running on virtual machines, the cursor is not captured, don't know why - file_path = os.path.join("screenshots", "screenshot.png") + file_path = os.path.join(os.path.dirname(__file__), "screenshots", "screenshot.png") user_platform = platform.system() # Ensure the screenshots directory exists @@ -284,6 +284,15 @@ def _create_atspi_node(node: Accessible, depth: int = 0, flag: Optional[str] = N text = text.replace("\ufffc", "").replace("\ufffd", "") # }}} Text # + # Image {{{ # + try: + node.queryImage() + except NotImplementedError: + pass + else: + attribute_dict["image"] = "true" + # }}} Image # + # Selection {{{ # try: node.querySelection() diff --git a/desktop_env/server/osbench_server.service b/desktop_env/server/osbench_server.service new file mode 100644 index 0000000..d0fa216 --- /dev/null +++ b/desktop_env/server/osbench_server.service @@ -0,0 +1,16 @@ +[Unit] +Description=OSBench Server +StartLimitIntervalSec=60 +StartLimitBurst=4 +After=network.target auditd.service + +[Service] +ExecStart=/usr/bin/python3 /home/user/main.py +User=user +WorkingDirectory=/home/user +Restart=on-failure +RestartSec=1 +Environment="DISPLAY=:1" + +[Install] +WantedBy=graphical.target diff --git a/desktop_env/server/osbench_server@.service b/desktop_env/server/osbench_server@.service new file mode 100644 index 0000000..87fc59f --- /dev/null +++ b/desktop_env/server/osbench_server@.service @@ -0,0 +1,16 @@ +[Unit] +Description=OSBench Server +StartLimitIntervalSec=60 +StartLimitBurst=4 +After=network.target auditd.service + +[Service] +ExecStart=/usr/bin/python3 /home/user/main.py +User=user +WorkingDirectory=/home/user +Restart=on-failure +RestartSec=1 
+Environment="DISPLAY=%i" + +[Install] +WantedBy=graphical.target diff --git a/evaluation_examples/examples/sheetcopilot/0326d92d-d218-48a8-9ca1-981cd6d064c7.json b/evaluation_examples/examples/libreoffice_calc/0326d92d-d218-48a8-9ca1-981cd6d064c7.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/0326d92d-d218-48a8-9ca1-981cd6d064c7.json rename to evaluation_examples/examples/libreoffice_calc/0326d92d-d218-48a8-9ca1-981cd6d064c7.json diff --git a/evaluation_examples/examples/sheetcopilot/035f41ba-6653-43ab-aa63-c86d449d62e5.json b/evaluation_examples/examples/libreoffice_calc/035f41ba-6653-43ab-aa63-c86d449d62e5.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/035f41ba-6653-43ab-aa63-c86d449d62e5.json rename to evaluation_examples/examples/libreoffice_calc/035f41ba-6653-43ab-aa63-c86d449d62e5.json diff --git a/evaluation_examples/examples/sheetcopilot/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f.json b/evaluation_examples/examples/libreoffice_calc/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f.json rename to evaluation_examples/examples/libreoffice_calc/04d9aeaf-7bed-4024-bedb-e10e6f00eb7f.json diff --git a/evaluation_examples/examples/sheetcopilot/0a2e43bf-b26c-4631-a966-af9dfa12c9e5.json b/evaluation_examples/examples/libreoffice_calc/0a2e43bf-b26c-4631-a966-af9dfa12c9e5.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/0a2e43bf-b26c-4631-a966-af9dfa12c9e5.json rename to evaluation_examples/examples/libreoffice_calc/0a2e43bf-b26c-4631-a966-af9dfa12c9e5.json diff --git a/evaluation_examples/examples/sheetcopilot/0acbd372-ca7a-4507-b949-70673120190f.json b/evaluation_examples/examples/libreoffice_calc/0acbd372-ca7a-4507-b949-70673120190f.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/0acbd372-ca7a-4507-b949-70673120190f.json rename to evaluation_examples/examples/libreoffice_calc/0acbd372-ca7a-4507-b949-70673120190f.json diff --git a/evaluation_examples/examples/sheetcopilot/12382c62-0cd1-4bf2-bdc8-1d20bf9b2371.json b/evaluation_examples/examples/libreoffice_calc/12382c62-0cd1-4bf2-bdc8-1d20bf9b2371.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/12382c62-0cd1-4bf2-bdc8-1d20bf9b2371.json rename to evaluation_examples/examples/libreoffice_calc/12382c62-0cd1-4bf2-bdc8-1d20bf9b2371.json diff --git a/evaluation_examples/examples/sheetcopilot/1273e544-688f-496b-8d89-3e0f40aa0606.json b/evaluation_examples/examples/libreoffice_calc/1273e544-688f-496b-8d89-3e0f40aa0606.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/1273e544-688f-496b-8d89-3e0f40aa0606.json rename to evaluation_examples/examples/libreoffice_calc/1273e544-688f-496b-8d89-3e0f40aa0606.json diff --git a/evaluation_examples/examples/sheetcopilot/163789f0-c895-4a50-8207-17cbdd56ec38.json b/evaluation_examples/examples/libreoffice_calc/163789f0-c895-4a50-8207-17cbdd56ec38.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/163789f0-c895-4a50-8207-17cbdd56ec38.json rename to evaluation_examples/examples/libreoffice_calc/163789f0-c895-4a50-8207-17cbdd56ec38.json diff --git a/evaluation_examples/examples/sheetcopilot/1954cced-e748-45c4-9c26-9855b97fbc5e.json b/evaluation_examples/examples/libreoffice_calc/1954cced-e748-45c4-9c26-9855b97fbc5e.json similarity index 100% rename from 
evaluation_examples/examples/sheetcopilot/1954cced-e748-45c4-9c26-9855b97fbc5e.json rename to evaluation_examples/examples/libreoffice_calc/1954cced-e748-45c4-9c26-9855b97fbc5e.json diff --git a/evaluation_examples/examples/sheetcopilot/1d17d234-e39d-4ed7-b46f-4417922a4e7c.json b/evaluation_examples/examples/libreoffice_calc/1d17d234-e39d-4ed7-b46f-4417922a4e7c.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/1d17d234-e39d-4ed7-b46f-4417922a4e7c.json rename to evaluation_examples/examples/libreoffice_calc/1d17d234-e39d-4ed7-b46f-4417922a4e7c.json diff --git a/evaluation_examples/examples/sheetcopilot/1de60575-bb6e-4c3d-9e6a-2fa699f9f197.json b/evaluation_examples/examples/libreoffice_calc/1de60575-bb6e-4c3d-9e6a-2fa699f9f197.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/1de60575-bb6e-4c3d-9e6a-2fa699f9f197.json rename to evaluation_examples/examples/libreoffice_calc/1de60575-bb6e-4c3d-9e6a-2fa699f9f197.json diff --git a/evaluation_examples/examples/sheetcopilot/1e8df695-bd1b-45b3-b557-e7d599cf7597.json b/evaluation_examples/examples/libreoffice_calc/1e8df695-bd1b-45b3-b557-e7d599cf7597.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/1e8df695-bd1b-45b3-b557-e7d599cf7597.json rename to evaluation_examples/examples/libreoffice_calc/1e8df695-bd1b-45b3-b557-e7d599cf7597.json diff --git a/evaluation_examples/examples/sheetcopilot/21ab7b40-77c2-4ae6-8321-e00d3a086c73.json b/evaluation_examples/examples/libreoffice_calc/21ab7b40-77c2-4ae6-8321-e00d3a086c73.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/21ab7b40-77c2-4ae6-8321-e00d3a086c73.json rename to evaluation_examples/examples/libreoffice_calc/21ab7b40-77c2-4ae6-8321-e00d3a086c73.json diff --git a/evaluation_examples/examples/sheetcopilot/26a8440e-c166-4c50-aef4-bfb77314b46b.json b/evaluation_examples/examples/libreoffice_calc/26a8440e-c166-4c50-aef4-bfb77314b46b.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/26a8440e-c166-4c50-aef4-bfb77314b46b.json rename to evaluation_examples/examples/libreoffice_calc/26a8440e-c166-4c50-aef4-bfb77314b46b.json diff --git a/evaluation_examples/examples/sheetcopilot/28047f4a-d877-4bea-95f7-e42b1c919957.json b/evaluation_examples/examples/libreoffice_calc/28047f4a-d877-4bea-95f7-e42b1c919957.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/28047f4a-d877-4bea-95f7-e42b1c919957.json rename to evaluation_examples/examples/libreoffice_calc/28047f4a-d877-4bea-95f7-e42b1c919957.json diff --git a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json index aba58cd..d4bbb32 100644 --- a/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json +++ b/evaluation_examples/examples/libreoffice_calc/2bd59342-0664-4ccb-ba87-79379096cc08.json @@ -10,10 +10,6 @@ "libreoffice_calc" ], "evaluator": { - "func": "infeasible", - "expected": { - }, - "result": { - } + "func": "infeasible" } -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/sheetcopilot/30e3e107-1cfb-46ee-a755-2cd080d7ba6a.json b/evaluation_examples/examples/libreoffice_calc/30e3e107-1cfb-46ee-a755-2cd080d7ba6a.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/30e3e107-1cfb-46ee-a755-2cd080d7ba6a.json rename to 
evaluation_examples/examples/libreoffice_calc/30e3e107-1cfb-46ee-a755-2cd080d7ba6a.json diff --git a/evaluation_examples/examples/sheetcopilot/3a7c8185-25c1-4941-bd7b-96e823c9f21f.json b/evaluation_examples/examples/libreoffice_calc/3a7c8185-25c1-4941-bd7b-96e823c9f21f.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/3a7c8185-25c1-4941-bd7b-96e823c9f21f.json rename to evaluation_examples/examples/libreoffice_calc/3a7c8185-25c1-4941-bd7b-96e823c9f21f.json diff --git a/evaluation_examples/examples/sheetcopilot/4172ea6e-6b77-4edb-a9cc-c0014bd1603b.json b/evaluation_examples/examples/libreoffice_calc/4172ea6e-6b77-4edb-a9cc-c0014bd1603b.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/4172ea6e-6b77-4edb-a9cc-c0014bd1603b.json rename to evaluation_examples/examples/libreoffice_calc/4172ea6e-6b77-4edb-a9cc-c0014bd1603b.json diff --git a/evaluation_examples/examples/sheetcopilot/42e0a640-4f19-4b28-973d-729602b5a4a7.json b/evaluation_examples/examples/libreoffice_calc/42e0a640-4f19-4b28-973d-729602b5a4a7.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/42e0a640-4f19-4b28-973d-729602b5a4a7.json rename to evaluation_examples/examples/libreoffice_calc/42e0a640-4f19-4b28-973d-729602b5a4a7.json diff --git a/evaluation_examples/examples/sheetcopilot/447b9505-7a2f-4863-9dd1-69395482eb4b.json b/evaluation_examples/examples/libreoffice_calc/447b9505-7a2f-4863-9dd1-69395482eb4b.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/447b9505-7a2f-4863-9dd1-69395482eb4b.json rename to evaluation_examples/examples/libreoffice_calc/447b9505-7a2f-4863-9dd1-69395482eb4b.json diff --git a/evaluation_examples/examples/sheetcopilot/4de54231-e4b5-49e3-b2ba-61a0bec721c0.json b/evaluation_examples/examples/libreoffice_calc/4de54231-e4b5-49e3-b2ba-61a0bec721c0.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/4de54231-e4b5-49e3-b2ba-61a0bec721c0.json rename to evaluation_examples/examples/libreoffice_calc/4de54231-e4b5-49e3-b2ba-61a0bec721c0.json diff --git a/evaluation_examples/examples/sheetcopilot/51719eea-10bc-4246-a428-ac7c433dd4b3.json b/evaluation_examples/examples/libreoffice_calc/51719eea-10bc-4246-a428-ac7c433dd4b3.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/51719eea-10bc-4246-a428-ac7c433dd4b3.json rename to evaluation_examples/examples/libreoffice_calc/51719eea-10bc-4246-a428-ac7c433dd4b3.json diff --git a/evaluation_examples/examples/sheetcopilot/535364ea-05bd-46ea-9937-9f55c68507e8.json b/evaluation_examples/examples/libreoffice_calc/535364ea-05bd-46ea-9937-9f55c68507e8.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/535364ea-05bd-46ea-9937-9f55c68507e8.json rename to evaluation_examples/examples/libreoffice_calc/535364ea-05bd-46ea-9937-9f55c68507e8.json diff --git a/evaluation_examples/examples/sheetcopilot/5549c616-3cec-478e-940e-0c92fe9a10e3.json b/evaluation_examples/examples/libreoffice_calc/5549c616-3cec-478e-940e-0c92fe9a10e3.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/5549c616-3cec-478e-940e-0c92fe9a10e3.json rename to evaluation_examples/examples/libreoffice_calc/5549c616-3cec-478e-940e-0c92fe9a10e3.json diff --git a/evaluation_examples/examples/sheetcopilot/5780a545-4e20-4230-95b4-cac135ef119f.json b/evaluation_examples/examples/libreoffice_calc/5780a545-4e20-4230-95b4-cac135ef119f.json similarity index 100% rename from 
evaluation_examples/examples/sheetcopilot/5780a545-4e20-4230-95b4-cac135ef119f.json rename to evaluation_examples/examples/libreoffice_calc/5780a545-4e20-4230-95b4-cac135ef119f.json diff --git a/evaluation_examples/examples/sheetcopilot/5b5434c6-560c-47a1-a89f-929c688448f5.json b/evaluation_examples/examples/libreoffice_calc/5b5434c6-560c-47a1-a89f-929c688448f5.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/5b5434c6-560c-47a1-a89f-929c688448f5.json rename to evaluation_examples/examples/libreoffice_calc/5b5434c6-560c-47a1-a89f-929c688448f5.json diff --git a/evaluation_examples/examples/sheetcopilot/5d353deb-c4b0-4126-a99e-5490817b48cb.json b/evaluation_examples/examples/libreoffice_calc/5d353deb-c4b0-4126-a99e-5490817b48cb.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/5d353deb-c4b0-4126-a99e-5490817b48cb.json rename to evaluation_examples/examples/libreoffice_calc/5d353deb-c4b0-4126-a99e-5490817b48cb.json diff --git a/evaluation_examples/examples/sheetcopilot/5f8601f8-6e90-4d2c-91bb-eb5836ad1d5c.json b/evaluation_examples/examples/libreoffice_calc/5f8601f8-6e90-4d2c-91bb-eb5836ad1d5c.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/5f8601f8-6e90-4d2c-91bb-eb5836ad1d5c.json rename to evaluation_examples/examples/libreoffice_calc/5f8601f8-6e90-4d2c-91bb-eb5836ad1d5c.json diff --git a/evaluation_examples/examples/sheetcopilot/64db6b55-06de-451d-b325-17c487fdfee5.json b/evaluation_examples/examples/libreoffice_calc/64db6b55-06de-451d-b325-17c487fdfee5.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/64db6b55-06de-451d-b325-17c487fdfee5.json rename to evaluation_examples/examples/libreoffice_calc/64db6b55-06de-451d-b325-17c487fdfee5.json diff --git a/evaluation_examples/examples/sheetcopilot/65551792-4c32-4904-983d-7c68c189b474.json b/evaluation_examples/examples/libreoffice_calc/65551792-4c32-4904-983d-7c68c189b474.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/65551792-4c32-4904-983d-7c68c189b474.json rename to evaluation_examples/examples/libreoffice_calc/65551792-4c32-4904-983d-7c68c189b474.json diff --git a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json index 0ebfeaf..46d6e7c 100644 --- a/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json +++ b/evaluation_examples/examples/libreoffice_calc/7b802dad-6e0f-4204-9815-d4e3f57627d8.json @@ -10,10 +10,6 @@ "libreoffice_calc" ], "evaluator": { - "func": "infeasible", - "expected": { - }, - "result": { - } + "func": "infeasible" } -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/sheetcopilot/82a95e94-6344-415d-b212-37241610c7fd.json b/evaluation_examples/examples/libreoffice_calc/82a95e94-6344-415d-b212-37241610c7fd.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/82a95e94-6344-415d-b212-37241610c7fd.json rename to evaluation_examples/examples/libreoffice_calc/82a95e94-6344-415d-b212-37241610c7fd.json diff --git a/evaluation_examples/examples/sheetcopilot/852527e8-1b97-466c-a12f-b6b095df59bc.json b/evaluation_examples/examples/libreoffice_calc/852527e8-1b97-466c-a12f-b6b095df59bc.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/852527e8-1b97-466c-a12f-b6b095df59bc.json rename to 
evaluation_examples/examples/libreoffice_calc/852527e8-1b97-466c-a12f-b6b095df59bc.json diff --git a/evaluation_examples/examples/sheetcopilot/8909d1cb-5877-44c7-a908-9f1875302441.json b/evaluation_examples/examples/libreoffice_calc/8909d1cb-5877-44c7-a908-9f1875302441.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/8909d1cb-5877-44c7-a908-9f1875302441.json rename to evaluation_examples/examples/libreoffice_calc/8909d1cb-5877-44c7-a908-9f1875302441.json diff --git a/evaluation_examples/examples/sheetcopilot/8fa9072b-ea9b-4679-84c6-420f3fe4c697.json b/evaluation_examples/examples/libreoffice_calc/8fa9072b-ea9b-4679-84c6-420f3fe4c697.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/8fa9072b-ea9b-4679-84c6-420f3fe4c697.json rename to evaluation_examples/examples/libreoffice_calc/8fa9072b-ea9b-4679-84c6-420f3fe4c697.json diff --git a/evaluation_examples/examples/sheetcopilot/96042ca2-6ea0-461c-8ba8-81efdc07bbf5.json b/evaluation_examples/examples/libreoffice_calc/96042ca2-6ea0-461c-8ba8-81efdc07bbf5.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/96042ca2-6ea0-461c-8ba8-81efdc07bbf5.json rename to evaluation_examples/examples/libreoffice_calc/96042ca2-6ea0-461c-8ba8-81efdc07bbf5.json diff --git a/evaluation_examples/examples/sheetcopilot/97dd78c1-4ba3-4bfd-bbd4-c938532dbcc6.json b/evaluation_examples/examples/libreoffice_calc/97dd78c1-4ba3-4bfd-bbd4-c938532dbcc6.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/97dd78c1-4ba3-4bfd-bbd4-c938532dbcc6.json rename to evaluation_examples/examples/libreoffice_calc/97dd78c1-4ba3-4bfd-bbd4-c938532dbcc6.json diff --git a/evaluation_examples/examples/sheetcopilot/9b534cd8-d497-4ca8-8444-82105b87d6f4.json b/evaluation_examples/examples/libreoffice_calc/9b534cd8-d497-4ca8-8444-82105b87d6f4.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/9b534cd8-d497-4ca8-8444-82105b87d6f4.json rename to evaluation_examples/examples/libreoffice_calc/9b534cd8-d497-4ca8-8444-82105b87d6f4.json diff --git a/evaluation_examples/examples/sheetcopilot/9b6c0b72-3ecc-482d-a240-8ceab861d46e.json b/evaluation_examples/examples/libreoffice_calc/9b6c0b72-3ecc-482d-a240-8ceab861d46e.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/9b6c0b72-3ecc-482d-a240-8ceab861d46e.json rename to evaluation_examples/examples/libreoffice_calc/9b6c0b72-3ecc-482d-a240-8ceab861d46e.json diff --git a/evaluation_examples/examples/sheetcopilot/9ed02102-6b28-4946-8339-c028166e9512.json b/evaluation_examples/examples/libreoffice_calc/9ed02102-6b28-4946-8339-c028166e9512.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/9ed02102-6b28-4946-8339-c028166e9512.json rename to evaluation_examples/examples/libreoffice_calc/9ed02102-6b28-4946-8339-c028166e9512.json diff --git a/evaluation_examples/examples/sheetcopilot/a16d1eb7-941b-4edd-8c08-344213f939ad.json b/evaluation_examples/examples/libreoffice_calc/a16d1eb7-941b-4edd-8c08-344213f939ad.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/a16d1eb7-941b-4edd-8c08-344213f939ad.json rename to evaluation_examples/examples/libreoffice_calc/a16d1eb7-941b-4edd-8c08-344213f939ad.json diff --git a/evaluation_examples/examples/sheetcopilot/b6da532f-9c4c-4e47-a302-a2c51972134f.json b/evaluation_examples/examples/libreoffice_calc/b6da532f-9c4c-4e47-a302-a2c51972134f.json similarity index 100% rename from 
evaluation_examples/examples/sheetcopilot/b6da532f-9c4c-4e47-a302-a2c51972134f.json rename to evaluation_examples/examples/libreoffice_calc/b6da532f-9c4c-4e47-a302-a2c51972134f.json diff --git a/evaluation_examples/examples/sheetcopilot/b6e9778c-11b3-455f-b720-655048787484.json b/evaluation_examples/examples/libreoffice_calc/b6e9778c-11b3-455f-b720-655048787484.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/b6e9778c-11b3-455f-b720-655048787484.json rename to evaluation_examples/examples/libreoffice_calc/b6e9778c-11b3-455f-b720-655048787484.json diff --git a/evaluation_examples/examples/sheetcopilot/c038008d-848a-4e20-abdb-a3e65a71a6cc.json b/evaluation_examples/examples/libreoffice_calc/c038008d-848a-4e20-abdb-a3e65a71a6cc.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/c038008d-848a-4e20-abdb-a3e65a71a6cc.json rename to evaluation_examples/examples/libreoffice_calc/c038008d-848a-4e20-abdb-a3e65a71a6cc.json diff --git a/evaluation_examples/examples/sheetcopilot/cb074a90-17ca-4f2a-be85-6f3c354040be.json b/evaluation_examples/examples/libreoffice_calc/cb074a90-17ca-4f2a-be85-6f3c354040be.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/cb074a90-17ca-4f2a-be85-6f3c354040be.json rename to evaluation_examples/examples/libreoffice_calc/cb074a90-17ca-4f2a-be85-6f3c354040be.json diff --git a/evaluation_examples/examples/sheetcopilot/cd159658-fff3-4f94-a518-fad4007a152a.json b/evaluation_examples/examples/libreoffice_calc/cd159658-fff3-4f94-a518-fad4007a152a.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/cd159658-fff3-4f94-a518-fad4007a152a.json rename to evaluation_examples/examples/libreoffice_calc/cd159658-fff3-4f94-a518-fad4007a152a.json diff --git a/evaluation_examples/examples/sheetcopilot/cd3c4994-b9e2-426b-8157-f7978ff55501.json b/evaluation_examples/examples/libreoffice_calc/cd3c4994-b9e2-426b-8157-f7978ff55501.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/cd3c4994-b9e2-426b-8157-f7978ff55501.json rename to evaluation_examples/examples/libreoffice_calc/cd3c4994-b9e2-426b-8157-f7978ff55501.json diff --git a/evaluation_examples/examples/sheetcopilot/de7a24c3-7f47-45c7-bba9-ba1aaaf015f8.json b/evaluation_examples/examples/libreoffice_calc/de7a24c3-7f47-45c7-bba9-ba1aaaf015f8.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/de7a24c3-7f47-45c7-bba9-ba1aaaf015f8.json rename to evaluation_examples/examples/libreoffice_calc/de7a24c3-7f47-45c7-bba9-ba1aaaf015f8.json diff --git a/evaluation_examples/examples/sheetcopilot/f13c9e86-3d6d-475f-b2bc-9557fe355236.json b/evaluation_examples/examples/libreoffice_calc/f13c9e86-3d6d-475f-b2bc-9557fe355236.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/f13c9e86-3d6d-475f-b2bc-9557fe355236.json rename to evaluation_examples/examples/libreoffice_calc/f13c9e86-3d6d-475f-b2bc-9557fe355236.json diff --git a/evaluation_examples/examples/sheetcopilot/f654bf9a-dea2-472d-a877-edeeb12d7462.json b/evaluation_examples/examples/libreoffice_calc/f654bf9a-dea2-472d-a877-edeeb12d7462.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/f654bf9a-dea2-472d-a877-edeeb12d7462.json rename to evaluation_examples/examples/libreoffice_calc/f654bf9a-dea2-472d-a877-edeeb12d7462.json diff --git a/evaluation_examples/examples/sheetcopilot/fe29cdf3-d317-47b3-a657-d61f97f00b88.json 
b/evaluation_examples/examples/libreoffice_calc/fe29cdf3-d317-47b3-a657-d61f97f00b88.json similarity index 100% rename from evaluation_examples/examples/sheetcopilot/fe29cdf3-d317-47b3-a657-d61f97f00b88.json rename to evaluation_examples/examples/libreoffice_calc/fe29cdf3-d317-47b3-a657-d61f97f00b88.json diff --git a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json index c0d6ba0..053421c 100644 --- a/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json +++ b/evaluation_examples/examples/libreoffice_impress/ac9bb6cb-1888-43ab-81e4-a98a547918cd.json @@ -63,6 +63,12 @@ "type": "vm_file", "path": "/home/user/Desktop/saa-format-guide.pptx", "dest": "saa-format-guide.pptx" + }, + "expected": { + "type": "rule", + "rules": { + "color": "red" + } } } } diff --git a/evaluation_examples/examples/multi_apps/0c825995-5b70-4526-b663-113f4c999dd2.json b/evaluation_examples/examples/multi_apps/0c825995-5b70-4526-b663-113f4c999dd2.json index a1903be..997a9b7 100644 --- a/evaluation_examples/examples/multi_apps/0c825995-5b70-4526-b663-113f4c999dd2.json +++ b/evaluation_examples/examples/multi_apps/0c825995-5b70-4526-b663-113f4c999dd2.json @@ -94,7 +94,7 @@ "result": { "type": "googledrive_file", "settings_file": "evaluation_examples/settings/googledrive/settings.yml", - "path": "environment_policy_report (draft).docx", + "path": ["environment_policy", "environment_policy_report (draft)"], "dest": "environment_policy_report (draft).docx" }, "expected": { diff --git a/evaluation_examples/examples/multi_apps/2b9493d7-49b8-493a-a71b-56cd1f4d6908.json b/evaluation_examples/examples/multi_apps/2b9493d7-49b8-493a-a71b-56cd1f4d6908.json index 99e148b..fd85e1b 100644 --- a/evaluation_examples/examples/multi_apps/2b9493d7-49b8-493a-a71b-56cd1f4d6908.json +++ b/evaluation_examples/examples/multi_apps/2b9493d7-49b8-493a-a71b-56cd1f4d6908.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.usercontent.google.com/download?id=104pg3yochKyH2Uvlp3BdvKmHgYmSIESu&export=download&authuser=0&confirm=t&uuid=d1926366-4e54-4a44-8dcd-fc49ed6524d7&at=APZUnTXcBFV9kcacsA0toU83lMKJ:1706505549057d", + "url": "https://drive.usercontent.google.com/download?id=1gqqY56robX1tb4YPa3Yk1d72T_k-Rgz3&export=download&authuser=0&confirm=t", "path": "/home/user/Desktop/15-MB-docx-file-download.docx" } ] diff --git a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json index 309f370..015e3a6 100644 --- a/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json +++ b/evaluation_examples/examples/multi_apps/3c8f201a-009d-4bbe-8b65-a6f8b35bb57f.json @@ -1,7 +1,7 @@ { "id": "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", "snapshot": "gimp", - "instruction": "Use `gdown` to download the image from \"https://drive.google.com/uc?export=download&id=1i8j5dGS57sA07jEuPNAlQW-sn5uqUnuK\", and then use GIMP to compress it to under 600KB. Resize if needed.", + "instruction": "Download the image from \"https://drive.google.com/uc?export=download&id=1i8j5dGS57sA07jEuPNAlQW-sn5uqUnuK\", and then use GIMP to compress it to under 600KB as \"compressed.jpeg\" on the Desktop. 
Resize if needed.", "source": "", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/42f4d1c7-4521-4161-b646-0a8934e36081.json b/evaluation_examples/examples/multi_apps/42f4d1c7-4521-4161-b646-0a8934e36081.json index a6adb54..0369148 100644 --- a/evaluation_examples/examples/multi_apps/42f4d1c7-4521-4161-b646-0a8934e36081.json +++ b/evaluation_examples/examples/multi_apps/42f4d1c7-4521-4161-b646-0a8934e36081.json @@ -1,7 +1,7 @@ { "id": "42f4d1c7-4521-4161-b646-0a8934e36081", "snapshot": "gimp", - "instruction": "Configure VS Code to edit GIMP script-fu scripts effectively by installing lisp extension. Test by writing code to resizing the image as 128 * 128 as \"resized.png\"", + "instruction": "Configure VS Code to edit GIMP script-fu scripts effectively by installing lisp extension. Test by writing code to resize the image \"character.png\" to 128 * 128 as \"resized.png\".", "source": "", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json index 0a70b11..447a862 100644 --- a/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json +++ b/evaluation_examples/examples/multi_apps/4c26e3f3-3a14-4d86-b44a-d3cedebbb487.json @@ -30,12 +30,12 @@ ], "evaluator": { "func": "check_brightness_decrease_and_structure_sim", - "expected": { + "result": { "type": "vm_file", "path": "/home/user/Desktop/background.png", "dest": "background.png" }, - "result": { + "expected": { "type": "cloud_file", "path": "https://drive.usercontent.google.com/download?id=13if1UwZ5ay6ADAVW2jp3rcyvAEBse6MJ&export=download&authuser=0&confirm=t&uuid=2ea03068-1874-4240-baa1-f8bb2f917a99&at=APZUnTXq6dVlASg819jCaI1A-rm2:1710136385956", "dest": "image_original.png" diff --git a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json index 3d32ee5..0f1c8ac 100644 --- a/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json +++ b/evaluation_examples/examples/multi_apps/51f5801c-18b3-4f25-b0c3-02f85507a078.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://drive.usercontent.google.com/download?id=1e12nL_V7bffaLSocQ86EiGCdygzggWeu&export=download", + "url": "https://drive.usercontent.google.com/download?id=1epTcblcYh8j_wFtA-aiXPIF2Oo1IVw8A&export=download", "path": "/home/user/Desktop/Dickinson_Slides.pptx" } ] @@ -36,7 +36,7 @@ }, "expected": { "type": "cloud_file", - "path": "https://drive.usercontent.google.com/download?id=1Xl6tgQ0K5qA1BDA2fKTK2xFLzXwbtkZ6&export=download", + "path": "https://drive.usercontent.google.com/download?id=1vUvaQLJUtFgbZi7lSzl0y0TS_WecFczm&export=download", "dest": "notes_gold.docx" }, "options": { diff --git a/evaluation_examples/examples/multi_apps/91190194-f406-4cd6-b3f9-c43fac942b22.json b/evaluation_examples/examples/multi_apps/91190194-f406-4cd6-b3f9-c43fac942b22.json index ba554e0..2d40c93 100644 --- a/evaluation_examples/examples/multi_apps/91190194-f406-4cd6-b3f9-c43fac942b22.json +++ b/evaluation_examples/examples/multi_apps/91190194-f406-4cd6-b3f9-c43fac942b22.json @@ -11,10 +11,6 @@ { "url": "https://drive.google.com/uc?export=download&id=1bmSRNNh4JkF6izrKrmynUHarf0pFES50", "path": "/home/user/Desktop/cola.png" - }, - { - "url": "https://drive.google.com/uc?export=download&id=1MayrIPJWRK7cMEVe3TxYmgkAbVMrYcQA", - "path": "/home/user/Desktop/cropped_gold.png" } ] } 
@@ -43,8 +39,8 @@ "dest": "cropped.png" }, "expected": { - "type": "vm_file", - "path": "/home/user/Desktop/cropped_gold.png", + "type": "cloud_file", + "path": "https://drive.google.com/uc?export=download&id=1MayrIPJWRK7cMEVe3TxYmgkAbVMrYcQA", "dest": "cropped_gold.png" } } diff --git a/evaluation_examples/examples/multi_apps/98e8e339-5f91-4ed2-b2b2-12647cb134f4.json b/evaluation_examples/examples/multi_apps/98e8e339-5f91-4ed2-b2b2-12647cb134f4.json index 3d08199..c7b01f9 100644 --- a/evaluation_examples/examples/multi_apps/98e8e339-5f91-4ed2-b2b2-12647cb134f4.json +++ b/evaluation_examples/examples/multi_apps/98e8e339-5f91-4ed2-b2b2-12647cb134f4.json @@ -1,7 +1,7 @@ { "id": "98e8e339-5f91-4ed2-b2b2-12647cb134f4", "snapshot": "vs_code", - "instruction": "Merge the contents of all .txt files from your vscode project into a single document in Writer. No merging separator is needed. Ensure to set the overall font size of the document to 10.", + "instruction": "Merge the contents of all .txt files from your vscode project into a single document \"concat.docx\" on Desktop with libreoffice writer. No merging separator is needed. Ensure to set the overall font size of the document to 10.", "source": "", "config": [ { diff --git a/evaluation_examples/examples/multi_apps/b5062e3e-641c-4e3a-907b-ac864d2e7652.json b/evaluation_examples/examples/multi_apps/b5062e3e-641c-4e3a-907b-ac864d2e7652.json index 7b7a0d5..c869428 100644 --- a/evaluation_examples/examples/multi_apps/b5062e3e-641c-4e3a-907b-ac864d2e7652.json +++ b/evaluation_examples/examples/multi_apps/b5062e3e-641c-4e3a-907b-ac864d2e7652.json @@ -38,7 +38,7 @@ } }, { - "type": "execute", + "type": "launch", "parameters": { "command": [ "nautilus", @@ -109,4 +109,4 @@ ] } } -} \ No newline at end of file +} diff --git a/evaluation_examples/examples/multi_apps/d68204bf-11c1-4b13-b48b-d303c73d4bf6.json b/evaluation_examples/examples/multi_apps/d68204bf-11c1-4b13-b48b-d303c73d4bf6.json index 55af9af..29573aa 100644 --- a/evaluation_examples/examples/multi_apps/d68204bf-11c1-4b13-b48b-d303c73d4bf6.json +++ b/evaluation_examples/examples/multi_apps/d68204bf-11c1-4b13-b48b-d303c73d4bf6.json @@ -11,10 +11,6 @@ { "url": "https://drive.google.com/uc?export=download&id=1CPGW_OZsfSWDdTU7CFrTjpzSAASyLy4w", "path": "/home/user/Desktop/tilearray.png" - }, - { - "url": "https://drive.google.com/uc?export=download&id=1aHwmnxL2CKEh_FhVpevY452-BQH2t5rG", - "path": "/home/user/Desktop/rearranged_gold.png" } ] } @@ -43,8 +39,8 @@ "dest": "rearranged.png" }, "expected": { - "type": "vm_file", - "path": "/home/user/Desktop/rearranged_gold.png", + "type": "cloud_file", + "path": "https://drive.google.com/uc?export=download&id=1aHwmnxL2CKEh_FhVpevY452-BQH2t5rG", "dest": "rearranged_gold.png" } } diff --git a/evaluation_examples/examples/multi_apps/e2392362-125e-4f76-a2ee-524b183a3412.json b/evaluation_examples/examples/multi_apps/e2392362-125e-4f76-a2ee-524b183a3412.json index 340f686..b591cfd 100644 --- a/evaluation_examples/examples/multi_apps/e2392362-125e-4f76-a2ee-524b183a3412.json +++ b/evaluation_examples/examples/multi_apps/e2392362-125e-4f76-a2ee-524b183a3412.json @@ -1,13 +1,17 @@ { "id": "e2392362-125e-4f76-a2ee-524b183a3412", "snapshot": "chrome", - "instruction": "I recently started using the famous personal academic homepage template from academicpages.github.io to build my own personal homepage, and I have cloned it to my local ~/Code/Website folder. 
According to an online tutorial, I can configure my name and contact information in the _config.yaml file. However, I am not familiar with the YAML file format. Please help me find the sections related to the name and contact information in this file and change them to “Test Account” and “Test@gmail.com”.", + "instruction": "I recently started using the famous personal academic homepage template from academicpages.github.io to build my own personal homepage, and I have cloned it to my local ~/Code/Website folder. According to an online tutorial, I can configure my name and contact information in the _config.yaml file. However, I am not familiar with the YAML file format. Please help me find the sections related to the name and contact information in this file and change them to \"Test Account\" and \"Test@gmail.com\".", "source": "authors", "config": [ { "type": "command", "parameters": { - "command": ["mkdir", "-p", "/home/user/Code/Website"] + "command": [ + "mkdir", + "-p", + "/home/user/Code/Website" + ] } }, { @@ -24,13 +28,22 @@ { "type": "execute", "parameters": { - "command": ["tar", "-xJvf", ".tmp.tar.xz", "-C", "/home/user/Code/Website/"] + "command": [ + "tar", + "-xJvf", + ".tmp.tar.xz", + "-C", + "/home/user/Code/Website/" + ] } }, { "type": "launch", "parameters": { - "command": ["google-chrome", "--remote-debugging-port=1337"] + "command": [ + "google-chrome", + "--remote-debugging-port=1337" + ] } }, { @@ -46,31 +59,59 @@ { "type": "chrome_open_tabs", "parameters": { - "urls_to_open": ["https://academicpages.github.io/"] + "urls_to_open": [ + "https://academicpages.github.io/" + ] } } ], "trajectory": "trajectories/e2392362-125e-4f76-a2ee-524b183a3412", - "related_apps": ["chrome", "os", "vscode"], + "related_apps": [ + "chrome", + "os", + "vscode" + ], "evaluator": { + "postconfig": [ + { + "type": "execute", + "parameters": { + "command": [ + "python", + "-c", + "import pyautogui; import time; pyautogui.hotkey('ctrl', 's'); time.sleep(0.5);" + ] + } + } + ], "func": "check_json", - "options": {"is_yaml": true}, + "options": { + "is_yaml": true + }, "expected": { "type": "rule", "rules": { "expect": [ { - "key": ["name"], + "key": [ + "name" + ], "method": "eq", "ref": "Test Account" }, { - "key": ["author", "name"], + "key": [ + "author", + "name" + ], "method": "eq", "ref": "Test Account" }, { - "key": ["author", "email"], + "key": [ + "author", + "email" + ], "method": "eq", "ref": "Test@gmail.com" } @@ -83,4 +124,4 @@ "dest": "_config.yaml" } } -} +} \ No newline at end of file diff --git a/evaluation_examples/settings/googledrive/credentials.json b/evaluation_examples/settings/googledrive/credentials.json index c4b0d33..81d22c2 100644 --- a/evaluation_examples/settings/googledrive/credentials.json +++ b/evaluation_examples/settings/googledrive/credentials.json @@ -1 +1 @@ -{"access_token": "ya29.a0Ad52N3969wUkQepy6SBOSw9Gjg4-MNPfEUBD3OZpajVfs9wL4DbfImk-5XawHjBkTdCKKBqG5R9XIX6KvvUzQDfB2BwVwb0MfLfLJDLALia7MRdPn4j6GAES372u3bSqJNNPMwVZA9j-THb3o5svJiKcJgwcoFKeKC_xaCgYKAScSARISFQHGX2MioJPeGh_8OM6z1_BujwRe3Q0171", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-08T17:16:15Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": 
null, "token_response": {"access_token": "ya29.a0Ad52N3969wUkQepy6SBOSw9Gjg4-MNPfEUBD3OZpajVfs9wL4DbfImk-5XawHjBkTdCKKBqG5R9XIX6KvvUzQDfB2BwVwb0MfLfLJDLALia7MRdPn4j6GAES372u3bSqJNNPMwVZA9j-THb3o5svJiKcJgwcoFKeKC_xaCgYKAScSARISFQHGX2MioJPeGh_8OM6z1_BujwRe3Q0171", "expires_in": 3599, "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"} \ No newline at end of file +{"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0eVpYfdSAjvbCCgYIARAAGA4SNwF-L9IrAgL6KVceiEVTjtQdmPki2I3m8ejP3lzTLL2Wa3-rdrYfU7eYeKDVCS5KRxa_xCE_pPY", "token_expiry": "2024-03-13T10:09:01Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0Ad52N382_JIl2nZBNpJCgoU3HXk2Kz7CArVYn_PGI8pXFucAozry1Vmp5QolzGrnl4UChZswJDOgcdPm5Ew-NbdHPX95wxknoG1oJKqjWYtjl3mw433hiGtriuKWKnXcz1NUf8ewqqq458tJLLDhbbZFW7eZRQrdJzmrGAaCgYKAZ4SARISFQHGX2Mik2MQ5qx0goIypVyzbcUmYw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"} \ No newline at end of file diff --git a/evaluation_examples/test_all.json b/evaluation_examples/test_all.json new file mode 100644 index 0000000..7153d86 --- /dev/null +++ b/evaluation_examples/test_all.json @@ -0,0 +1,398 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "06fe7178-4491-4589-810f-2e2bc9502122", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", + "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", + "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "af630914-714e-4a24-a7bb-f9af687d3b91", + "3720f614-37fd-4d04-8a6b-76f54f8c222d", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "12086550-11c0-466b-b367-1d9e75b3910e", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "93eabf48-6a27-4cb6-b963-7d5fe1e0d3a9", + "ae78f875-5b98-4907-bbb5-9c737fc68c03", + "3299584d-8f11-4457-bf4c-ce98f7600250", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "9656a811-9b5b-4ddf-99c7-5117bcef0626", + "fc6d8143-9452-4171-9459-7f515143419a", + "a96b564e-dbe9-42c3-9ccf-b4498073438a", + "1704f00f-79e6-43a7-961b-cedd3724d5fd", + "f3b19d1e-2d48-44e9-b4e1-defcae1a0197", + "82bc8d6a-36eb-4d2d-8801-ef714fb1e55a", + "47543840-672a-467d-80df-8f7c3b9788c9", + "c1fa57f3-c3db-4596-8f09-020701085416", + "da46d875-6b82-4681-9284-653b0c7ae241", + "6c4c23a1-42a4-43cc-9db1-2f86ff3738cc", + "f79439ad-3ee8-4f99-a518-0eb60e5652b0", + "b7895e80-f4d1-4648-bee0-4eb45a6f1fa8", + 
"9f3f70fc-5afc-4958-a7b7-3bb4fcb01805", + "7f52cab9-535c-4835-ac8c-391ee64dc930", + "82279c77-8fc6-46f6-9622-3ba96f61b477", + "2888b4e6-5b47-4b57-8bf5-c73827890774", + "b4f95342-463e-4179-8c3f-193cd7241fb2", + "f5d96daf-83a8-4c86-9686-bada31fc66ab", + "121ba48f-9e17-48ce-9bc6-a4fb17a7ebba", + "368d9ba4-203c-40c1-9fa3-da2f1430ce63", + "59155008-fe71-45ec-8a8f-dc35497b6aa8", + "a728a36e-8bf1-4bb6-9a03-ef039a5233f0", + "b070486d-e161-459b-aa2b-ef442d973b92", + "0d8b7de3-e8de-4d86-b9fd-dd2dce58a217", + "9f935cce-0a9f-435f-8007-817732bfc0a5", + "f0b971a1-6831-4b9b-a50e-22a6e47f45ba", + "cabb3bae-cccb-41bd-9f5d-0f3a9fecd825" + ], + "gimp": [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a", + "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", + "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", + "d52d6308-ec58-42b7-a2c9-de80e4837b2b", + "2a729ded-3296-423d-aec4-7dd55ed5fbb3", + "b148e375-fe0b-4bec-90e7-38632b0d73c2", + "a746add2-cab0-4740-ac36-c3769d9bfb46", + "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", + "d16c99dc-2a1e-46f2-b350-d97c86c85c15", + "06ca5602-62ca-47f6-ad4f-da151cde54cc", + "e2dd0213-26db-4349-abe5-d5667bfd725c", + "f723c744-e62c-4ae6-98d1-750d3cd7d79d", + "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", + "7767eef2-56a3-4cea-8c9f-48c070c7d65b", + "734d6579-c07d-47a8-9ae2-13339795476b", + "e19bd559-633b-4b02-940f-d946248f088e", + "38f48d40-764e-4e77-a7cf-51dfce880291", + "fbb548ca-c2a6-4601-9204-e39a2efc507b", + "5ca86c6f-f317-49d8-b6a7-b527541caae8", + "62f7fd55-0687-4a43-b6e1-3eda16fc6252", + "8ea73f6f-9689-42ad-8c60-195bbf06a7ba", + "58d3eeeb-e9d0-499f-962e-fd0db2a744d8", + "2e6f678f-472d-4c55-99cc-8e7c5c402a71", + "045bf3ff-9077-4b86-b483-a1040a949cff", + "dbbf4b99-2253-4b10-9274-45f246af2466" + ], + "libreoffice_calc": [ + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "42e0a640-4f19-4b28-973d-729602b5a4a7", + "51719eea-10bc-4246-a428-ac7c433dd4b3", + "1954cced-e748-45c4-9c26-9855b97fbc5e", + "2bd59342-0664-4ccb-ba87-79379096cc08", + "3aaa4e37-dc91-482e-99af-132a612d40f3", + "1273e544-688f-496b-8d89-3e0f40aa0606", + "12382c62-0cd1-4bf2-bdc8-1d20bf9b2371", + "f9584479-3d0d-4c79-affa-9ad7afdd8850", + "535364ea-05bd-46ea-9937-9f55c68507e8", + "7e429b8d-a3f0-4ed0-9b58-08957d00b127", + "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", + "04d9aeaf-7bed-4024-bedb-e10e6f00eb7f", + "0bf05a7d-b28b-44d2-955a-50b41e24012a", + "6054afcb-5bab-4702-90a0-b259b5d3217c", + "abed40dc-063f-4598-8ba5-9fe749c0615d", + "37608790-6147-45d0-9f20-1137bb35703d", + "26a8440e-c166-4c50-aef4-bfb77314b46b", + "d681960f-7bc3-4286-9913-a8812ba3261a", + "035f41ba-6653-43ab-aa63-c86d449d62e5", + "7efeb4b1-3d19-4762-b163-63328d66303b", + "1de60575-bb6e-4c3d-9e6a-2fa699f9f197", + "aa3a8974-2e85-438b-b29e-a64df44deb4b", + "51b11269-2ca8-4b2a-9163-f21758420e78", + "1e8df695-bd1b-45b3-b557-e7d599cf7597", + "ecb0df7a-4e8d-4a03-b162-053391d3afaf", + "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", + "a01fbce3-2793-461f-ab86-43680ccbae25", + "0326d92d-d218-48a8-9ca1-981cd6d064c7", + "0a2e43bf-b26c-4631-a966-af9dfa12c9e5", + "4188d3a4-077d-46b7-9c86-23e1a036f6c1", + "347ef137-7eeb-4c80-a3bb-0951f26a8aff", + "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", + "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", + "1d17d234-e39d-4ed7-b46f-4417922a4e7c", + "4e6fcf72-daf3-439f-a232-c434ce416af6", + "01b269ae-2111-4a07-81fd-3fcd711993b0", + "21df9241-f8d7-4509-b7f1-37e501a823f7", + "a9f325aa-8c05-4e4f-8341-9e4358565f4f", + "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", + "7a4e4bc8-922c-4c84-865c-25ba34136be1", + "4de54231-e4b5-49e3-b2ba-61a0bec721c0", + 
"30e3e107-1cfb-46ee-a755-2cd080d7ba6a", + "4172ea6e-6b77-4edb-a9cc-c0014bd1603b", + "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", + "3a7c8185-25c1-4941-bd7b-96e823c9f21f", + "21ab7b40-77c2-4ae6-8321-e00d3a086c73" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54", + "455d3c66-7dc6-4537-a39a-36d3e9119df7", + "af23762e-2bfd-4a1d-aada-20fa8de9ce07", + "c59742c0-4323-4b9d-8a02-723c251deaa0", + "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", + "9ec204e4-f0a3-42f8-8458-b772a6797cab", + "0f84bef9-9790-432e-92b7-eece357603fb", + "ce88f674-ab7a-43da-9201-468d38539e4a", + "3b27600c-3668-4abd-8f84-7bcdebbccbdb", + "a097acff-6266-4291-9fbd-137af7ecd439", + "bf4e9888-f10f-47af-8dba-76413038b73c", + "21760ecb-8f62-40d2-8d85-0cee5725cb72", + "ac9bb6cb-1888-43ab-81e4-a98a547918cd", + "2cd43775-7085-45d8-89fa-9e35c0a915cf", + "358aa0a7-6677-453f-ae35-e440f004c31e", + "a669ef01-ded5-4099-9ea9-25e99b569840", + "73c99fb9-f828-43ce-b87a-01dc07faa224", + "15aece23-a215-4579-91b4-69eec72e18da", + "986fc832-6af2-417c-8845-9272b3a1528b", + "a434992a-89df-4577-925c-0c58b747f0f4", + "7dbc52a6-11e0-4c9a-a2cb-1e36cfda80d8", + "841b50aa-df53-47bd-a73a-22d3a9f73160", + "8979838c-54a5-4454-a2b8-3d135a1a5c8f", + "b8adbc24-cef2-4b15-99d5-ecbe7ff445eb", + "2b94c692-6abb-48ae-ab0b-b3e8a19cb340", + "9cf05d24-6bd9-4dae-8967-f67d88f5d38a", + "08aced46-45a2-48d7-993b-ed3fb5b32302", + "edb61b14-a854-4bf5-a075-c8075c11293a", + "c82632a4-56b6-4db4-9dd1-3820ee3388e4", + "39be0d19-634d-4475-8768-09c130f5425d", + "ac1b39ff-ee4d-4483-abce-c117e98942f0", + "f23acfd2-c485-4b7c-a1e7-d4303ddfe864", + "70bca0cc-c117-427e-b0be-4df7299ebeb6", + "af2d657a-e6b3-4c6a-9f67-9e3ed015974c", + "57667013-ea97-417c-9dce-2713091e6e2a", + "0a211154-fda0-48d0-9274-eaac4ce5486d", + "a53f80cd-4a90-4490-8310-097b011433f6", + "7ae48c60-f143-4119-b659-15b8f485eb9a", + "5cfb9197-e72b-454b-900e-c06b0c802b40", + "05dd4c1d-c489-4c85-8389-a7836c4f0567", + "5c1a6c3d-c1b3-47cb-9b01-8d1b7544ffa1", + "4ed5abd0-8b5d-47bd-839f-cacfa15ca37a", + "e4ef0baf-4b52-4590-a47e-d4d464cca2d7", + "ed43c15f-00cb-4054-9c95-62c880865d68", + "3161d64e-3120-47b4-aaad-6a764a92493b", + "04578141-1d42-4146-b9cf-6fab4ce5fd74" + ], + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c", + "0b17a146-2934-46c7-8727-73ff6b6483e8", + "0e47de2a-32e0-456c-a366-8c607ef7a9d2", + "0e763496-b6bb-4508-a427-fad0b6c3e195", + "3ef2b351-8a84-4ff2-8724-d86eae9b842e", + "4bcb1253-a636-4df4-8cb0-a35c04dfef31", + "66399b0d-8fda-4618-95c4-bfc6191617e9", + "6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2", + "6ada715d-3aae-4a32-a6a7-429b2e43fb93", + "6f81754e-285d-4ce0-b59e-af7edb02d108", + "72b810ef-4156-4d09-8f08-a0cf57e7cefe", + "8472fece-c7dd-4241-8d65-9b3cd1a0b568", + "88fe4b2d-3040-4c70-9a70-546a47764b48", + "936321ce-5236-426a-9a20-e0e3c5dc536f", + "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", + "b21acd93-60fd-4127-8a43-2f5178f4a830", + "d53ff5ee-3b1a-431e-b2be-30ed2673079b", + "e246f6d8-78d7-44ac-b668-fcf47946cb50", + "e528b65e-1107-4b8c-8988-490e4fece599", + "ecc2413d-8a48-416e-a3a2-d30106ca36cb", + "f178a4a9-d090-4b56-bc4c-4b72a61a035d", + "bb8ccc78-479f-4a2f-a71e-d565e439436b" + ], + "multi_apps": [ + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", + "2fe4b718-3bd7-46ec-bdce-b184f5653624", + "3680a5ee-6870-426a-a997-eba929a0d25c", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "510f64c8-9bcc-4be1-8d30-638705850618", + 
"51f5801c-18b3-4f25-b0c3-02f85507a078", + "58565672-7bfe-48ab-b828-db349231de6b", + "78aed49a-a710-4321-a793-b611a7c5b56b", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "937087b6-f668-4ba6-9110-60682ee33441", + "a0b9dc9c-fc07-4a88-8c5d-5e3ecad91bcb", + "b52b40a5-ad70-4c53-b5b0-5650a8387052", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "d9b7c649-c975-4f53-88f5-940b29c47247", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "ee9a3c83-f437-4879-8918-be5efbb9fac7", + "f7dfbef3-7697-431c-883a-db8583a4e4f9", + "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", + "6d72aad6-187a-4392-a4c4-ed87269c51cf", + "f918266a-b3e0-4914-865d-4faa564f1aef", + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "bc2b57f3-686d-4ec9-87ce-edf850b7e442", + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "00fa164e-2612-4439-992e-157d019a8436", + "acb0f96b-e27c-44d8-b55f-7cb76609dfcd", + "69acbb55-d945-4927-a87b-8480e1a5bb7e", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "68a25bd4-59c7-4f4d-975e-da0c8509c848", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "0c825995-5b70-4526-b663-113f4c999dd2", + "c7c1e4c3-9e92-4eba-a4b8-689953975ea4", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "337d318b-aa07-4f4f-b763-89d9a2dd013f", + "82e3c869-49f6-4305-a7ce-f3e64a0618e7", + "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", + "869de13e-bef9-4b91-ba51-f6708c40b096", + "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", + "26150609-0da3-4a7d-8868-0faf9c5f01bb", + "9219480b-3aed-47fc-8bac-d2cffc5849f7", + "881deb30-9549-4583-a841-8270c65f2a17", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "5bc63fb9-276a-4439-a7c1-9dc76401737f", + "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", + "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", + "36037439-2044-4b50-b9d1-875b5a332143", + "716a6079-22da-47f1-ba73-c9d58f986a38", + "873cafdd-a581-47f6-8b33-b9696ddb7b05", + "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", + "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", + "da922383-bfa4-4cd3-bbad-6bebab3d7742", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", + "81c425f5-78f3-4771-afd6-3d2973825947", + "bb83cab4-e5c7-42c7-a67b-e46068032b86", + "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", + "b337d106-053f-4d37-8da0-7f9c4043a66b", + "20236825-b5df-46e7-89bf-62e1d640a897", + "8df7e444-8e06-4f93-8a1a-c5c974269d82", + "aad10cd7-9337-4b62-b704-a857848cedf2", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", + "a503b07f-9119-456b-b75d-f5146737d24f", + "09a37c51-e625-49f4-a514-20a773797a8a", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "f5c13cdd-205c-4719-a562-348ae5cd1d91", + "5990457f-2adb-467b-a4af-5c857c92d762", + "415ef462-bed3-493a-ac36-ca8c6d23bf1b", + "7ff48d5b-2df2-49da-b500-a5150ffc7f18", + "9f3bb592-209d-43bc-bb47-d77d9df56504", + "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", + "ce2b64a2-ddc1-4f91-8c7d-a88be7121aac", + "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", + "e1fc0df3-c8b9-4ee7-864c-d0b590d3aa56", + "f8369178-fafe-40c2-adc4-b9b08a125456", + "778efd0a-153f-4842-9214-f05fc176b877", + "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", + "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", + "788b3701-3ec9-4b67-b679-418bfa726c22", + "48c46dc7-fe04-4505-ade7-723cba1aa6f6", + "42d25c08-fb87-4927-8b65-93631280a26f", + "bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108", + "e8172110-ec08-421b-a6f5-842e6451911f", + "42f4d1c7-4521-4161-b646-0a8934e36081", + "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", + 
"d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "91190194-f406-4cd6-b3f9-c43fac942b22", + "7f35355e-02a6-45b5-b140-f0be698bcf85", + "98e8e339-5f91-4ed2-b2b2-12647cb134f4", + "0e5303d4-8820-42f6-b18d-daf7e633de21", + "df67aebb-fb3a-44fd-b75b-51b6012df509", + "5df7b33a-9f77-4101-823e-02f863e1c1ae", + "aceb0368-56b8-4073-b70e-3dc9aee184e0", + "22a4636f-8179-4357-8e87-d1743ece1f81", + "236833a3-5704-47fc-888c-4f298f09f799", + "67890eb6-6ce5-4c00-9e3d-fb4972699b06" + ], + "os": [ + "94d95f96-9699-4208-98ba-3c3119edf9c2", + "bedcedc4-4d72-425e-ad62-21960b11fe0d", + "43c2d64c-bab5-4dcb-a30c-b888321c319a", + "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", + "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", + "a462a795-fdc7-4b23-b689-e8b6df786b78", + "f9be0997-4b7c-45c5-b05c-4612b44a6118", + "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "e0df059f-28a6-4169-924f-b9623e7184cc", + "ddc75b62-7311-4af8-bfb3-859558542b36", + "b6781586-6346-41cd-935a-a6b1487918fc", + "b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa", + "3ce045a0-877b-42aa-8d2c-b4a863336ab8", + "fe41f596-a71b-4c2f-9b2f-9dcd40b568c3", + "a4d98375-215b-4a4d-aee9-3d4370fccc41", + "13584542-872b-42d8-b299-866967b5c3ef", + "23393935-50c7-4a86-aeea-2b78fd089c5c", + "5812b315-e7bd-4265-b51f-863c02174c28", + "c288e301-e626-4b98-a1ab-159dcb162af5", + "cc9d4f34-1ca0-4a1b-8ff2-09302696acb9", + "c56de254-a3ec-414e-81a6-83d2ce8c41fa", + "4783cc41-c03c-4e1b-89b4-50658f642bd5", + "5c1075ca-bb34-46a3-a7a0-029bd7463e79", + "5ced85fc-fa1a-4217-95fd-0fb530545ce2", + "37887e8c-da15-4192-923c-08fa390a176d", + "4127319a-8b79-4410-b58a-7a151e15f3d7", + "4d117223-a354-47fb-8b45-62ab1390a95f", + "6f56bf42-85b8-4fbb-8e06-6c44960184ba" + ], + "thunderbird": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", + "12086550-11c0-466b-b367-1d9e75b3910e", + "06fe7178-4491-4589-810f-2e2bc9502122", + "6766f2b8-8a72-417f-a9e5-56fcaa735837", + "e1e75309-3ddb-4d09-92ec-de869c928143", + "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", + "35253b65-1c19-4304-8aa4-6884b8218fc0", + "d088f539-cab4-4f9a-ac92-9999fc3a656e", + "2ad9387a-65d8-4e33-ad5b-7580065a27ca", + "480bcfea-d68f-4aaa-a0a9-2589ef319381", + "030eeff7-b492-4218-b312-701ec99ee0cc", + "94760984-3ff5-41ee-8347-cf1af709fea0", + "99146c54-4f37-4ab8-9327-5f3291665e1e", + "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", + "8f080098-ddb1-424c-b438-4e96e5e4786e", + "bba3381f-b5eb-4439-bd9e-80c22218d5a7", + "fba2c100-79e8-42df-ae74-b592418d54f4", + "efcf0d81-0835-4880-b2fd-d866e8bc2294", + "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", + "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", + "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", + "9195653c-f4aa-453d-aa95-787f6ccfaae9", + "d06f0d4d-2cd5-4ede-8de9-598629438c6e", + "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", + "5ac2891a-eacd-4954-b339-98abba077adb", + "f3977615-2b45-4ac5-8bba-80c17dbe2a37", + "215dfd39-f493-4bc3-a027-8a97d72c61bf", + "cb130f0d-d36f-4302-9838-b3baf46139b6", + "7882ed6e-bece-4bf0-bada-c32dc1ddae72" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb", + "eabc805a-bfcf-4460-b250-ac92135819f6", + "982d12a5-beab-424f-8d38-d2a48429e511", + "4e60007a-f5be-4bfc-9723-c39affa0a6d3", + "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", + "9439a27b-18ae-42d8-9778-5f68f891805e", + "ae506c68-352c-4094-9caa-ee9d42052317", + "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", + "930fdb3b-11a8-46fe-9bac-577332e2640e", + "276cc624-87ea-4f08-ab93-f770e3790175", 
+ "9d425400-e9b2-4424-9a4b-d4c7abac4140", + "5e2d93d8-8ad0-4435-b150-1692aacaa994", + "6ed0a554-cbee-4b44-84ea-fd6c042f4fe1", + "ec71221e-ac43-46f9-89b8-ee7d80f7e1c5", + "70745df8-f2f5-42bd-8074-fbc10334fcc5", + "57242fad-77ca-454f-b71b-f187181a9f23", + "c6bf789c-ba3a-4209-971d-b63abf0ab733", + "0512bb38-d531-4acf-9e7e-0add90816068", + "847a96b6-df94-4927-97e6-8cc9ea66ced7", + "7aeae0e2-70ee-4705-821d-1bba5d5b2ddd", + "dcbe20e8-647f-4f1d-8696-f1c5bbb570e3", + "7c4cc09e-7a92-40dd-8338-b2286535c4ed", + "971cbb5b-3cbf-4ff7-9e24-b5c84fcebfa6" + ] +} \ No newline at end of file diff --git a/evaluation_examples/test_small.json b/evaluation_examples/test_small.json new file mode 100644 index 0000000..4c1feb7 --- /dev/null +++ b/evaluation_examples/test_small.json @@ -0,0 +1,102 @@ +{ + "chrome": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + ], + "gimp": [ + "7a4deb26-d57d-4ea9-9a73-630f66a7b568", + "554785e9-4523-4e7a-b8e1-8016f565f56a" + ], + "libreoffice_calc": [ + "357ef137-7eeb-4c80-a3bb-0951f26a8aff", + "42e0a640-4f19-4b28-973d-729602b5a4a7" + ], + "libreoffice_impress": [ + "5d901039-a89c-4bfb-967b-bf66f4df075e", + "550ce7e7-747b-495f-b122-acdc4d0b8e54" + ], + "libreoffice_writer": [ + "0810415c-bde4-4443-9047-d5f70165a697", + "0a0faba3-5580-44df-965d-f562a99b291c" + ], + "multi_apps": [ + "2b9493d7-49b8-493a-a71b-56cd1f4d6908", + "46407397-a7d5-4c6b-92c6-dbe038b1457b", + "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", + "510f64c8-9bcc-4be1-8d30-638705850618", + "897e3b53-5d4d-444b-85cb-2cdc8a97d903", + "c867c42d-a52d-4a24-8ae3-f75d256b5618", + "e135df7c-7687-4ac0-a5f0-76b74438b53e", + "f7dfbef3-7697-431c-883a-db8583a4e4f9", + "6d72aad6-187a-4392-a4c4-ed87269c51cf", + "f918266a-b3e0-4914-865d-4faa564f1aef", + "da52d699-e8d2-4dc5-9191-a2199e0b6a9b", + "74d5859f-ed66-4d3e-aa0e-93d7a592ce41", + "b5062e3e-641c-4e3a-907b-ac864d2e7652", + "48d05431-6cd5-4e76-82eb-12b60d823f7d", + "eb303e01-261e-4972-8c07-c9b4e7a4922a", + "d1acdb87-bb67-4f30-84aa-990e56a09c92", + "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", + "8e116af7-7db7-4e35-a68b-b0939c066c78", + "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", + "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", + "3a93cae4-ad3e-403e-8c12-65303b271818", + "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", + "26150609-0da3-4a7d-8868-0faf9c5f01bb", + "7e287123-70ca-47b9-8521-47db09b69b14", + "e2392362-125e-4f76-a2ee-524b183a3412", + "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", + "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", + "36037439-2044-4b50-b9d1-875b5a332143", + "716a6079-22da-47f1-ba73-c9d58f986a38", + "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", + "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", + "da922383-bfa4-4cd3-bbad-6bebab3d7742", + "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", + "81c425f5-78f3-4771-afd6-3d2973825947", + "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", + "20236825-b5df-46e7-89bf-62e1d640a897", + "02ce9a50-7af2-47ed-8596-af0c230501f8", + "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", + "09a37c51-e625-49f4-a514-20a773797a8a", + "3e3fc409-bff3-4905-bf16-c968eee3f807", + "415ef462-bed3-493a-ac36-ca8c6d23bf1b", + "9f3bb592-209d-43bc-bb47-d77d9df56504", + "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", + "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", + "f8369178-fafe-40c2-adc4-b9b08a125456", + "778efd0a-153f-4842-9214-f05fc176b877", + "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", + "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", + "48c46dc7-fe04-4505-ade7-723cba1aa6f6", + "42d25c08-fb87-4927-8b65-93631280a26f", + "bb7db4c2-30b5-4be7-8dd7-b8c4ec7d3108", + "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", + 
"d68204bf-11c1-4b13-b48b-d303c73d4bf6", + "91190194-f406-4cd6-b3f9-c43fac942b22", + "7f35355e-02a6-45b5-b140-f0be698bcf85", + "98e8e339-5f91-4ed2-b2b2-12647cb134f4", + "df67aebb-fb3a-44fd-b75b-51b6012df509", + "5df7b33a-9f77-4101-823e-02f863e1c1ae", + "22a4636f-8179-4357-8e87-d1743ece1f81", + "236833a3-5704-47fc-888c-4f298f09f799" + ], + "os": [ + "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", + "5812b315-e7bd-4265-b51f-863c02174c28", + "43c2d64c-bab5-4dcb-a30c-b888321c319a", + "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82" + ], + "thunderbird": [ + "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", + "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" + ], + "vlc": [ + "59f21cfb-0120-4326-b255-a5b827b38967", + "8f080098-ddb1-424c-b438-4e96e5e4786e" + ], + "vs_code": [ + "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", + "53ad5833-3455-407b-bbc6-45b4c79ab8fb" + ] +} \ No newline at end of file diff --git a/experiment_a11y_tree.py b/experiment_a11y_tree.py deleted file mode 100644 index c441bd0..0000000 --- a/experiment_a11y_tree.py +++ /dev/null @@ -1,432 +0,0 @@ -import datetime -import json -import logging -import os -import sys - -import func_timeout - -from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# Logger Configs {{{ # -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") -stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") - -file_handler.setLevel(logging.INFO) -debug_handler.setLevel(logging.DEBUG) -stdout_handler.setLevel(logging.INFO) -sdebug_handler.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") -file_handler.setFormatter(formatter) -debug_handler.setFormatter(formatter) -stdout_handler.setFormatter(formatter) -sdebug_handler.setFormatter(formatter) - -stdout_handler.addFilter(logging.Filter("desktopenv")) -sdebug_handler.addFilter(logging.Filter("desktopenv")) - -logger.addHandler(file_handler) -logger.addHandler(debug_handler) -logger.addHandler(stdout_handler) -logger.addHandler(sdebug_handler) -# }}} Logger Configs # - -logger = logging.getLogger("desktopenv.experiment") - -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" - - -def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): - trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") - env = DesktopEnv( - path_to_vm=PATH_TO_VM, - action_space=agent.action_space, - task_config=example - ) - # reset the environment to certain snapshot - observation = env.reset() - done = False - step_num = 0 - - if recording: - # send a request to the server to start recording - env.controller.start_recording() - - while not done and step_num < max_steps: - actions = agent.predict(observation) - step_num += 1 - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_num, action) - - observation, reward, done, info = env.step(action) - - 
logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "step_num": step_num, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": done, - "info": info, - "screenshot_file": f"step_{step_num}_{action_timestamp}.png" - })) - f.write("\n") - - if done: - logger.info("The episode is done.") - break - - def stop_recording(): - try: - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) - except Exception as e: - print(f"An error occurred while stopping the recording: {e}") - - try: - func_timeout.func_timeout(30, stop_recording) - except func_timeout.exceptions.FunctionTimedOut: - logger.info("Recording timed out.") - - result = env.evaluate() - logger.info("Result: %.2f", result) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "result": result - })) - f.write("\n") - - # env.close() - logger.info("Environment closed.") - - -def main(example_class, example_id, gpt4_model="gpt-4-0125-preview"): - action_space = "pyautogui" - gemini_model = "gemini-pro-vision" - - logger.info("Running example %s/%s", example_class, example_id) - logger.info("Using model %s", gpt4_model) - # logger.info("Using model %s", gemini_model) - - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: - example = json.load(f) - example["snapshot"] = "exp_v5" - - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], max_tokens=1000, - action_space=action_space, exp="a11y_tree") - - # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space, exp="a11y_tree") - - root_trajectory_dir = "exp_trajectory" - - example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "a11y_tree", example_class, gemini_model, example_id) - - os.makedirs(example_trajectory_dir, exist_ok=True) - - run_one_example(example, agent, 15, example_trajectory_dir) - - -if __name__ == '__main__': - os_list = [ - "94d95f96-9699-4208-98ba-3c3119edf9c2", - "bedcedc4-4d72-425e-ad62-21960b11fe0d", - "43c2d64c-bab5-4dcb-a30c-b888321c319a", - "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", - "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", - "f9be0997-4b7c-45c5-b05c-4612b44a6118", - "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", - "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", - "e0df059f-28a6-4169-924f-b9623e7184cc", - "ddc75b62-7311-4af8-bfb3-859558542b36", - "b6781586-6346-41cd-935a-a6b1487918fc", - "3ce045a0-877b-42aa-8d2c-b4a863336ab8", - "a4d98375-215b-4a4d-aee9-3d4370fccc41", - "13584542-872b-42d8-b299-866967b5c3ef", - "23393935-50c7-4a86-aeea-2b78fd089c5c" - ] - - # for example_id in os_list: - # try: - # main("os", example_id, gpt4_model="gpt-3.5-turbo-16k") - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - vlc_list = [ - "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", - "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", 
- "8f080098-ddb1-424c-b438-4e96e5e4786e", - "bba3381f-b5eb-4439-bd9e-80c22218d5a7", - "fba2c100-79e8-42df-ae74-b592418d54f4", - "efcf0d81-0835-4880-b2fd-d866e8bc2294", - "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", - "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", - "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", - "9195653c-f4aa-453d-aa95-787f6ccfaae9", - "d06f0d4d-2cd5-4ede-8de9-598629438c6e", - "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", - "f3977615-2b45-4ac5-8bba-80c17dbe2a37", - "215dfd39-f493-4bc3-a027-8a97d72c61bf" - ] - - chrome_list = [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "06fe7178-4491-4589-810f-2e2bc9502122", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91" - ] - - calc_list = [ - "eb03d19a-b88d-4de4-8a64-ca0ac66f426b", - "0bf05a7d-b28b-44d2-955a-50b41e24012a", - "7a4e4bc8-922c-4c84-865c-25ba34136be1", - "2bd59342-0664-4ccb-ba87-79379096cc08", - "ecb0df7a-4e8d-4a03-b162-053391d3afaf", - "7efeb4b1-3d19-4762-b163-63328d66303b", - "4e6fcf72-daf3-439f-a232-c434ce416af6", - "6054afcb-5bab-4702-90a0-b259b5d3217c", - "abed40dc-063f-4598-8ba5-9fe749c0615d", - "01b269ae-2111-4a07-81fd-3fcd711993b0", - "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", - "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", - "4188d3a4-077d-46b7-9c86-23e1a036f6c1", - "51b11269-2ca8-4b2a-9163-f21758420e78", - "7e429b8d-a3f0-4ed0-9b58-08957d00b127", - "347ef137-7eeb-4c80-a3bb-0951f26a8aff", - "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", - "3aaa4e37-dc91-482e-99af-132a612d40f3", - "37608790-6147-45d0-9f20-1137bb35703d", - "f9584479-3d0d-4c79-affa-9ad7afdd8850", - "d681960f-7bc3-4286-9913-a8812ba3261a", - "21df9241-f8d7-4509-b7f1-37e501a823f7", - "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", - "357ef137-7eeb-4c80-a3bb-0951f26a8aff", - "aa3a8974-2e85-438b-b29e-a64df44deb4b", - "a01fbce3-2793-461f-ab86-43680ccbae25", - "4f07fbe9-70de-4927-a4d5-bb28bc12c52c", - ] - - # for example_id in calc_list: - # main("libreoffice_calc", example_id) - - impress_list = [ - "5d901039-a89c-4bfb-967b-bf66f4df075e", - "550ce7e7-747b-495f-b122-acdc4d0b8e54", - "455d3c66-7dc6-4537-a39a-36d3e9119df7", - "af23762e-2bfd-4a1d-aada-20fa8de9ce07", - "c59742c0-4323-4b9d-8a02-723c251deaa0", - "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", - "9ec204e4-f0a3-42f8-8458-b772a6797cab", - "0f84bef9-9790-432e-92b7-eece357603fb", - "ce88f674-ab7a-43da-9201-468d38539e4a", - "3b27600c-3668-4abd-8f84-7bcdebbccbdb", - "a097acff-6266-4291-9fbd-137af7ecd439", - "bf4e9888-f10f-47af-8dba-76413038b73c", - "21760ecb-8f62-40d2-8d85-0cee5725cb72" - ] - # for example_id in impress_list: - # main("libreoffice_impress", example_id) - - thunderbird_list = [ - # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "12086550-11c0-466b-b367-1d9e75b3910e", - "06fe7178-4491-4589-810f-2e2bc9502122", - "6766f2b8-8a72-417f-a9e5-56fcaa735837", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "d088f539-cab4-4f9a-ac92-9999fc3a656e", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "030eeff7-b492-4218-b312-701ec99ee0cc", - "94760984-3ff5-41ee-8347-cf1af709fea0", - "99146c54-4f37-4ab8-9327-5f3291665e1e", - "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" - ] - # for example_id 
in thunderbird_list: - # main("thunderbird", example_id) - - gimp_list = [ - "7a4deb26-d57d-4ea9-9a73-630f66a7b568", - "554785e9-4523-4e7a-b8e1-8016f565f56a", - "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", - "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", - "d52d6308-ec58-42b7-a2c9-de80e4837b2b", - "2a729ded-3296-423d-aec4-7dd55ed5fbb3", - "b148e375-fe0b-4bec-90e7-38632b0d73c2", - "a746add2-cab0-4740-ac36-c3769d9bfb46", - "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", - "d16c99dc-2a1e-46f2-b350-d97c86c85c15", - "06ca5602-62ca-47f6-ad4f-da151cde54cc", - "e2dd0213-26db-4349-abe5-d5667bfd725c", - "f723c744-e62c-4ae6-98d1-750d3cd7d79d", - "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", - "7767eef2-56a3-4cea-8c9f-48c070c7d65b", - "734d6579-c07d-47a8-9ae2-13339795476b" - ] - - # for example_id in gimp_list: - # try: - # main("gimp", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - vs_code_list = [ - "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - "eabc805a-bfcf-4460-b250-ac92135819f6", - "982d12a5-beab-424f-8d38-d2a48429e511", - "4e60007a-f5be-4bfc-9723-c39affa0a6d3", - "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", - "9439a27b-18ae-42d8-9778-5f68f891805e", - "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", - "930fdb3b-11a8-46fe-9bac-577332e2640e", - "276cc624-87ea-4f08-ab93-f770e3790175", - "9d425400-e9b2-4424-9a4b-d4c7abac4140" - ] - - # for example_id in vs_code_list: - # try: - # main("vs_code", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - from tqdm import tqdm - - # for example_id in tqdm(vlc_list): - # try: - # main("vlc", example_id, gpt4_model="gpt-3.5-turbo-16k") - # except Exception as e: - # print(f"An error occurred while running the example: {e}") - # continue - - chrome_list = [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "06fe7178-4491-4589-810f-2e2bc9502122", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91" - ] - # for example_id in tqdm(chrome_list): - # try: - # main("chrome", example_id, gpt4_model="gpt-3.5-turbo-16k") - # except Exception as e: - # print(f"An error occurred while running the example: {e}") - # continue - - vs_code_list = [ - # "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - # "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - # "eabc805a-bfcf-4460-b250-ac92135819f6", - # "982d12a5-beab-424f-8d38-d2a48429e511", - # "4e60007a-f5be-4bfc-9723-c39affa0a6d3", - # "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", - # "9439a27b-18ae-42d8-9778-5f68f891805e", - # "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", - # "930fdb3b-11a8-46fe-9bac-577332e2640e", - # "276cc624-87ea-4f08-ab93-f770e3790175", - # "9d425400-e9b2-4424-9a4b-d4c7abac4140" - ] - - for example_id in tqdm(vs_code_list): - try: - main("vs_code", example_id, gpt4_model="gpt-3.5-turbo-16k") - except Exception as e: - print(f"An error occurred while running the example: {e}") - continue - - thunderbird_list = [ - # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - # "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "12086550-11c0-466b-b367-1d9e75b3910e", - "06fe7178-4491-4589-810f-2e2bc9502122", - "6766f2b8-8a72-417f-a9e5-56fcaa735837", - "e1e75309-3ddb-4d09-92ec-de869c928143", - 
"3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "d088f539-cab4-4f9a-ac92-9999fc3a656e", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "030eeff7-b492-4218-b312-701ec99ee0cc", - "94760984-3ff5-41ee-8347-cf1af709fea0", - "99146c54-4f37-4ab8-9327-5f3291665e1e", - "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" - ] - - # for example_id in tqdm(thunderbird_list): - # try: - # main("thunderbird", example_id, gpt4_model="gpt-3.5-turbo-16k") - # except Exception as e: - # print(f"An error occurred while running the example: {e}") - # continue - - multiple_list = [ - # "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", - # "897e3b53-5d4d-444b-85cb-2cdc8a97d903", - "2fe4b718-3bd7-46ec-bdce-b184f5653624", - "3680a5ee-6870-426a-a997-eba929a0d25c", - # "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", - # "b52b40a5-ad70-4c53-b5b0-5650a8387052", - # "46407397-a7d5-4c6b-92c6-dbe038b1457b", - # "2b9493d7-49b8-493a-a71b-56cd1f4d6908", - # "51f5801c-18b3-4f25-b0c3-02f85507a078", - "58565672-7bfe-48ab-b828-db349231de6b", - # "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", - # "510f64c8-9bcc-4be1-8d30-638705850618", - # "937087b6-f668-4ba6-9110-60682ee33441", - # "ee9a3c83-f437-4879-8918-be5efbb9fac7", - # "3680a5ee-6870-426a-a997-eba929a0d25c", - # "e135df7c-7687-4ac0-a5f0-76b74438b53e", - "ee9a3c83-f437-4879-8918-be5efbb9fac7", - # "58565672-7bfe-48ab-b828-db349231de6b", - # "2fe4b718-3bd7-46ec-bdce-b184f5653624" - ] - - for example_id in multiple_list: - try: - main("multi_apps", example_id, gpt4_model="gpt-3.5-turbo-16k") - except Exception as e: - logger.error("An error occurred while running the example: %s", e) - continue - diff --git a/experiment_screenshot.py b/experiment_screenshot.py deleted file mode 100644 index 4a4ccde..0000000 --- a/experiment_screenshot.py +++ /dev/null @@ -1,306 +0,0 @@ -import datetime -import json -import logging -import os -import sys -import time -import func_timeout -from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# from mm_agents.gemini_pro_agent import GeminiPro_Agent - -# Logger Configs {{{ # -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") -stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") - -file_handler.setLevel(logging.INFO) -debug_handler.setLevel(logging.DEBUG) -stdout_handler.setLevel(logging.INFO) -sdebug_handler.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") -file_handler.setFormatter(formatter) -debug_handler.setFormatter(formatter) -stdout_handler.setFormatter(formatter) -sdebug_handler.setFormatter(formatter) - -stdout_handler.addFilter(logging.Filter("desktopenv")) -sdebug_handler.addFilter(logging.Filter("desktopenv")) - -logger.addHandler(file_handler) -logger.addHandler(debug_handler) -logger.addHandler(stdout_handler) -logger.addHandler(sdebug_handler) -# }}} Logger Configs # - -logger = logging.getLogger("desktopenv.experiment") - -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual 
Machines\Ubuntu\Ubuntu.vmx" - - -def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): - trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") - env = DesktopEnv( - path_to_vm=PATH_TO_VM, - action_space=agent.action_space, - task_config=example - ) - # reset the environment to certain snapshot - observation = env.reset() - done = False - step_num = 0 - - if recording: - # send a request to the server to start recording - env.controller.start_recording() - - while not done and step_num < max_steps: - actions = agent.predict(observation) - step_num += 1 - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_num, action) - - observation, reward, done, info = env.step(action) - - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "step_num": step_num, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": done, - "info": info, - "screenshot_file": f"step_{step_num}_{action_timestamp}.png" - })) - f.write("\n") - - if done: - logger.info("The episode is done.") - break - - def stop_recording(): - try: - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) - except Exception as e: - print(f"An error occurred while stopping the recording: {e}") - - try: - func_timeout.func_timeout(30, stop_recording) - except func_timeout.exceptions.FunctionTimedOut: - logger.info("Recording timed out.") - - result = env.evaluate() - logger.info("Result: %.2f", result) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "result": result - })) - f.write("\n") - - # env.close() - logger.info("Environment closed.") - - -def main(example_class, example_id, gpt4_model = "gpt-4-vision-preview"): - action_space = "pyautogui" - gemini_model = "gemini-pro-vision" - - logger.info("Running example %s/%s", example_class, example_id) - logger.info("Using model %s", gpt4_model) - # logger.info("Using model %s", gemini_model) - - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: - example = json.load(f) - example["snapshot"] = "exp_v5" - - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space, - exp="screenshot") - # - # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space, exp="screenshot") - - root_trajectory_dir = "exp_trajectory" - - example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "screenshot", example_class, gemini_model, example_id) - - os.makedirs(example_trajectory_dir, exist_ok=True) - - run_one_example(example, agent, 15, example_trajectory_dir) - - -if __name__ == '__main__': - chrome_list = [ - # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - # 
"7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - # "06fe7178-4491-4589-810f-2e2bc9502122", - # "e1e75309-3ddb-4d09-92ec-de869c928143", - # "35253b65-1c19-4304-8aa4-6884b8218fc0", - # "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - # "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - # "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - # "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91" - ] - calc_list = [ - "a9f325aa-8c05-4e4f-8341-9e4358565f4f", - "ecb0df7a-4e8d-4a03-b162-053391d3afaf", - "7efeb4b1-3d19-4762-b163-63328d66303b", - "4e6fcf72-daf3-439f-a232-c434ce416af6", - "6054afcb-5bab-4702-90a0-b259b5d3217c", - "abed40dc-063f-4598-8ba5-9fe749c0615d", - "01b269ae-2111-4a07-81fd-3fcd711993b0", - "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", - "af2b02f7-acee-4be4-8b66-499fab394915", - "da1d63b8-fa12-417b-ba18-f748e5f770f3", - "636380ea-d5f6-4474-b6ca-b2ed578a20f1", - "5ba77536-05c5-4aae-a9ff-6e298d094c3e", - "4bc4eaf4-ca5e-4db2-8138-8d4e65af7c0b", - "672a1b02-c62f-4ae2-acf0-37f5fb3052b0", - "648fe544-16ba-44af-a587-12ccbe280ea6", - "8985d1e4-5b99-4711-add4-88949ebb2308", - "9e606842-2e27-43bf-b1d1-b43289c9589b", - "fcb6e45b-25c4-4087-9483-03d714f473a9", - "68c0c5b7-96f3-4e87-92a7-6c1b967fd2d2", - "fff629ea-046e-4793-8eec-1a5a15c3eb35", - "5c9a206c-bb00-4fb6-bb46-ee675c187df5", - "e975ae74-79bd-4672-8d1c-dc841a85781d", - "34a6938a-58da-4897-8639-9b90d6db5391", - "b5a22759-b4eb-4bf2-aeed-ad14e8615f19", - "2f9913a1-51ed-4db6-bfe0-7e1c95b3139e", - "2558031e-401d-4579-8e00-3ecf540fb492", - "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", - "4188d3a4-077d-46b7-9c86-23e1a036f6c1", - "51b11269-2ca8-4b2a-9163-f21758420e78", - "7e429b8d-a3f0-4ed0-9b58-08957d00b127", - "347ef137-7eeb-4c80-a3bb-0951f26a8aff", - "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", - "3aaa4e37-dc91-482e-99af-132a612d40f3", - "37608790-6147-45d0-9f20-1137bb35703d", - "f9584479-3d0d-4c79-affa-9ad7afdd8850", - "d681960f-7bc3-4286-9913-a8812ba3261a", - "21df9241-f8d7-4509-b7f1-37e501a823f7", - "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", - "357ef137-7eeb-4c80-a3bb-0951f26a8aff", - "aa3a8974-2e85-438b-b29e-a64df44deb4b", - "a01fbce3-2793-461f-ab86-43680ccbae25", - "4f07fbe9-70de-4927-a4d5-bb28bc12c52c" -] - # for example_id in calc_list: - # main("libreoffice_calc", example_id) - - impress_list = [ - # "5d901039-a89c-4bfb-967b-bf66f4df075e", - # "550ce7e7-747b-495f-b122-acdc4d0b8e54", - # "455d3c66-7dc6-4537-a39a-36d3e9119df7", - # "af23762e-2bfd-4a1d-aada-20fa8de9ce07", - # "c59742c0-4323-4b9d-8a02-723c251deaa0", - # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", - # "9ec204e4-f0a3-42f8-8458-b772a6797cab", - # "0f84bef9-9790-432e-92b7-eece357603fb", - # "ce88f674-ab7a-43da-9201-468d38539e4a", - # "3b27600c-3668-4abd-8f84-7bcdebbccbdb", - # "a097acff-6266-4291-9fbd-137af7ecd439", - # "bf4e9888-f10f-47af-8dba-76413038b73c", - "21760ecb-8f62-40d2-8d85-0cee5725cb72" - ] - # for example_id in impress_list: - # main("libreoffice_impress", example_id) - - # gimp_list = [ - # "7a4deb26-d57d-4ea9-9a73-630f66a7b568", - # "554785e9-4523-4e7a-b8e1-8016f565f56a", - # "77b8ab4d-994f-43ac-8930-8ca087d7c4b4", - # "f4aec372-4fb0-4df5-a52b-79e0e2a5d6ce", - # "d52d6308-ec58-42b7-a2c9-de80e4837b2b", - # "2a729ded-3296-423d-aec4-7dd55ed5fbb3", - # "b148e375-fe0b-4bec-90e7-38632b0d73c2", - # "a746add2-cab0-4740-ac36-c3769d9bfb46", - # "7b7617bd-57cc-468e-9c91-40c4ec2bcb3d", - # "d16c99dc-2a1e-46f2-b350-d97c86c85c15", - # "06ca5602-62ca-47f6-ad4f-da151cde54cc", - # "e2dd0213-26db-4349-abe5-d5667bfd725c", - # 
"f723c744-e62c-4ae6-98d1-750d3cd7d79d", - # "72f83cdc-bf76-4531-9a1b-eb893a13f8aa", - # "7767eef2-56a3-4cea-8c9f-48c070c7d65b", - # "734d6579-c07d-47a8-9ae2-13339795476b" - # ] - # - # for example_id in gimp_list: - # try: - # main("gimp", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - # - - vs_code_list = [ - # "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - # "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - # "eabc805a-bfcf-4460-b250-ac92135819f6", - # "982d12a5-beab-424f-8d38-d2a48429e511", - # "4e60007a-f5be-4bfc-9723-c39affa0a6d3", - # "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", - # "9439a27b-18ae-42d8-9778-5f68f891805e", - "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", - "930fdb3b-11a8-46fe-9bac-577332e2640e", - "276cc624-87ea-4f08-ab93-f770e3790175", - "9d425400-e9b2-4424-9a4b-d4c7abac4140" - ] - - # for example_id in vs_code_list: - # try: - # main("vs_code", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - # multiple_list = [ - # "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", - # "897e3b53-5d4d-444b-85cb-2cdc8a97d903", - # "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", - # "b52b40a5-ad70-4c53-b5b0-5650a8387052", - # "46407397-a7d5-4c6b-92c6-dbe038b1457b", - # "2b9493d7-49b8-493a-a71b-56cd1f4d6908", - # "51f5801c-18b3-4f25-b0c3-02f85507a078", - # "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", - # "510f64c8-9bcc-4be1-8d30-638705850618", - # "937087b6-f668-4ba6-9110-60682ee33441", - # "ee9a3c83-f437-4879-8918-be5efbb9fac7", - # "3680a5ee-6870-426a-a997-eba929a0d25c", - # "e135df7c-7687-4ac0-a5f0-76b74438b53e", - # "58565672-7bfe-48ab-b828-db349231de6b", - # "2fe4b718-3bd7-46ec-bdce-b184f5653624" - # ] - # - # for example_id in multiple_list: - # try: - # main("multi_apps", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - diff --git a/experiment_screenshot_a11y_tree.py b/experiment_screenshot_a11y_tree.py deleted file mode 100644 index ffa09f1..0000000 --- a/experiment_screenshot_a11y_tree.py +++ /dev/null @@ -1,361 +0,0 @@ -import datetime -import json -import logging -import os -import sys - -import func_timeout - -from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# Logger Configs {{{ # -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") -stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") - -file_handler.setLevel(logging.INFO) -debug_handler.setLevel(logging.DEBUG) -stdout_handler.setLevel(logging.INFO) -sdebug_handler.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") -file_handler.setFormatter(formatter) -debug_handler.setFormatter(formatter) -stdout_handler.setFormatter(formatter) -sdebug_handler.setFormatter(formatter) - -stdout_handler.addFilter(logging.Filter("desktopenv")) -sdebug_handler.addFilter(logging.Filter("desktopenv")) - -logger.addHandler(file_handler) 
-logger.addHandler(debug_handler) -logger.addHandler(stdout_handler) -logger.addHandler(sdebug_handler) -# }}} Logger Configs # - -logger = logging.getLogger("desktopenv.experiment") - -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu2\Ubuntu2.vmx" - - -# PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx" - -def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): - trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") - env = DesktopEnv( - path_to_vm=PATH_TO_VM, - action_space=agent.action_space, - task_config=example - ) - # reset the environment to certain snapshot - observation = env.reset() - done = False - step_num = 0 - - if recording: - # send a request to the server to start recording - env.controller.start_recording() - - while not done and step_num < max_steps: - actions = agent.predict(observation) - step_num += 1 - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_num, action) - - observation, reward, done, info = env.step(action) - - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "step_num": step_num, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": done, - "info": info, - "screenshot_file": f"step_{step_num}_{action_timestamp}.png" - })) - f.write("\n") - - if done: - logger.info("The episode is done.") - break - - def stop_recording(): - try: - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) - except Exception as e: - print(f"An error occurred while stopping the recording: {e}") - - try: - func_timeout.func_timeout(30, stop_recording) - except func_timeout.exceptions.FunctionTimedOut: - logger.info("Recording timed out.") - - result = env.evaluate() - logger.info("Result: %.2f", result) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "result": result - })) - f.write("\n") - - # env.close() - logger.info("Environment closed.") - - -def main(example_class, example_id, gpt4_model="gpt-4-vision-preview"): - action_space = "pyautogui" - # example_class = "libreoffice_calc" - # example_id = "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3" - # example_id = "01b269ae-2111-4a07-81fd-3fcd711993b0" - gemini_model = "gemini-pro-vision" - - logger.info("Running example %s/%s", example_class, example_id) - logger.info("Using model %s", gpt4_model) - # logger.info("Using model %s", gemini_model) - - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: - example = json.load(f) - example["snapshot"] = "exp_v5" - # example["snapshot"] = "exp_setup4" - # example["snapshot"] = "Snapshot 30" - - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], - action_space=action_space, exp="both") - - # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], 
action_space=action_space, exp="both") - - root_trajectory_dir = "exp_trajectory" - - example_trajectory_dir = os.path.join(root_trajectory_dir, "both", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "both", example_class, gemini_model, example_id) - - os.makedirs(example_trajectory_dir, exist_ok=True) - - run_one_example(example, agent, 15, example_trajectory_dir) - - -if __name__ == '__main__': - os_list = [ - "94d95f96-9699-4208-98ba-3c3119edf9c2", - "bedcedc4-4d72-425e-ad62-21960b11fe0d", - "43c2d64c-bab5-4dcb-a30c-b888321c319a", - "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", - "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", - "f9be0997-4b7c-45c5-b05c-4612b44a6118", - "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", - "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", - "e0df059f-28a6-4169-924f-b9623e7184cc", - "ddc75b62-7311-4af8-bfb3-859558542b36", - "b6781586-6346-41cd-935a-a6b1487918fc", - "3ce045a0-877b-42aa-8d2c-b4a863336ab8", - "a4d98375-215b-4a4d-aee9-3d4370fccc41", - "13584542-872b-42d8-b299-866967b5c3ef", - "23393935-50c7-4a86-aeea-2b78fd089c5c" - ] - - # for example_id in os_list: - # try: - # main("os", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - calc_list = [ - "a9f325aa-8c05-4e4f-8341-9e4358565f4f", - "ecb0df7a-4e8d-4a03-b162-053391d3afaf", - "7efeb4b1-3d19-4762-b163-63328d66303b", - "4e6fcf72-daf3-439f-a232-c434ce416af6", - "6054afcb-5bab-4702-90a0-b259b5d3217c", - "abed40dc-063f-4598-8ba5-9fe749c0615d", - "01b269ae-2111-4a07-81fd-3fcd711993b0", - "8b1ce5f2-59d2-4dcc-b0b0-666a714b9a14", - "af2b02f7-acee-4be4-8b66-499fab394915", - "da1d63b8-fa12-417b-ba18-f748e5f770f3", - "636380ea-d5f6-4474-b6ca-b2ed578a20f1", - "5ba77536-05c5-4aae-a9ff-6e298d094c3e", - "4bc4eaf4-ca5e-4db2-8138-8d4e65af7c0b", - "672a1b02-c62f-4ae2-acf0-37f5fb3052b0", - "648fe544-16ba-44af-a587-12ccbe280ea6", - "8985d1e4-5b99-4711-add4-88949ebb2308", - "9e606842-2e27-43bf-b1d1-b43289c9589b", - "fcb6e45b-25c4-4087-9483-03d714f473a9", - "68c0c5b7-96f3-4e87-92a7-6c1b967fd2d2", - "fff629ea-046e-4793-8eec-1a5a15c3eb35", - "5c9a206c-bb00-4fb6-bb46-ee675c187df5", - "e975ae74-79bd-4672-8d1c-dc841a85781d", - "34a6938a-58da-4897-8639-9b90d6db5391", - "b5a22759-b4eb-4bf2-aeed-ad14e8615f19", - "2f9913a1-51ed-4db6-bfe0-7e1c95b3139e", - "2558031e-401d-4579-8e00-3ecf540fb492", - "0cecd4f3-74de-457b-ba94-29ad6b5dafb6", - "4188d3a4-077d-46b7-9c86-23e1a036f6c1", - "51b11269-2ca8-4b2a-9163-f21758420e78", - "7e429b8d-a3f0-4ed0-9b58-08957d00b127", - "347ef137-7eeb-4c80-a3bb-0951f26a8aff", - "6e99a1ad-07d2-4b66-a1ce-ece6d99c20a5", - "3aaa4e37-dc91-482e-99af-132a612d40f3", - "37608790-6147-45d0-9f20-1137bb35703d", - "f9584479-3d0d-4c79-affa-9ad7afdd8850", - "d681960f-7bc3-4286-9913-a8812ba3261a", - "21df9241-f8d7-4509-b7f1-37e501a823f7", - "1334ca3e-f9e3-4db8-9ca7-b4c653be7d17", - "357ef137-7eeb-4c80-a3bb-0951f26a8aff", - "aa3a8974-2e85-438b-b29e-a64df44deb4b", - "a01fbce3-2793-461f-ab86-43680ccbae25", - "4f07fbe9-70de-4927-a4d5-bb28bc12c52c" -] - - # for example_id in calc_list: - # try: - # main("libreoffice_calc", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - impress_list = [ - "5d901039-a89c-4bfb-967b-bf66f4df075e", - "550ce7e7-747b-495f-b122-acdc4d0b8e54", - "455d3c66-7dc6-4537-a39a-36d3e9119df7", - "af23762e-2bfd-4a1d-aada-20fa8de9ce07", - "c59742c0-4323-4b9d-8a02-723c251deaa0", - "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", - 
"9ec204e4-f0a3-42f8-8458-b772a6797cab", - "0f84bef9-9790-432e-92b7-eece357603fb", - "ce88f674-ab7a-43da-9201-468d38539e4a", - "3b27600c-3668-4abd-8f84-7bcdebbccbdb", - "a097acff-6266-4291-9fbd-137af7ecd439", - "bf4e9888-f10f-47af-8dba-76413038b73c", - "21760ecb-8f62-40d2-8d85-0cee5725cb72" - ] - - # for example_id in impress_list: - # try: - # main("libreoffice_impress", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - vs_code_list = [ - "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - "eabc805a-bfcf-4460-b250-ac92135819f6", - "982d12a5-beab-424f-8d38-d2a48429e511", - "4e60007a-f5be-4bfc-9723-c39affa0a6d3", - "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", - "9439a27b-18ae-42d8-9778-5f68f891805e", - "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", - "930fdb3b-11a8-46fe-9bac-577332e2640e", - "276cc624-87ea-4f08-ab93-f770e3790175", - "9d425400-e9b2-4424-9a4b-d4c7abac4140" - ] - - # for example_id in vs_code_list: - # try: - # main("vs_code", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - multiple_list = [ - "f8cfa149-d1c1-4215-8dac-4a0932bad3c2", - "897e3b53-5d4d-444b-85cb-2cdc8a97d903", - "4e9f0faf-2ecc-4ae8-a804-28c9a75d1ddc", - "b52b40a5-ad70-4c53-b5b0-5650a8387052", - "46407397-a7d5-4c6b-92c6-dbe038b1457b", - "2b9493d7-49b8-493a-a71b-56cd1f4d6908", - "51f5801c-18b3-4f25-b0c3-02f85507a078", - "2c9fc0de-3ee7-45e1-a5df-c86206ad78b5", - "510f64c8-9bcc-4be1-8d30-638705850618", - "937087b6-f668-4ba6-9110-60682ee33441", - "ee9a3c83-f437-4879-8918-be5efbb9fac7", - "3680a5ee-6870-426a-a997-eba929a0d25c", - "e135df7c-7687-4ac0-a5f0-76b74438b53e", - "58565672-7bfe-48ab-b828-db349231de6b", - "2fe4b718-3bd7-46ec-bdce-b184f5653624" - ] - - # for example_id in multiple_list: - # try: - # main("multi_apps", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - chrome_list = [ - # "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "06fe7178-4491-4589-810f-2e2bc9502122", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91" - ] - - # for example_id in chrome_list: - # try: - # main("chrome", example_id) - # except Exception as e: - # logger.error("An error occurred while running the example: %s", e) - # continue - - - writer_list = [ - "6ada715d-3aae-4a32-a6a7-429b2e43fb93", - "ecc2413d-8a48-416e-a3a2-d30106ca36cb", - "0e47de2a-32e0-456c-a366-8c607ef7a9d2", - "4bcb1253-a636-4df4-8cb0-a35c04dfef31", - "0810415c-bde4-4443-9047-d5f70165a697", - "e528b65e-1107-4b8c-8988-490e4fece599", - "66399b0d-8fda-4618-95c4-bfc6191617e9", - "936321ce-5236-426a-9a20-e0e3c5dc536f", - "3ef2b351-8a84-4ff2-8724-d86eae9b842e", - "0b17a146-2934-46c7-8727-73ff6b6483e8", - "0e763496-b6bb-4508-a427-fad0b6c3e195", - "f178a4a9-d090-4b56-bc4c-4b72a61a035d", - "adf5e2c3-64c7-4644-b7b6-d2f0167927e7", - "0a0faba3-5580-44df-965d-f562a99b291c", - "e246f6d8-78d7-44ac-b668-fcf47946cb50", - "8472fece-c7dd-4241-8d65-9b3cd1a0b568", - "88fe4b2d-3040-4c70-9a70-546a47764b48", - "d53ff5ee-3b1a-431e-b2be-30ed2673079b", - "72b810ef-4156-4d09-8f08-a0cf57e7cefe", - 
"6f81754e-285d-4ce0-b59e-af7edb02d108", - "b21acd93-60fd-4127-8a43-2f5178f4a830" - ] - - for example_id in writer_list: - try: - main("libreoffice_writer", example_id) - except Exception as e: - logger.error("An error occurred while running the example: %s", e) - continue - - diff --git a/experiment_screenshot_seeact.py b/experiment_screenshot_seeact.py deleted file mode 100644 index 6c3a472..0000000 --- a/experiment_screenshot_seeact.py +++ /dev/null @@ -1,155 +0,0 @@ -import ctypes -import datetime -import json -import logging -import os -import sys -import func_timeout - -from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# Logger Configs {{{ # -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") -stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") - -file_handler.setLevel(logging.INFO) -debug_handler.setLevel(logging.DEBUG) -stdout_handler.setLevel(logging.INFO) -sdebug_handler.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") -file_handler.setFormatter(formatter) -debug_handler.setFormatter(formatter) -stdout_handler.setFormatter(formatter) -sdebug_handler.setFormatter(formatter) - -stdout_handler.addFilter(logging.Filter("desktopenv")) -sdebug_handler.addFilter(logging.Filter("desktopenv")) - -logger.addHandler(file_handler) -logger.addHandler(debug_handler) -logger.addHandler(stdout_handler) -logger.addHandler(sdebug_handler) -# }}} Logger Configs # - -logger = logging.getLogger("desktopenv.experiment") - -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" - - -def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): - trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") - env = DesktopEnv( - path_to_vm=PATH_TO_VM, - action_space=agent.action_space, - task_config=example - ) - # reset the environment to certain snapshot - observation = env.reset() - done = False - step_num = 0 - - if recording: - # send a request to the server to start recording - env.controller.start_recording() - - while not done and step_num < max_steps: - actions = agent.predict(observation) - step_num += 1 - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_num, action) - - observation, reward, done, info = env.step(action) - - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "step_num": step_num, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": 
done, - "info": info, - "screenshot_file": f"step_{step_num}_{action_timestamp}.png" - })) - f.write("\n") - - if done: - logger.info("The episode is done.") - break - - def stop_recording(): - try: - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) - except Exception as e: - print(f"An error occurred while stopping the recording: {e}") - - try: - func_timeout.func_timeout(30, stop_recording) - except func_timeout.exceptions.FunctionTimedOut: - logger.info("Recording timed out.") - - result = env.evaluate() - logger.info("Result: %.2f", result) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "result": result - })) - f.write("\n") - - # env.close() - logger.info("Environment closed.") - - -def main(example_class, example_id): - action_space = "pyautogui" - gpt4_model = "gpt-4-vision-preview" - gemini_model = "gemini-pro-vision" - - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: - example = json.load(f) - example["snapshot"] = "exp_v5" - - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], - action_space=action_space, exp="seeact") - - # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) - - root_trajectory_dir = "exp_trajectory" - - example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "seeact", example_class, gemini_model, example_id) - - os.makedirs(example_trajectory_dir, exist_ok=True) - - run_one_example(example, agent, 15, example_trajectory_dir) - - -if __name__ == '__main__': - xx_list = [ - ] - for example_id in xx_list: - main("xx", example_id) diff --git a/experiment_screenshot_som.py b/experiment_screenshot_som.py deleted file mode 100644 index 904435b..0000000 --- a/experiment_screenshot_som.py +++ /dev/null @@ -1,261 +0,0 @@ -#import ctypes -import datetime -import json -import logging -import os -import sys -import func_timeout - -from desktop_env.envs.desktop_env import DesktopEnv -from mm_agents.gpt_4v_agent import GPT4v_Agent - -# Logger Configs {{{ # -logger = logging.getLogger() -logger.setLevel(logging.DEBUG) - -datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - -file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") -debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") -stdout_handler = logging.StreamHandler(sys.stdout) -sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") - -file_handler.setLevel(logging.INFO) -debug_handler.setLevel(logging.DEBUG) -stdout_handler.setLevel(logging.INFO) -sdebug_handler.setLevel(logging.DEBUG) - -formatter = logging.Formatter( - fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") -file_handler.setFormatter(formatter) -debug_handler.setFormatter(formatter) -stdout_handler.setFormatter(formatter) -sdebug_handler.setFormatter(formatter) - -stdout_handler.addFilter(logging.Filter("desktopenv")) -sdebug_handler.addFilter(logging.Filter("desktopenv")) - -logger.addHandler(file_handler) -logger.addHandler(debug_handler) 
-logger.addHandler(stdout_handler) -logger.addHandler(sdebug_handler) -# }}} Logger Configs # - -logger = logging.getLogger("desktopenv.experiment") - -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" - - -def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): - trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") - env = DesktopEnv( - path_to_vm=PATH_TO_VM, - action_space=agent.action_space, - task_config=example - ) - # reset the environment to certain snapshot - observation = env.reset() - done = False - step_num = 0 - - if recording: - # send a request to the server to start recording - env.controller.start_recording() - - while not done and step_num < max_steps: - actions = agent.predict(observation) - step_num += 1 - for action in actions: - # Capture the timestamp before executing the action - action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") - logger.info("Step %d: %s", step_num, action) - - observation, reward, done, info = env.step(action) - - logger.info("Reward: %.2f", reward) - logger.info("Done: %s", done) - logger.info("Info: %s", info) - - # Save screenshot and trajectory information - with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: - with open(observation['screenshot'], "rb") as __f: - screenshot = __f.read() - _f.write(screenshot) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "step_num": step_num, - "action_timestamp": action_timestamp, - "action": action, - "reward": reward, - "done": done, - "info": info, - "screenshot_file": f"step_{step_num}_{action_timestamp}.png" - })) - f.write("\n") - - if done: - logger.info("The episode is done.") - break - - def stop_recording(): - try: - env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) - except Exception as e: - print(f"An error occurred while stopping the recording: {e}") - - try: - func_timeout.func_timeout(30, stop_recording) - except func_timeout.exceptions.FunctionTimedOut: - logger.info("Recording timed out.") - - result = env.evaluate() - logger.info("Result: %.2f", result) - - with open(trajectory_recording_path, "a") as f: - f.write(json.dumps({ - "result": result - })) - f.write("\n") - - # env.close() - logger.info("Environment closed.") - - -def main(example_class, example_id): - action_space = "pyautogui" - gpt4_model = "gpt-4-vision-preview" - gemini_model = "gemini-pro-vision" - - with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: - example = json.load(f) - example["snapshot"] = "exp_v5" - - logger.info("TASK: %s/%s", example_class, example_id) - - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, max_tokens=1000, instruction=example['instruction'], - action_space=action_space, exp="som") - - # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) - - root_trajectory_dir = "exp_trajectory" - - example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gpt4_model, example_id) - # example_trajectory_dir = os.path.join(root_trajectory_dir, "som", example_class, gemini_model, example_id) - - os.makedirs(example_trajectory_dir, exist_ok=True) - - run_one_example(example, agent, 15, example_trajectory_dir) - - -if __name__ == 
'__main__': - from tqdm import tqdm - # impress_list = [ - # # "5d901039-a89c-4bfb-967b-bf66f4df075e", - # "550ce7e7-747b-495f-b122-acdc4d0b8e54", - # "455d3c66-7dc6-4537-a39a-36d3e9119df7", - # "af23762e-2bfd-4a1d-aada-20fa8de9ce07", - # "c59742c0-4323-4b9d-8a02-723c251deaa0", - # "ef9d12bd-bcee-4ba0-a40e-918400f43ddf", - # "9ec204e4-f0a3-42f8-8458-b772a6797cab", - # "0f84bef9-9790-432e-92b7-eece357603fb", - # "ce88f674-ab7a-43da-9201-468d38539e4a", - # "3b27600c-3668-4abd-8f84-7bcdebbccbdb", - # "a097acff-6266-4291-9fbd-137af7ecd439", - # "bf4e9888-f10f-47af-8dba-76413038b73c", - # "21760ecb-8f62-40d2-8d85-0cee5725cb72" - # ] - # for example_id in impress_list: - # main("libreoffice_impress", example_id) - - vlc_list = [ - "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", - "8ba5ae7a-5ae5-4eab-9fcc-5dd4fe3abf89", - "8f080098-ddb1-424c-b438-4e96e5e4786e", - "bba3381f-b5eb-4439-bd9e-80c22218d5a7", - "fba2c100-79e8-42df-ae74-b592418d54f4", - "efcf0d81-0835-4880-b2fd-d866e8bc2294", - "8d9fd4e2-6fdb-46b0-b9b9-02f06495c62f", - "aa4b5023-aef6-4ed9-bdc9-705f59ab9ad6", - "386dbd0e-0241-4a0a-b6a2-6704fba26b1c", - "9195653c-f4aa-453d-aa95-787f6ccfaae9", - "d06f0d4d-2cd5-4ede-8de9-598629438c6e", - "a5bbbcd5-b398-4c91-83d4-55e1e31bbb81", - "f3977615-2b45-4ac5-8bba-80c17dbe2a37", - "215dfd39-f493-4bc3-a027-8a97d72c61bf" - ] - - # for example_id in tqdm(vlc_list): - # try: - # main("vlc", example_id) - # except Exception as e: - # print(f"An error occurred while running the example: {e}") - # continue - - chrome_list = [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "06fe7178-4491-4589-810f-2e2bc9502122", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263", - "44ee5668-ecd5-4366-a6ce-c1c9b8d4e938", - "2ae9ba84-3a0d-4d4c-8338-3a1478dc5fe3", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "af630914-714e-4a24-a7bb-f9af687d3b91" - ] - for example_id in tqdm(chrome_list): - try: - main("chrome", example_id) - except Exception as e: - print(f"An error occurred while running the example: {e}") - continue - - vs_code_list = [ - "0ed39f63-6049-43d4-ba4d-5fa2fe04a951", - "53ad5833-3455-407b-bbc6-45b4c79ab8fb", - "eabc805a-bfcf-4460-b250-ac92135819f6", - "982d12a5-beab-424f-8d38-d2a48429e511", - "4e60007a-f5be-4bfc-9723-c39affa0a6d3", - "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", - "9439a27b-18ae-42d8-9778-5f68f891805e", - "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", - "930fdb3b-11a8-46fe-9bac-577332e2640e", - "276cc624-87ea-4f08-ab93-f770e3790175", - "9d425400-e9b2-4424-9a4b-d4c7abac4140" - ] - - for example_id in tqdm(vs_code_list): - try: - main("vs_code", example_id) - except Exception as e: - print(f"An error occurred while running the example: {e}") - continue - - thunderbird_list = [ - "bb5e4c0d-f964-439c-97b6-bdb9747de3f4", - "7b6c7e24-c58a-49fc-a5bb-d57b80e5b4c3", - "12086550-11c0-466b-b367-1d9e75b3910e", - "06fe7178-4491-4589-810f-2e2bc9502122", - "6766f2b8-8a72-417f-a9e5-56fcaa735837", - "e1e75309-3ddb-4d09-92ec-de869c928143", - "3d1682a7-0fb0-49ae-a4dc-a73afd2d06d5", - "35253b65-1c19-4304-8aa4-6884b8218fc0", - "d088f539-cab4-4f9a-ac92-9999fc3a656e", - "2ad9387a-65d8-4e33-ad5b-7580065a27ca", - "480bcfea-d68f-4aaa-a0a9-2589ef319381", - "030eeff7-b492-4218-b312-701ec99ee0cc", - "94760984-3ff5-41ee-8347-cf1af709fea0", - "99146c54-4f37-4ab8-9327-5f3291665e1e", - "c9e7eaf2-b1a1-4efc-a982-721972fa9f02" - ] - - for example_id in tqdm(thunderbird_list): - try: - 
main("thunderbird", example_id) - except Exception as e: - print(f"An error occurred while running the example: {e}") - continue diff --git a/lib_run_single.py b/lib_run_single.py new file mode 100644 index 0000000..82b2dd3 --- /dev/null +++ b/lib_run_single.py @@ -0,0 +1,72 @@ +import datetime +import json +import logging +import os +# import wandb + +from wrapt_timeout_decorator import * + +logger = logging.getLogger("desktopenv.experiment") + +# Open the JSON file +with open("./settings.json", "r") as file: + # Load the JSON data from the file + data = json.load(file) +time_limit = data["time_limit"] + +@timeout(time_limit, use_signals=False) +def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores): + agent.reset() + obs = env.reset(task_config=example) + done = False + step_idx = 0 + env.controller.start_recording() + # str_table = wandb.Table(columns=["Screenshot", "A11T", "Modle Response", "Action", "Action timestamp", "Done"]) + while not done and step_idx < max_steps: + response, actions = agent.predict( + instruction, + obs + ) + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_idx + 1, action) + obs, reward, done, info = env.step(action, args.sleep_after_execution) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + # Save screenshot and trajectory information + with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), + "wb") as _f: + with open(obs['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + # get a11tree and save to wandb + thisrun_a11tree = env.controller.get_accessibility_tree() + # str_table.add_data(wandb.Image(data_or_path=os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), caption=f"step_{step_idx + 1}_{action_timestamp}"), + # thisrun_a11tree, + # response, action, action_timestamp, done) + # run.log({"Reward": reward}) + with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: + f.write(json.dumps({ + "step_num": step_idx + 1, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png" + })) + f.write("\n") + if done: + logger.info("The episode is done.") + break + step_idx += 1 + # run.log({"str_trajectory": str_table}) + result = env.evaluate() + logger.info("Result: %.2f", result) + scores.append(result) + with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f: + f.write(f"{result}\n") + env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4")) + # run.log({"Result": result}) diff --git a/main.py b/main.py index 5e82316..06debec 100644 --- a/main.py +++ b/main.py @@ -47,38 +47,38 @@ def human_agent(): Runs the Gym environment with human input. 
""" parser = argparse.ArgumentParser() - parser.add_argument('-p', '--path', type=str, required=True, help="Path to the virtual machine .vmx file.") - parser.add_argument('-s', '--snapshot', type=str, help="Name of the snapshot to restore.") + parser.add_argument('-p', '--path', type=str, default=r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu3\Ubuntu3.vmx", help="Path to the virtual machine .vmx file.") + parser.add_argument('-s', '--snapshot', type=str, default='init_state', help="Name of the snapshot to restore.") parser.add_argument('-e', '--example', type=str, help="Path to the example json file.") args = parser.parse_args(sys.argv[1:]) example_path = args.example if args.example is not None and os.path.exists(args.example) else \ - 'evaluation_examples/examples/libreoffice_writer/6a33f9b9-0a56-4844-9c3f-96ec3ffb3ba2.json' - with open(example_path, "r") as f: + 'evaluation_examples/examples/multi_apps/5990457f-2adb-467b-a4af-5c857c92d762.json' + with open(example_path, "r", encoding="utf-8") as f: example = json.load(f) - # change to your customized snapshot - if args.snapshot is not None: example["snapshot"] = args.snapshot + if args.snapshot is not None: + example['snapshot'] = args.snapshot assert os.path.exists(args.path), "The specified path to the .vmx file does not exist." env = DesktopEnv( path_to_vm=args.path, - action_space="computer_13", - task_config=example + snapshot_name=args.snapshot, + action_space="computer_13" ) # reset the environment to certain snapshot - observation = env.reset() - logger.info('\x1b[32m[TASK INSTRUCTION]: \x1b[32;3m%s\x1b[0m', example["instruction"]) + observation = env.reset(task_config=example) done = False + logger.info('\x1b[32m[TASK INSTRUCTION]: \x1b[32;3m%s\x1b[0m', example["instruction"]) trajectory = [ - # { - # "action_type": "MOVE_TO", - # "parameters": { - # "x": 754, - # "y": 1057 - # } - # }, - # {"action_type": "CLICK", "parameters": {"button": "right", "num_clicks": 1}} + { + "action_type": "MOVE_TO", # + "parameters": { + "x": 754, + "y": 1057 + } + }, + {"action_type": "CLICK", "parameters": {"button": "right", "num_clicks": 1}} ] for i in range(len(trajectory)): diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 337b402..e37f614 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -26,7 +26,7 @@ def find_leaf_nodes(xlm_file_str): state_ns = "uri:deskat:state.at-spi.gnome.org" component_ns = "uri:deskat:component.at-spi.gnome.org" -def judge_node(node: ET, platform="ubuntu") -> bool: +def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool: keeps: bool = node.tag.startswith("document")\ or node.tag.endswith("item")\ or node.tag.endswith("button")\ @@ -55,23 +55,25 @@ def judge_node(node: ET, platform="ubuntu") -> bool: or platform=="windows"\ and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\ )\ - and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\ - or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\ - or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\ - or node.get("{{{:}}}checkable".format(state_ns), "false")=="true" - )\ - and (node.get("name", "") != "" or node.text is not None and len(node.text)>0) + and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\ + or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\ + or 
node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\ + or node.get("{{{:}}}checkable".format(state_ns), "false")=="true" + )\ + and ( node.get("name", "") != "" or node.text is not None and len(node.text)>0\ + or check_image and node.get("image", "false")=="true" + ) coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)")) sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)")) keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0 return keeps -def filter_nodes(root: ET, platform="ubuntu"): +def filter_nodes(root: ET, platform="ubuntu", check_image=False): filtered_nodes = [] for node in root.iter(): - if judge_node(node, platform): + if judge_node(node, platform, check_image): filtered_nodes.append(node) #print(ET.tostring(node, encoding="unicode")) @@ -155,12 +157,12 @@ def print_nodes_with_indent(nodes, indent=0): if __name__ == '__main__': import json - with open('4.json', 'r', encoding='utf-8') as f: - xml_file_str = json.load(f)["AT"] + with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f: + xml_file_str = f.read() filtered_nodes = filter_nodes(ET.fromstring(xml_file_str)) print(len(filtered_nodes)) - masks = draw_bounding_boxes( filtered_nodes, '4.png' - , '4.a.png' + masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png' + , 'selection_sorted(imaged).ai.png' ) # print(masks) diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/agent.py similarity index 66% rename from mm_agents/gpt_4v_agent.py rename to mm_agents/agent.py index 0c6c63e..ff92673 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/agent.py @@ -5,10 +5,10 @@ import os import re import time import uuid +import xml.etree.ElementTree as ET from http import HTTPStatus from io import BytesIO from typing import Dict, List -import xml.etree.ElementTree as ET import backoff import dashscope @@ -16,20 +16,13 @@ import google.generativeai as genai import openai import requests from PIL import Image -from openai import ( - APIConnectionError, - APIError, - RateLimitError -) +from google.api_core.exceptions import InvalidArgument -from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import filter_nodes, draw_bounding_boxes from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \ SYS_PROMPT_IN_A11Y_OUT_CODE, SYS_PROMPT_IN_A11Y_OUT_ACTION, \ SYS_PROMPT_IN_BOTH_OUT_CODE, SYS_PROMPT_IN_BOTH_OUT_ACTION, \ - SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \ - SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT - -import logging + SYS_PROMPT_IN_SOM_OUT_TAG logger = logging.getLogger("desktopenv.agent") @@ -41,10 +34,10 @@ def encode_image(image_path): def linearize_accessibility_tree(accessibility_tree): - #leaf_nodes = find_leaf_nodes(accessibility_tree) + # leaf_nodes = find_leaf_nodes(accessibility_tree) filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree)) - linearized_accessibility_tree = "tag\tname\ttext\tposition\tsize\n" + linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n" # Linearize the accessibility tree nodes into a table format for node in filtered_nodes: @@ -72,7 +65,8 @@ def tag_screenshot(screenshot, accessibility_tree): uuid_str = str(uuid.uuid4()) os.makedirs("tmp/images", exist_ok=True) tagged_screenshot_file_path = 
os.path.join("tmp/images", uuid_str + ".png") - nodes = filter_nodes(find_leaf_nodes(accessibility_tree)) + # nodes = filter_nodes(find_leaf_nodes(accessibility_tree)) + nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True) # Make tag screenshot marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path) @@ -168,79 +162,66 @@ def parse_code_from_som_string(input_string, masks): return actions -class GPT4v_Agent: +class PromptAgent: def __init__( self, - api_key, - instruction, model="gpt-4-vision-preview", - max_tokens=500, + max_tokens=1500, + top_p=0.9, + temperature=0.5, action_space="computer_13", - exp="screenshot_a11y_tree" - # exp can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som", "seeact"] + observation_type="screenshot_a11y_tree", + # observation_type can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"] + max_trajectory_length=3 ): - - self.instruction = instruction self.model = model self.max_tokens = max_tokens + self.top_p = top_p + self.temperature = temperature self.action_space = action_space - self.exp = exp - self.max_trajectory_length = 3 - - self.headers = { - "Content-Type": "application/json", - "Authorization": f"Bearer {api_key}" - } + self.observation_type = observation_type + self.max_trajectory_length = max_trajectory_length self.thoughts = [] self.actions = [] self.observations = [] - if exp == "screenshot": + if observation_type == "screenshot": if action_space == "computer_13": self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION elif action_space == "pyautogui": self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_CODE else: raise ValueError("Invalid action space: " + action_space) - elif exp == "a11y_tree": + elif observation_type == "a11y_tree": if action_space == "computer_13": self.system_message = SYS_PROMPT_IN_A11Y_OUT_ACTION elif action_space == "pyautogui": self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE else: raise ValueError("Invalid action space: " + action_space) - elif exp == "both": + elif observation_type == "screenshot_a11y_tree": if action_space == "computer_13": self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION elif action_space == "pyautogui": self.system_message = SYS_PROMPT_IN_BOTH_OUT_CODE else: raise ValueError("Invalid action space: " + action_space) - elif exp == "som": + elif observation_type == "som": if action_space == "computer_13": raise ValueError("Invalid action space: " + action_space) elif action_space == "pyautogui": - self.system_message = SYS_PROMPT_IN_SOM_A11Y_OUT_TAG - else: - raise ValueError("Invalid action space: " + action_space) - elif exp == "seeact": - if action_space == "computer_13": - raise ValueError("Invalid action space: " + action_space) - elif action_space == "pyautogui": - self.system_message = SYS_PROMPT_SEEACT + self.system_message = SYS_PROMPT_IN_SOM_OUT_TAG else: raise ValueError("Invalid action space: " + action_space) else: - raise ValueError("Invalid experiment type: " + exp) + raise ValueError("Invalid experiment type: " + observation_type) - self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format( - self.instruction) - - def predict(self, obs: Dict) -> List: + def predict(self, instruction: str, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. 
""" + system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction) # Prepare the payload for the API call messages = [] @@ -251,7 +232,7 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": self.system_message + "text": system_message }, ] }) @@ -272,7 +253,7 @@ class GPT4v_Agent: for previous_obs, previous_action, previous_thought in zip(_observations, _actions, _thoughts): # {{{1 - if self.exp == "both": + if self.observation_type == "screenshot_a11y_tree": _screenshot = previous_obs["screenshot"] _linearized_accessibility_tree = previous_obs["accessibility_tree"] logger.debug("LINEAR AT: %s", _linearized_accessibility_tree) @@ -294,18 +275,15 @@ class GPT4v_Agent: } ] }) - elif self.exp in ["som", "seeact"]: + elif self.observation_type in ["som"]: _screenshot = previous_obs["screenshot"] - _linearized_accessibility_tree = previous_obs["accessibility_tree"] - logger.debug("LINEAR AT: %s", _linearized_accessibility_tree) messages.append({ "role": "user", "content": [ { "type": "text", - "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( - _linearized_accessibility_tree) + "text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?" }, { "type": "image_url", @@ -316,7 +294,7 @@ class GPT4v_Agent: } ] }) - elif self.exp == "screenshot": + elif self.observation_type == "screenshot": _screenshot = previous_obs["screenshot"] messages.append({ @@ -335,7 +313,7 @@ class GPT4v_Agent: } ] }) - elif self.exp == "a11y_tree": + elif self.observation_type == "a11y_tree": _linearized_accessibility_tree = previous_obs["accessibility_tree"] messages.append({ @@ -349,7 +327,7 @@ class GPT4v_Agent: ] }) else: - raise ValueError("Invalid experiment type: " + self.exp) # 1}}} + raise ValueError("Invalid observation_type type: " + self.observation_type) # 1}}} messages.append({ "role": "assistant", @@ -362,11 +340,11 @@ class GPT4v_Agent: }) # {{{1 - if self.exp in ["screenshot", "both"]: + if self.observation_type in ["screenshot", "screenshot_a11y_tree"]: base64_image = encode_image(obs["screenshot"]) linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) - if self.exp == "both": + if self.observation_type == "screenshot_a11y_tree": self.observations.append({ "screenshot": base64_image, "accessibility_tree": linearized_accessibility_tree @@ -383,7 +361,7 @@ class GPT4v_Agent: { "type": "text", "text": "Given the screenshot as below. What's the next step that you will do to help with the task?" 
- if self.exp == "screenshot" + if self.observation_type == "screenshot" else "Given the screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( linearized_accessibility_tree) }, @@ -396,7 +374,7 @@ class GPT4v_Agent: } ] }) - elif self.exp == "a11y_tree": + elif self.observation_type == "a11y_tree": linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) self.observations.append({ @@ -414,15 +392,13 @@ class GPT4v_Agent: } ] }) - elif self.exp == "som": + elif self.observation_type == "som": # Add som to the screenshot masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) base64_image = encode_image(tagged_screenshot) - linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) self.observations.append({ - "screenshot": base64_image, - "accessibility_tree": linearized_accessibility_tree + "screenshot": base64_image }) messages.append({ @@ -430,35 +406,7 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( - linearized_accessibility_tree) - }, - { - "type": "image_url", - "image_url": { - "url": f"data:image/png;base64,{base64_image}", - "detail": "high" - } - } - ] - }) - elif self.exp == "seeact": - # Add som to the screenshot - masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"]) - base64_image = encode_image(tagged_screenshot) - linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"]) - - self.observations.append({ - "screenshot": base64_image, - "accessibility_tree": linearized_accessibility_tree - }) - - messages.append({ - "role": "user", - "content": [ - { - "type": "text", - "text": ACTION_DESCRIPTION_PROMPT_SEEACT.format(linearized_accessibility_tree) + "text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?" 
}, { "type": "image_url", @@ -470,141 +418,244 @@ class GPT4v_Agent: ] }) else: - raise ValueError("Invalid experiment type: " + self.exp) # 1}}} - - with open("messages.json", "w") as f: - f.write(json.dumps(messages, indent=4)) + raise ValueError("Invalid observation_type type: " + self.observation_type) # 1}}} + # with open("messages.json", "w") as f: + # f.write(json.dumps(messages, indent=4)) response = self.call_llm({ "model": self.model, "messages": messages, - "max_tokens": self.max_tokens + "max_tokens": self.max_tokens, + "top_p": self.top_p, + "temperature": self.temperature }) - logger.debug("RESPONSE: %s", response) - - if self.exp == "seeact": - messages.append({ - "role": "assistant", - "content": [ - { - "type": "text", - "text": response - } - ] - }) - - messages.append({ - "role": "user", - "content": [ - { - "type": "text", - "text": "{}\n\nWhat's the next step that you will do to help with the task?".format( - ACTION_GROUNDING_PROMPT_SEEACT) - } - ] - }) - - response = self.call_llm({ - "model": self.model, - "messages": messages, - "max_tokens": self.max_tokens - }) - print(response) + logger.info("RESPONSE: %s", response) try: actions = self.parse_actions(response, masks) self.thoughts.append(response) - except Exception as e: + except ValueError as e: print("Failed to parse action from response", e) actions = None self.thoughts.append("") - return actions + return response, actions @backoff.on_exception( backoff.expo, - (APIError, RateLimitError, APIConnectionError), - max_tries=10 + # here you should add more model exceptions as you want, + # but you are forbidden to add "Exception", that is, a common type of exception + # because we want to catch this kind of Exception in the outside to ensure each example won't exceed the time limit + (openai.RateLimitError, + openai.BadRequestError, + openai.InternalServerError, + InvalidArgument), + max_tries=5 ) def call_llm(self, payload): + if self.model.startswith("gpt"): + headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {os.environ['OPENAI_API_KEY']}" + } + logger.info("Generating content with GPT model: %s", self.model) response = requests.post( "https://api.openai.com/v1/chat/completions", - headers=self.headers, + headers=headers, json=payload ) if response.status_code != 200: if response.json()['error']['code'] == "context_length_exceeded": - print("Context length exceeded. Retrying with a smaller context.") - payload["messages"] = payload["messages"][-1:] + logger.error("Context length exceeded. 
Retrying with a smaller context.") + payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:] retry_response = requests.post( "https://api.openai.com/v1/chat/completions", - headers=self.headers, + headers=headers, json=payload ) if retry_response.status_code != 200: - print("Failed to call LLM: " + retry_response.text) + logger.error( + "Failed to call LLM even after attempt on shortening the history: " + retry_response.text) return "" - print("Failed to call LLM: " + response.text) + logger.error("Failed to call LLM: " + response.text) time.sleep(5) return "" else: return response.json()['choices'][0]['message']['content'] - elif self.model.startswith("mistral"): - print("call mistral") + elif self.model.startswith("claude"): messages = payload["messages"] max_tokens = payload["max_tokens"] + top_p = payload["top_p"] + temperature = payload["temperature"] + + claude_messages = [] + + for i, message in enumerate(messages): + claude_message = { + "role": message["role"], + "content": [] + } + assert len(message["content"]) in [1, 2], "One text, or one text with one image" + for part in message["content"]: + + if part['type'] == "image_url": + image_source = {} + image_source["type"] = "base64" + image_source["media_type"] = "image/png" + image_source["data"] = part['image_url']['url'].replace("data:image/png;base64,", "") + claude_message['content'].append({"type": "image", "source": image_source}) + + if part['type'] == "text": + claude_message['content'].append({"type": "text", "text": part['text']}) + + claude_messages.append(claude_message) + + # the claude not support system message in our endpoint, so we concatenate it at the first user message + if claude_messages[0]['role'] == "system": + claude_system_message_item = claude_messages[0]['content'][0] + claude_messages[1]['content'].insert(0, claude_system_message_item) + claude_messages.pop(0) + + # headers = { + # "x-api-key": os.environ["ANTHROPIC_API_KEY"], + # "anthropic-version": "2023-06-01", + # "content-type": "application/json" + # } + + headers = { + "Accept": "application / json", + "Authorization": "Bearer " + os.environ["ANTHROPIC_API_KEY"], + "User-Agent": "Apifox/1.0.0 (https://apifox.com)", + "Content-Type": "application/json" + } + + payload = { + "model": self.model, + "max_tokens": max_tokens, + "messages": claude_messages, + "temperature": temperature, + "top_p": top_p + } + + response = requests.post( + # "https://chat.claude.com/v1/chat/completions", + "https://api.aigcbest.top/v1/chat/completions", + headers=headers, + json=payload + ) + + if response.status_code != 200: + + logger.error("Failed to call LLM: " + response.text) + time.sleep(5) + return "" + # else: + # return response.json()['content'][0]['text'] + else: + return response.json()['choices'][0]['message']['content'] + + + elif self.model.startswith("mistral"): + print("Call mistral") + messages = payload["messages"] + max_tokens = payload["max_tokens"] + top_p = payload["top_p"] + temperature = payload["temperature"] misrtal_messages = [] for i, message in enumerate(messages): mistral_message = { "role": message["role"], - "content": [] + "content": "" } for part in message["content"]: - mistral_message['content'] = part['text'] if part['type'] == "text" else None + mistral_message['content'] = part['text'] if part['type'] == "text" else "" misrtal_messages.append(mistral_message) - # the mistral not support system message in our endpoint, so we concatenate it at the first user message - if misrtal_messages[0]['role'] == 
"system": - misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content'] - misrtal_messages.pop(0) - # openai.api_base = "http://localhost:8000/v1" - # openai.api_key = "test" # response = openai.ChatCompletion.create( # messages=misrtal_messages, # model="Mixtral-8x7B-Instruct-v0.1" # ) from openai import OpenAI - TOGETHER_API_KEY = "d011650e7537797148fb6170ec1e0be7ae75160375686fae02277136078e90d2" - client = OpenAI(api_key=TOGETHER_API_KEY, + client = OpenAI(api_key=os.environ["TOGETHER_API_KEY"], base_url='https://api.together.xyz', ) + logger.info("Generating content with Mistral model: %s", self.model) response = client.chat.completions.create( messages=misrtal_messages, - model="mistralai/Mixtral-8x7B-Instruct-v0.1", - max_tokens=1024 + model=self.model, + max_tokens=max_tokens ) try: - # return response['choices'][0]['message']['content'] return response.choices[0].message.content except Exception as e: print("Failed to call LLM: " + str(e)) return "" + elif self.model.startswith("THUDM"): + # THUDM/cogagent-chat-hf + print("Call CogAgent") + messages = payload["messages"] + max_tokens = payload["max_tokens"] + top_p = payload["top_p"] + temperature = payload["temperature"] + + cog_messages = [] + + for i, message in enumerate(messages): + cog_message = { + "role": message["role"], + "content": [] + } + + for part in message["content"]: + if part['type'] == "image_url": + cog_message['content'].append( + {"type": "image_url", "image_url": {"url": part['image_url']['url']}}) + + if part['type'] == "text": + cog_message['content'].append({"type": "text", "text": part['text']}) + + cog_messages.append(cog_message) + + # the cogagent not support system message in our endpoint, so we concatenate it at the first user message + if cog_messages[0]['role'] == "system": + cog_system_message_item = cog_messages[0]['content'][0] + cog_messages[1]['content'].insert(0, cog_system_message_item) + cog_messages.pop(0) + + payload = { + "model": self.model, + "max_tokens": max_tokens, + "messages": cog_messages + } + + base_url = "http://127.0.0.1:8000" + + response = requests.post(f"{base_url}/v1/chat/completions", json=payload, stream=False) + if response.status_code == 200: + decoded_line = response.json() + content = decoded_line.get("choices", [{}])[0].get("message", "").get("content", "") + return content + else: + print("Failed to call LLM: ", response.status_code) + return "" + + elif self.model.startswith("gemini"): def encoded_img_to_pil_img(data_str): base64_str = data_str.replace("data:image/png;base64,", "") @@ -615,6 +666,8 @@ class GPT4v_Agent: messages = payload["messages"] max_tokens = payload["max_tokens"] + top_p = payload["top_p"] + temperature = payload["temperature"] gemini_messages = [] for i, message in enumerate(messages): @@ -645,24 +698,45 @@ class GPT4v_Agent: gemini_messages[1]['parts'][0] = gemini_messages[0]['parts'][0] + "\n" + gemini_messages[1]['parts'][0] gemini_messages.pop(0) - print(gemini_messages) + # since the gemini-pro-vision donnot support multi-turn message + if self.model == "gemini-pro-vision": + message_history_str = "" + for message in gemini_messages: + message_history_str += "<|" + message['role'] + "|>\n" + message['parts'][0] + "\n" + gemini_messages = [{"role": "user", "parts": [message_history_str, gemini_messages[-1]['parts'][1]]}] + # gemini_messages[-1]['parts'][1].save("output.png", "PNG") + + # print(gemini_messages) api_key = os.environ.get("GENAI_API_KEY") assert api_key is not None, "Please set 
the GENAI_API_KEY environment variable" genai.configure(api_key=api_key) + logger.info("Generating content with Gemini model: %s", self.model) response = genai.GenerativeModel(self.model).generate_content( gemini_messages, generation_config={ - "max_output_tokens": max_tokens + "candidate_count": 1, + "max_output_tokens": max_tokens, + "top_p": top_p, + "temperature": temperature + }, + safety_settings={ + "harassment": "block_none", + "hate": "block_none", + "sex": "block_none", + "danger": "block_none" } ) try: return response.text except Exception as e: + logger.error("Meet exception when calling Gemini API, " + str(e)) return "" elif self.model.startswith("qwen"): messages = payload["messages"] max_tokens = payload["max_tokens"] + top_p = payload["top_p"] + temperature = payload["temperature"] qwen_messages = [] @@ -673,13 +747,16 @@ class GPT4v_Agent: } assert len(message["content"]) in [1, 2], "One text, or one text with one image" for part in message["content"]: - qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None + qwen_message['content'].append({"image": part['image_url']['url']}) if part[ + 'type'] == "image_url" else None qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None qwen_messages.append(qwen_message) - response = dashscope.MultiModalConversation.call(model='qwen-vl-plus', - messages=messages) + response = dashscope.MultiModalConversation.call( + model='qwen-vl-plus', + messages=messages, # todo: add the hyperparameters + ) # The response status_code is HTTPStatus.OK indicate success, # otherwise indicate request is failed, you can get error code # and message from code and message. @@ -698,7 +775,7 @@ class GPT4v_Agent: def parse_actions(self, response: str, masks=None): - if self.exp in ["screenshot", "a11y_tree", "both"]: + if self.observation_type in ["screenshot", "a11y_tree", "screenshot_a11y_tree"]: # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) @@ -710,7 +787,7 @@ class GPT4v_Agent: self.actions.append(actions) return actions - elif self.exp in ["som", "seeact"]: + elif self.observation_type in ["som"]: # parse from the response if self.action_space == "computer_13": raise ValueError("Invalid action space: " + self.action_space) @@ -722,3 +799,8 @@ class GPT4v_Agent: self.actions.append(actions) return actions + + def reset(self): + self.thoughts = [] + self.actions = [] + self.observations = [] diff --git a/mm_agents/configs/seem_focall_unicl_lang_v1.yaml b/mm_agents/configs/seem_focall_unicl_lang_v1.yaml deleted file mode 100644 index 23efe54..0000000 --- a/mm_agents/configs/seem_focall_unicl_lang_v1.yaml +++ /dev/null @@ -1,401 +0,0 @@ -# -------------------------------------------------------- -# X-Decoder -- Generalized Decoding for Pixel, Image, and Language -# Copyright (c) 2022 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Xueyan Zou (xueyan@cs.wisc.edu) -# -------------------------------------------------------- - -# Define Test/Trainer/Saving -PIPELINE: XDecoderPipeline -TRAINER: xdecoder -SAVE_DIR: '../../data/output/test' -base_path: "./" - -# Resume Logistic -RESUME: false -WEIGHT: false -RESUME_FROM: '' -EVAL_AT_START: False - -# Logging and Debug -WANDB: False -LOG_EVERY: 100 -FIND_UNUSED_PARAMETERS: false - -# Speed up training -FP16: false -PORT: '36873' - -# misc -LOADER: - JOINT: False - KEY_DATASET: 'coco' - -################## -# Task settings 
-################## -VERBOSE: true -MODEL: - NAME: seem_model_v1 - HEAD: xdecoder_head - MASK_ON: false - KEYPOINT_ON: false - LOAD_PROPOSALS: false - DIM_PROJ: 512 - TEXT: - ARCH: vlpencoder - NAME: transformer - TOKENIZER: clip - CONTEXT_LENGTH: 77 # 77 - WIDTH: 512 - HEADS: 8 - LAYERS: 12 # 6 - AUTOGRESSIVE: True - BACKBONE: - NAME: focal - PRETRAINED: '' - LOAD_PRETRAINED: false - FOCAL: - PRETRAIN_IMG_SIZE: 224 - PATCH_SIZE: 4 - EMBED_DIM: 192 - DEPTHS: [2, 2, 18, 2] - FOCAL_LEVELS: [4, 4, 4, 4] - FOCAL_WINDOWS: [3, 3, 3, 3] - DROP_PATH_RATE: 0.3 - MLP_RATIO: 4.0 - DROP_RATE: 0.0 - PATCH_NORM: True - USE_CONV_EMBED: True - SCALING_MODULATOR: True - USE_CHECKPOINT: False - USE_POSTLN: true - USE_POSTLN_IN_MODULATION: false - USE_LAYERSCALE: True - OUT_FEATURES: ["res2", "res3", "res4", "res5"] - OUT_INDICES: [0, 1, 2, 3] - ENCODER: - NAME: transformer_encoder_fpn - IGNORE_VALUE: 255 - NUM_CLASSES: 133 - LOSS_WEIGHT: 1.0 - CONVS_DIM: 512 - MASK_DIM: 512 - NORM: "GN" - IN_FEATURES: ["res2", "res3", "res4", "res5"] - DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"] - COMMON_STRIDE: 4 - TRANSFORMER_ENC_LAYERS: 6 - DECODER: - NAME: seem_v1 - TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" - MASK: - ENABLED: True - DETECTION: False - SPATIAL: - ENABLED: True - MAX_ITER: 1 - GROUNDING: - ENABLED: True - MAX_LEN: 5 - TEXT_WEIGHT: 2.0 - CLASS_WEIGHT: 0.5 - RETRIEVAL: - ENABLED: False - LVIS: - ENABLED: True - THRES: 0.7 - OPENIMAGE: - ENABLED: False - NEGATIVE_SAMPLES: 5 - GROUNDING: - ENABLED: False - MAX_LEN: 5 - CAPTION: - ENABLED: False - PHRASE_PROB: 0.5 - SIM_THRES: 0.95 - DEEP_SUPERVISION: True - NO_OBJECT_WEIGHT: 0.1 - GCLASS_WEIGHT: 0.4 - GMASK_WEIGHT: 1.0 - GDICE_WEIGHT: 1.0 - SCLASS_WEIGHT: 0.4 - SMASK_WEIGHT: 1.0 - SDICE_WEIGHT: 1.0 - OCLASS_WEIGHT: 0.4 - OMASK_WEIGHT: 1.0 - ODICE_WEIGHT: 1.0 - CLASS_WEIGHT: 2.0 - MASK_WEIGHT: 5.0 - DICE_WEIGHT: 5.0 - BBOX_WEIGHT: 5.0 - GIOU_WEIGHT: 2.0 - CAPTION_WEIGHT: 2.0 - COST_SPATIAL: - CLASS_WEIGHT: 5.0 - MASK_WEIGHT: 2.0 - DICE_WEIGHT: 2.0 - HIDDEN_DIM: 512 - NUM_OBJECT_QUERIES: 101 - NHEADS: 8 - DROPOUT: 0.0 - DIM_FEEDFORWARD: 2048 - MAX_SPATIAL_LEN: [512, 512, 512, 512] - # ENC_LAYERS: 0 - PRE_NORM: False - ENFORCE_INPUT_PROJ: False - SIZE_DIVISIBILITY: 32 - TRAIN_NUM_POINTS: 12544 - OVERSAMPLE_RATIO: 3.0 - IMPORTANCE_SAMPLE_RATIO: 0.75 - DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query - TOP_GROUNDING_LAYERS: 10 - TOP_CAPTION_LAYERS: 10 - TOP_SPATIAL_LAYERS: 10 - TOP_OPENIMAGE_LAYERS: 10 - TEST: - SEMANTIC_ON: True - INSTANCE_ON: True - PANOPTIC_ON: True - OVERLAP_THRESHOLD: 0.8 - OBJECT_MASK_THRESHOLD: 0.8 - SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false - -# Spatial sampler -STROKE_SAMPLER: - MAX_CANDIDATE: 1 - CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only - CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"] - DILATION: 3 - CIRCLE: - NUM_STROKES: 5 - STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small'] - STROKE_PROB: [0.33, 0.33, 0.33] - SCRIBBLE: - NUM_STROKES: 5 - STROKE_PRESET: ['rand_curve', 'rand_curve_small'] - STROKE_PROB: [0.5, 0.5] - POINT: - NUM_POINTS: 20 - POLYGON: - MAX_POINTS: 9 - EVAL: - MODE: 'best' # best/random/best_random - NEGATIVE: False - MAX_ITER: 20 - IOU_ITER: 1 - GROUNDING: False - -# Multi-modal Architecture, order matters -ATTENTION_ARCH: - VARIABLE: - queries: ['object', 'grounding', 'spatial'] - tokens: ['grounding', 'spatial'] - memories: ['spatial'] - SELF_ATTENTION: - queries: - object: ['queries_object'] - 
grounding: ['queries_grounding', 'tokens_grounding'] - spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial'] - tokens: - grounding: ['queries_grounding', 'tokens_grounding'] - spatial: ['tokens_spatial'] - memories: - spatial: ['memories_spatial'] - CROSS_ATTENTION: - queries: - object: True - grounding: True - spatial: True - memories: - spatial: True - tokens: - grounding: False - spatial: False - MASKING: ['tokens_spatial', 'tokens_grounding'] - DUPLICATION: - queries: - grounding: 'queries_object' - spatial: 'queries_object' - SPATIAL_MEMORIES: 32 - QUERY_NUMBER: 3 - -DATASETS: - TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",] - # TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",] - TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well - # TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] - # TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] - # TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] - # TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box] - CLASS_CONCAT: false - SIZE_DIVISIBILITY: 32 - PROPOSAL_FILES_TRAIN: [] - -INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - -TRAIN: - ASPECT_RATIO_GROUPING: true - BATCH_SIZE_TOTAL: 4 - BATCH_SIZE_PER_GPU: 4 - SHUFFLE: true - -TEST: - DETECTIONS_PER_IMAGE: 100 - NAME: coco_eval - IOU_TYPE: ['bbox', 'segm'] - USE_MULTISCALE: false - BATCH_SIZE_TOTAL: 8 - MODEL_FILE: '' - AUG: - ENABLED: False - -DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 8 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - -COCO: - INPUT: - MIN_SIZE_TRAIN: 800 - MAX_SIZE_TRAIN: 1333 - MIN_SIZE_TRAIN_SAMPLING: 'choice' - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - IMAGE_SIZE: 1024 - MIN_SCALE: 0.1 - MAX_SCALE: 2.0 - DATASET_MAPPER_NAME: "coco_interactive" - IGNORE_VALUE: 255 - COLOR_AUG_SSD: False - SIZE_DIVISIBILITY: 32 - RANDOM_FLIP: "horizontal" - MASK_FORMAT: "polygon" - FORMAT: "RGB" - CROP: - ENABLED: True - DATASET: - DATASET: 'coco' - -# Validation dataset -ADE20K: - INPUT: - MIN_SIZE_TRAIN: 640 - MIN_SIZE_TRAIN_SAMPLING: "choice" - MIN_SIZE_TEST: 640 - MAX_SIZE_TRAIN: 2560 - MAX_SIZE_TEST: 2560 - MASK_FORMAT: "polygon" - CROP: - ENABLED: True - TYPE: "absolute" - SIZE: (640, 640) - SINGLE_CATEGORY_MAX_AREA: 1.0 - COLOR_AUG_SSD: True - SIZE_DIVISIBILITY: 640 # used in dataset mapper - DATASET_MAPPER_NAME: "mask_former_panoptic" - FORMAT: "RGB" - DATASET: - DATASET: 'ade' - -SBD: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 1 - -VOC: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - 
LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -DAVIS: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -VOS: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 1 - -REF: - INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - MIN_SIZE_TEST: 512 - MAX_SIZE_TEST: 1024 - FORMAT: "RGB" - SPATIAL: False - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 4 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -# Detectron2 training config for optimizer and lr scheduler -SOLVER: - BASE_LR: 0.0001 - STEPS: [0.88889, 0.96296] - MAX_ITER: 1 - GAMMA: 0.1 - WARMUP_FACTOR: 1.0 - WARMUP_ITERS: 10 - WARMUP_METHOD: "linear" - WEIGHT_DECAY: 0.05 - OPTIMIZER: "ADAMW" - LR_SCHEDULER_NAME: "WarmupMultiStepLR" - LR_MULTIPLIER: - backbone: 0.1 - lang_encoder: 0.1 - FIX_PARAM: - backbone: True - lang_encoder: True - pixel_decoder: True - WEIGHT_DECAY_NORM: 0.0 - WEIGHT_DECAY_EMBED: 0.0 - CLIP_GRADIENTS: - ENABLED: True - CLIP_TYPE: "full_model" - CLIP_VALUE: 5.0 # 0.01 - NORM_TYPE: 2.0 - MAX_NUM_EPOCHS: 50 \ No newline at end of file diff --git a/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml b/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml deleted file mode 100644 index 93abac6..0000000 --- a/mm_agents/configs/semantic_sam_only_sa-1b_swinL.yaml +++ /dev/null @@ -1,524 +0,0 @@ -# ------------------------------------------------------------------------ -# Semantic SAM -# Copyright (c) MicroSoft, Inc. and its affiliates. -# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li. 
-# ------------------------------------------------------------------------ - -################## -# Task settings -################## -WEIGHT: '' -PORT: 53711 -VERBOSE: true - -OUTPUT_DIR: '../../data/output/test' -# misc -LOADER: - JOINT: True - KEY_DATASET: 'coco' -# model -MODEL: - NAME: interactive_mask_dino - HEAD: general_head - MASK_ON: false - KEYPOINT_ON: false - LOAD_PROPOSALS: false - DIM_PROJ: 512 - BACKBONE_DIM: 768 - BACKGROUND: False - WEIGHTS: '' - TEXT: - ARCH: noencoder # no language encoder for training only sa-1b data - NAME: transformer - TOKENIZER: clip - CONTEXT_LENGTH: 18 # 77 - WIDTH: 512 - HEADS: 8 - LAYERS: 12 # 6 - AUTOGRESSIVE: True - BACKBONE: - NAME: swin - PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth' - LOAD_PRETRAINED: true - SWIN: - PRETRAIN_IMG_SIZE: 384 - PATCH_SIZE: 4 - EMBED_DIM: 192 - DEPTHS: [ 2, 2, 18, 2 ] - NUM_HEADS: [ 6, 12, 24, 48 ] - WINDOW_SIZE: 12 - MLP_RATIO: 4.0 - QKV_BIAS: true - QK_SCALE: ~ - DROP_RATE: 0.0 - ATTN_DROP_RATE: 0.0 - DROP_PATH_RATE: 0.3 - APE: false - PATCH_NORM: true - USE_CHECKPOINT: false - OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ] - ENCODER: - NAME: encoder_deform - IGNORE_VALUE: 255 - NUM_CLASSES: 1 - LOSS_WEIGHT: 1.0 - CONVS_DIM: 256 - MASK_DIM: 256 - NORM: "GN" - IN_FEATURES: [ "res2", "res3", "res4", "res5" ] - DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ] - COMMON_STRIDE: 4 - TRANSFORMER_ENC_LAYERS: 6 - TOTAL_NUM_FEATURE_LEVELS: 4 - NUM_FEATURE_LEVELS: 3 - FEATURE_ORDER: "low2high" - DECODER: - NAME: interactive_mask_dino - TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder" - MASK: True - BOX: True - PART: True - GROUNDING: - ENABLED: False - MAX_LEN: 5 - TEXT_WEIGHT: 2.0 - CLASS_WEIGHT: 0.5 - CAPTION: - ENABLED: False - PHRASE_PROB: 0.0 - SIM_THRES: 0.95 - CAPTIONING: - ENABLED: False - STEP: 50 - RETRIEVAL: - ENABLED: False - DIM_IMG: 768 - ENSEMBLE: True - OPENIMAGE: - ENABLED: False - NEGATIVE_SAMPLES: 5 - GROUNDING: - ENABLED: False - MAX_LEN: 5 - DEEP_SUPERVISION: True - NO_OBJECT_WEIGHT: 0.1 - CLASS_WEIGHT: 4.0 - MASK_WEIGHT: 5.0 - DICE_WEIGHT: 5.0 - BOX_WEIGHT: 5.0 - GIOU_WEIGHT: 2.0 - IOU_WEIGHT: 1.0 - COST_CLASS_WEIGHT: 4.0 - COST_DICE_WEIGHT: 5.0 - COST_MASK_WEIGHT: 5.0 - COST_BOX_WEIGHT: 5.0 - COST_GIOU_WEIGHT: 2.0 - HIDDEN_DIM: 256 - NUM_OBJECT_QUERIES: 0 - NHEADS: 8 - DROPOUT: 0.0 - DIM_FEEDFORWARD: 2048 - ENC_LAYERS: 0 - PRE_NORM: False - ENFORCE_INPUT_PROJ: False - SIZE_DIVISIBILITY: 32 - DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query - TRAIN_NUM_POINTS: 12544 - OVERSAMPLE_RATIO: 3.0 - IMPORTANCE_SAMPLE_RATIO: 0.75 - TWO_STAGE: False - INITIALIZE_BOX_TYPE: 'no' - DN: seg - DN_NOISE_SCALE: 0.4 - DN_NUM: 100 - INITIAL_PRED: False - LEARN_TGT: False - TOTAL_NUM_FEATURE_LEVELS: 4 - SEMANTIC_CE_LOSS: False - PANO_BOX_LOSS: False - COCO: False - O365: False - SAM: True - PASCAL: False - RE_POINT: True - NUM_INTERACTIVE_TOKENS: 6 - MAX_NUM_INSTANCE: 60 - TEST: - SEMANTIC_ON: True - INSTANCE_ON: True - PANOPTIC_ON: True - BOX_INTERACTIVE: False - CLASSIFICATION_ON: False - OVERLAP_THRESHOLD: 0.8 - OBJECT_MASK_THRESHOLD: 0.25 - SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false - TEST_FOUCUS_ON_BOX: False - PANO_TRANSFORM_EVAL: True - PANO_TEMPERATURE: 0.06 - -TEST: - EVAL_PERIOD: 500000 - PRECISE_BN: - NUM_ITER: 1 - ENABLED: False - AUG: - ENABLED: False - -SAM: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - IMAGE_SIZE: 1024 - MIN_SCALE: 0.99 - MAX_SCALE: 1.01 - 
DATASET_MAPPER_NAME: "sam" - IGNORE_VALUE: 255 - COLOR_AUG_SSD: False - SIZE_DIVISIBILITY: 32 - RANDOM_FLIP: "horizontal" - MASK_FORMAT: "polygon" - FORMAT: "RGB" - CROP: - ENABLED: True - DATASET: - DATASET: 'sam' - TEST: - DETECTIONS_PER_IMAGE: 100 - NAME: coco_eval - IOU_TYPE: ['bbox', 'segm'] - USE_MULTISCALE: false - BATCH_SIZE_TOTAL: 8 - MODEL_FILE: '' - AUG: - ENABLED: False - TRAIN: - BATCH_SIZE_TOTAL: 1 - BATCH_SIZE_PER_GPU: 1 - SHUFFLE: true - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 4 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - -COCO: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - IMAGE_SIZE: 1024 - MIN_SCALE: 0.1 - MAX_SCALE: 2.0 - DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj" - IGNORE_VALUE: 255 - COLOR_AUG_SSD: False - SIZE_DIVISIBILITY: 32 - RANDOM_FLIP: "horizontal" - MASK_FORMAT: "polygon" - FORMAT: "RGB" - CROP: - ENABLED: True - DATASET: - DATASET: 'coco' - TEST: - DETECTIONS_PER_IMAGE: 100 - NAME: coco_eval - IOU_TYPE: ['bbox', 'segm'] - USE_MULTISCALE: false - BATCH_SIZE_TOTAL: 1 - MODEL_FILE: '' - AUG: - ENABLED: False - TRAIN: - BATCH_SIZE_TOTAL: 1 - BATCH_SIZE_PER_GPU: 1 - SHUFFLE: true - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 2 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - -VLP: - INPUT: - IMAGE_SIZE: 224 - DATASET_MAPPER_NAME: "vlpretrain" - IGNORE_VALUE: 255 - COLOR_AUG_SSD: False - SIZE_DIVISIBILITY: 32 - MASK_FORMAT: "polygon" - FORMAT: "RGB" - CROP: - ENABLED: True - TRAIN: - BATCH_SIZE_TOTAL: 2 - BATCH_SIZE_PER_GPU: 2 - TEST: - BATCH_SIZE_TOTAL: 256 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 16 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - -INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - -DATASETS: - TRAIN: ["sam_train"] - # interactive segmentation evaluation. 
- TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"] -# TEST: ["sam_minival"] - - CLASS_CONCAT: false - SIZE_DIVISIBILITY: 32 - PROPOSAL_FILES_TRAIN: [] - -DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 16 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - -# Detectron2 training config for optimizer and lr scheduler -SOLVER: - BASE_LR_END: 0.0 - MOMENTUM: 0.9 - NESTEROV: False - CHECKPOINT_PERIOD: 5000 - IMS_PER_BATCH: 1 - REFERENCE_WORLD_SIZE: 0 - BIAS_LR_FACTOR: 1.0 - WEIGHT_DECAY_BIAS: None - # original - BASE_LR: 0.0001 - STEPS: [327778, 355092] - MAX_ITER: 368750 - GAMMA: 0.1 - WARMUP_FACTOR: 1.0 - WARMUP_ITERS: 10 - WARMUP_METHOD: "linear" - WEIGHT_DECAY: 0.05 - OPTIMIZER: "ADAMW" - LR_SCHEDULER_NAME: "WarmupMultiStepLR" - LR_MULTIPLIER: - backbone: 0.1 - lang_encoder: 0.1 - WEIGHT_DECAY_NORM: 0.0 - WEIGHT_DECAY_EMBED: 0.0 - CLIP_GRADIENTS: - ENABLED: True - CLIP_TYPE: "full_model" - CLIP_VALUE: 0.01 - NORM_TYPE: 2.0 - AMP: - ENABLED: True - -# Evaluation Dataset -ADE20K: - INPUT: - MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280] - MIN_SIZE_TRAIN_SAMPLING: "choice" - MIN_SIZE_TEST: 640 - MAX_SIZE_TRAIN: 2560 - MAX_SIZE_TEST: 2560 - MASK_FORMAT: "polygon" - CROP: - ENABLED: True - TYPE: "absolute" - SIZE: [640, 640] - SINGLE_CATEGORY_MAX_AREA: 1.0 - IGNORE_VALUE: 255 - COLOR_AUG_SSD: True - SIZE_DIVISIBILITY: 640 # used in dataset mapper - DATASET_MAPPER_NAME: "mask_former_panoptic" - FORMAT: "RGB" - DATASET: - DATASET: 'ade' - TRAIN: - ASPECT_RATIO_GROUPING: true - BATCH_SIZE_TOTAL: 16 - BATCH_SIZE_PER_GPU: 2 - SHUFFLE: true - TEST: - DETECTIONS_PER_IMAGE: 100 - NAME: coco_eval - IOU_TYPE: ['bbox', 'segm'] - USE_MULTISCALE: false - BATCH_SIZE_TOTAL: 8 - MODEL_FILE: '' - AUG: - ENABLED: False - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 8 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True -#ADE20K: -# INPUT: -# MIN_SIZE_TRAIN: 640 -# MIN_SIZE_TRAIN_SAMPLING: "choice" -# MIN_SIZE_TEST: 640 -# MAX_SIZE_TRAIN: 2560 -# MAX_SIZE_TEST: 2560 -# MASK_FORMAT: "polygon" -# CROP: -# ENABLED: True -# TYPE: "absolute" -# SIZE: (640, 640) -# SINGLE_CATEGORY_MAX_AREA: 1.0 -# COLOR_AUG_SSD: True -# SIZE_DIVISIBILITY: 640 # used in dataset mapper -# DATASET_MAPPER_NAME: "mask_former_panoptic" -# FORMAT: "RGB" -# DATASET: -# DATASET: 'ade' -# TEST: -# BATCH_SIZE_TOTAL: 8 - - -REF: - INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - MIN_SIZE_TEST: 512 - MAX_SIZE_TEST: 1024 - FORMAT: "RGB" - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -SUN: - INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - MIN_SIZE_TEST: 512 - MAX_SIZE_TEST: 1024 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -SCAN: - INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: [58.395, 57.120, 57.375] - MIN_SIZE_TEST: 512 - MAX_SIZE_TEST: 1024 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -BDD: - INPUT: - PIXEL_MEAN: [123.675, 116.280, 103.530] - PIXEL_STD: 
[58.395, 57.120, 57.375] - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 0 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: False - TEST: - BATCH_SIZE_TOTAL: 8 - -CITY: - INPUT: - MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ] - MIN_SIZE_TRAIN_SAMPLING: "choice" - MIN_SIZE_TEST: 1024 - MAX_SIZE_TRAIN: 4096 - MAX_SIZE_TEST: 2048 - CROP: - ENABLED: True - TYPE: "absolute" - SIZE: [ 512, 1024 ] - SINGLE_CATEGORY_MAX_AREA: 1.0 - IGNORE_VALUE: 255 - COLOR_AUG_SSD: True - SIZE_DIVISIBILITY: -1 - FORMAT: "RGB" - DATASET_MAPPER_NAME: "mask_former_panoptic" - MASK_FORMAT: "polygon" - TEST: - EVAL_PERIOD: 5000 - BATCH_SIZE_TOTAL: 1 - AUG: - ENABLED: False - MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ] - MAX_SIZE: 4096 - FLIP: True - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: True - NUM_WORKERS: 2 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True - TRAIN: - ASPECT_RATIO_GROUPING: true - BATCH_SIZE_TOTAL: 2 - BATCH_SIZE_PER_GPU: 2 - SHUFFLE: true - -PSACAL_PART: - INPUT: - MIN_SIZE_TEST: 800 - MAX_SIZE_TEST: 1333 - IMAGE_SIZE: 1024 - MIN_SCALE: 0.1 - MAX_SCALE: 2.0 - DATASET_MAPPER_NAME: "pascal_part_lsj" - IGNORE_VALUE: 255 - COLOR_AUG_SSD: False - SIZE_DIVISIBILITY: 32 - RANDOM_FLIP: "horizontal" - MASK_FORMAT: "polygon" - FORMAT: "RGB" - CROP: - ENABLED: True - MODEL: - MASK_ON: True - KEYPOINT_ON: False - LOAD_PROPOSALS: False - # DATASET: - # DATASET: 'coco' - TEST: - DETECTIONS_PER_IMAGE: 100 - NAME: coco_eval - IOU_TYPE: ['bbox', 'segm'] - USE_MULTISCALE: false - BATCH_SIZE_TOTAL: 8 - MODEL_FILE: '' - AUG: - ENABLED: False - TRAIN: - BATCH_SIZE_TOTAL: 1 - BATCH_SIZE_PER_GPU: 1 - SHUFFLE: true - DATALOADER: - FILTER_EMPTY_ANNOTATIONS: False - NUM_WORKERS: 2 - LOAD_PROPOSALS: False - SAMPLER_TRAIN: "TrainingSampler" - ASPECT_RATIO_GROUPING: True diff --git a/mm_agents/llm_server/CogAgent/CogAgent.py b/mm_agents/llm_server/CogAgent/CogAgent.py new file mode 100644 index 0000000..1b4cd53 --- /dev/null +++ b/mm_agents/llm_server/CogAgent/CogAgent.py @@ -0,0 +1,405 @@ +import os +import gc +import time +import base64 + +from contextlib import asynccontextmanager +from typing import List, Literal, Union, Tuple, Optional +import torch +import uvicorn +from fastapi import FastAPI, HTTPException +from fastapi.middleware.cors import CORSMiddleware +from loguru import logger +from pydantic import BaseModel, Field +from sse_starlette.sse import EventSourceResponse +from transformers import AutoModelForCausalLM, LlamaTokenizer, PreTrainedModel, PreTrainedTokenizer, \ + TextIteratorStreamer +from PIL import Image +from io import BytesIO + +MODEL_PATH = os.environ.get('MODEL_PATH', 'THUDM/cogvlm-chat-hf') +TOKENIZER_PATH = os.environ.get("TOKENIZER_PATH", 'lmsys/vicuna-7b-v1.5') +DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu' +if os.environ.get('QUANT_ENABLED'): + QUANT_ENABLED = True +else: + with torch.cuda.device(DEVICE): + __, total_bytes = torch.cuda.mem_get_info() + total_gb = total_bytes / (1 << 30) + if total_gb < 40: + QUANT_ENABLED = True + else: + QUANT_ENABLED = False + +@asynccontextmanager +async def lifespan(app: FastAPI): + """ + An asynchronous context manager for managing the lifecycle of the FastAPI app. + It ensures that GPU memory is cleared after the app's lifecycle ends, which is essential for efficient resource management in GPU environments. 
+ """ + yield + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + + +app = FastAPI(lifespan=lifespan) + +app.add_middleware( + CORSMiddleware, + allow_origins=["*"], + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +class ModelCard(BaseModel): + """ + A Pydantic model representing a model card, which provides metadata about a machine learning model. + It includes fields like model ID, owner, and creation time. + """ + id: str + object: str = "model" + created: int = Field(default_factory=lambda: int(time.time())) + owned_by: str = "owner" + root: Optional[str] = None + parent: Optional[str] = None + permission: Optional[list] = None + + +class ModelList(BaseModel): + object: str = "list" + data: List[ModelCard] = [] + + +class ImageUrl(BaseModel): + url: str + + +class TextContent(BaseModel): + type: Literal["text"] + text: str + + +class ImageUrlContent(BaseModel): + type: Literal["image_url"] + image_url: ImageUrl + + +ContentItem = Union[TextContent, ImageUrlContent] + + +class ChatMessageInput(BaseModel): + role: Literal["user", "assistant", "system"] + content: Union[str, List[ContentItem]] + name: Optional[str] = None + + +class ChatMessageResponse(BaseModel): + role: Literal["assistant"] + content: str = None + name: Optional[str] = None + + +class DeltaMessage(BaseModel): + role: Optional[Literal["user", "assistant", "system"]] = None + content: Optional[str] = None + + +class ChatCompletionRequest(BaseModel): + model: str + messages: List[ChatMessageInput] + temperature: Optional[float] = 0.8 + top_p: Optional[float] = 0.8 + max_tokens: Optional[int] = None + stream: Optional[bool] = False + # Additional parameters + repetition_penalty: Optional[float] = 1.0 + + +class ChatCompletionResponseChoice(BaseModel): + index: int + message: ChatMessageResponse + + +class ChatCompletionResponseStreamChoice(BaseModel): + index: int + delta: DeltaMessage + + +class UsageInfo(BaseModel): + prompt_tokens: int = 0 + total_tokens: int = 0 + completion_tokens: Optional[int] = 0 + + +class ChatCompletionResponse(BaseModel): + model: str + object: Literal["chat.completion", "chat.completion.chunk"] + choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]] + created: Optional[int] = Field(default_factory=lambda: int(time.time())) + usage: Optional[UsageInfo] = None + + +@app.get("/v1/models", response_model=ModelList) +async def list_models(): + """ + An endpoint to list available models. It returns a list of model cards. + This is useful for clients to query and understand what models are available for use. 
+ """ + model_card = ModelCard(id="cogvlm-chat-17b") # can be replaced by your model id like cogagent-chat-18b + return ModelList(data=[model_card]) + + +@app.post("/v1/chat/completions", response_model=ChatCompletionResponse) +async def create_chat_completion(request: ChatCompletionRequest): + global model, tokenizer + + if len(request.messages) < 1 or request.messages[-1].role == "assistant": + raise HTTPException(status_code=400, detail="Invalid request") + + gen_params = dict( + messages=request.messages, + temperature=request.temperature, + top_p=request.top_p, + max_tokens=request.max_tokens or 1024, + echo=False, + stream=request.stream, + ) + + if request.stream: + generate = predict(request.model, gen_params) + return EventSourceResponse(generate, media_type="text/event-stream") + response = generate_cogvlm(model, tokenizer, gen_params) + + usage = UsageInfo() + + message = ChatMessageResponse( + role="assistant", + content=response["text"], + ) + logger.debug(f"==== message ====\n{message}") + choice_data = ChatCompletionResponseChoice( + index=0, + message=message, + ) + task_usage = UsageInfo.model_validate(response["usage"]) + for usage_key, usage_value in task_usage.model_dump().items(): + setattr(usage, usage_key, getattr(usage, usage_key) + usage_value) + return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion", usage=usage) + + +async def predict(model_id: str, params: dict): + """ + Handle streaming predictions. It continuously generates responses for a given input stream. + This is particularly useful for real-time, continuous interactions with the model. + """ + + global model, tokenizer + + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(role="assistant"), + finish_reason=None + ) + chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + previous_text = "" + for new_response in generate_stream_cogvlm(model, tokenizer, params): + decoded_unicode = new_response["text"] + delta_text = decoded_unicode[len(previous_text):] + previous_text = decoded_unicode + delta = DeltaMessage( + content=delta_text, + role="assistant", + ) + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=delta, + ) + chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + choice_data = ChatCompletionResponseStreamChoice( + index=0, + delta=DeltaMessage(), + ) + chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk") + yield "{}".format(chunk.model_dump_json(exclude_unset=True)) + + +def generate_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict): + """ + Generates a response using the CogVLM model. It processes the chat history and image data, if any, + and then invokes the model to generate a response. + """ + + for response in generate_stream_cogvlm(model, tokenizer, params): + pass + return response + + +def process_history_and_images(messages: List[ChatMessageInput]) -> Tuple[ + Optional[str], Optional[List[Tuple[str, str]]], Optional[List[Image.Image]]]: + """ + Process history messages to extract text, identify the last user query, + and convert base64 encoded image URLs to PIL images. + + Args: + messages(List[ChatMessageInput]): List of ChatMessageInput objects. 
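+            Only images supplied as base64 data URLs (image/jpeg or image/png) are decoded into PIL images; any other image_url form is ignored.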
+    return: A tuple of three elements: + - The last user query as a string. + - Text history formatted as a list of tuples for the model. + - List of PIL Image objects extracted from the messages. + """ + formatted_history = [] + image_list = [] + last_user_query = '' + + for i, message in enumerate(messages): + role = message.role + content = message.content + + if isinstance(content, list): # text + text_content = ' '.join(item.text for item in content if isinstance(item, TextContent)) + else: + text_content = content + + if isinstance(content, list): # image + for item in content: + if isinstance(item, ImageUrlContent): + image_url = item.image_url.url + if image_url.startswith("data:image/jpeg;base64,"): + base64_encoded_image = image_url.split("data:image/jpeg;base64,")[1] + image_data = base64.b64decode(base64_encoded_image) + image = Image.open(BytesIO(image_data)).convert('RGB') + image_list.append(image) + elif image_url.startswith("data:image/png;base64,"): + base64_encoded_image = image_url.split("data:image/png;base64,")[1] + image_data = base64.b64decode(base64_encoded_image) + image = Image.open(BytesIO(image_data)).convert('RGB') + image_list.append(image) + + if role == 'user': + if i == len(messages) - 1: # the last user message + last_user_query = text_content + else: + formatted_history.append((text_content, '')) + elif role == 'assistant': + if formatted_history: + if formatted_history[-1][1] != '': + assert False, f"The last query has already been answered: {formatted_history[-1][0]}, {formatted_history[-1][1]}, {text_content}" + formatted_history[-1] = (formatted_history[-1][0], text_content) + else: + assert False, "Received an assistant reply before any user message" + else: + assert False, f"unrecognized role: {role}" + + return last_user_query, formatted_history, image_list + + +@torch.inference_mode() +def generate_stream_cogvlm(model: PreTrainedModel, tokenizer: PreTrainedTokenizer, params: dict): + """ + Generates a stream of responses using the CogVLM model in inference mode. + It's optimized to handle continuous input-output interactions with the model in a streaming manner. 
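+    Generation settings are read from params: temperature, top_p and repetition_penalty default to 1.0, and max_tokens defaults to 256 when not supplied.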
+ """ + messages = params["messages"] + temperature = float(params.get("temperature", 1.0)) + repetition_penalty = float(params.get("repetition_penalty", 1.0)) + top_p = float(params.get("top_p", 1.0)) + max_new_tokens = int(params.get("max_tokens", 256)) + query, history, image_list = process_history_and_images(messages) + + logger.debug(f"==== request ====\n{query}") + + input_by_model = model.build_conversation_input_ids(tokenizer, query=query, history=history, + images=[image_list[-1]]) + inputs = { + 'input_ids': input_by_model['input_ids'].unsqueeze(0).to(DEVICE), + 'token_type_ids': input_by_model['token_type_ids'].unsqueeze(0).to(DEVICE), + 'attention_mask': input_by_model['attention_mask'].unsqueeze(0).to(DEVICE), + 'images': [[input_by_model['images'][0].to(DEVICE).to(torch_type)]], + } + if 'cross_images' in input_by_model and input_by_model['cross_images']: + inputs['cross_images'] = [[input_by_model['cross_images'][0].to(DEVICE).to(torch_type)]] + + input_echo_len = len(inputs["input_ids"][0]) + streamer = TextIteratorStreamer( + tokenizer=tokenizer, + timeout=60.0, + skip_prompt=True, + skip_special_tokens=True +) + gen_kwargs = { + "repetition_penalty": repetition_penalty, + "max_new_tokens": max_new_tokens, + "do_sample": True if temperature > 1e-5 else False, + "top_p": top_p if temperature > 1e-5 else 0, + 'streamer': streamer, + } + if temperature > 1e-5: + gen_kwargs["temperature"] = temperature + + total_len = 0 + generated_text = "" + with torch.no_grad(): + model.generate(**inputs, **gen_kwargs) + for next_text in streamer: + generated_text += next_text + yield { + "text": generated_text, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": total_len - input_echo_len, + "total_tokens": total_len, + }, + } + ret = { + "text": generated_text, + "usage": { + "prompt_tokens": input_echo_len, + "completion_tokens": total_len - input_echo_len, + "total_tokens": total_len, + }, + } + yield ret + + +gc.collect() +torch.cuda.empty_cache() + +if __name__ == "__main__": + tokenizer = LlamaTokenizer.from_pretrained( + TOKENIZER_PATH, + trust_remote_code=True) + + if torch.cuda.is_available() and torch.cuda.get_device_capability()[0] >= 8: + torch_type = torch.bfloat16 + else: + torch_type = torch.float16 + + print("========Use torch type as:{} with device:{}========\n\n".format(torch_type, DEVICE)) + + if 'cuda' in DEVICE: + if QUANT_ENABLED: + model = AutoModelForCausalLM.from_pretrained( + MODEL_PATH, + load_in_4bit=True, + trust_remote_code=True, + torch_dtype=torch_type, + low_cpu_mem_usage=True + ).eval() + else: + model = AutoModelForCausalLM.from_pretrained( + MODEL_PATH, + load_in_4bit=False, + trust_remote_code=True, + torch_dtype=torch_type, + low_cpu_mem_usage=True + ).to(DEVICE).eval() + + else: + model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, trust_remote_code=True).float().to(DEVICE).eval() + uvicorn.run(app, host='0.0.0.0', port=8000, workers=1) diff --git a/mm_agents/llm_server/CogAgent/README.md b/mm_agents/llm_server/CogAgent/README.md new file mode 100644 index 0000000..b6f61d2 --- /dev/null +++ b/mm_agents/llm_server/CogAgent/README.md @@ -0,0 +1,7 @@ +## Deploy CogAgent as server + +``` +python CogAgent.py +``` + +The CogAgent LLM will be deployed on http://127.0.0.1:8000 \ No newline at end of file diff --git a/mm_agents/ops/functions/__init__.py b/mm_agents/ops/functions/__init__.py deleted file mode 100644 index 2b06b5a..0000000 --- a/mm_agents/ops/functions/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# 
------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from .ms_deform_attn_func import MSDeformAttnFunction - diff --git a/mm_agents/ops/functions/ms_deform_attn_func.py b/mm_agents/ops/functions/ms_deform_attn_func.py deleted file mode 100644 index 94a36ab..0000000 --- a/mm_agents/ops/functions/ms_deform_attn_func.py +++ /dev/null @@ -1,72 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import torch -import torch.nn.functional as F -from torch.autograd import Function -from torch.autograd.function import once_differentiable - -try: - import MultiScaleDeformableAttention as MSDA -except ModuleNotFoundError as e: - info_string = ( - "\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n" - "\t`cd mask2former/modeling/pixel_decoder/ops`\n" - "\t`sh make.sh`\n" - ) - raise ModuleNotFoundError(info_string) - - -class MSDeformAttnFunction(Function): - @staticmethod - def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step): - ctx.im2col_step = im2col_step - output = MSDA.ms_deform_attn_forward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step) - ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights) - return output - - @staticmethod - @once_differentiable - def backward(ctx, grad_output): - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors - grad_value, grad_sampling_loc, grad_attn_weight = \ - MSDA.ms_deform_attn_backward( - value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step) - - return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None - - -def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights): - # for debug and test only, - # need to use cuda version instead - N_, S_, M_, D_ = value.shape - _, Lq_, M_, L_, P_, _ = sampling_locations.shape - value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], 
dim=1) - sampling_grids = 2 * sampling_locations - 1 - sampling_value_list = [] - for lid_, (H_, W_) in enumerate(value_spatial_shapes): - # N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_ - value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_) - # N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2 - sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1) - # N_*M_, D_, Lq_, P_ - sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_, - mode='bilinear', padding_mode='zeros', align_corners=False) - sampling_value_list.append(sampling_value_l_) - # (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_) - attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_) - output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_) - return output.transpose(1, 2).contiguous() diff --git a/mm_agents/ops/make.sh b/mm_agents/ops/make.sh deleted file mode 100755 index 7b38cdb..0000000 --- a/mm_agents/ops/make.sh +++ /dev/null @@ -1,13 +0,0 @@ -#!/usr/bin/env bash -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -python setup.py build install diff --git a/mm_agents/ops/modules/__init__.py b/mm_agents/ops/modules/__init__.py deleted file mode 100644 index 6fdbf03..0000000 --- a/mm_agents/ops/modules/__init__.py +++ /dev/null @@ -1,12 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from .ms_deform_attn import MSDeformAttn diff --git a/mm_agents/ops/modules/ms_deform_attn.py b/mm_agents/ops/modules/ms_deform_attn.py deleted file mode 100644 index e7b4c42..0000000 --- a/mm_agents/ops/modules/ms_deform_attn.py +++ /dev/null @@ -1,125 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. 
-# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import warnings -import math - -import torch -from torch import nn -import torch.nn.functional as F -from torch.nn.init import xavier_uniform_, constant_ - -from ..functions import MSDeformAttnFunction -from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch - - -def _is_power_of_2(n): - if (not isinstance(n, int)) or (n < 0): - raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n))) - return (n & (n-1) == 0) and n != 0 - - -class MSDeformAttn(nn.Module): - def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4): - """ - Multi-Scale Deformable Attention Module - :param d_model hidden dimension - :param n_levels number of feature levels - :param n_heads number of attention heads - :param n_points number of sampling points per attention head per feature level - """ - super().__init__() - if d_model % n_heads != 0: - raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads)) - _d_per_head = d_model // n_heads - # you'd better set _d_per_head to a power of 2 which is more efficient in our CUDA implementation - if not _is_power_of_2(_d_per_head): - warnings.warn("You'd better set d_model in MSDeformAttn to make the dimension of each attention head a power of 2 " - "which is more efficient in our CUDA implementation.") - - self.im2col_step = 128 - - self.d_model = d_model - self.n_levels = n_levels - self.n_heads = n_heads - self.n_points = n_points - - self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2) - self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points) - self.value_proj = nn.Linear(d_model, d_model) - self.output_proj = nn.Linear(d_model, d_model) - - self._reset_parameters() - - def _reset_parameters(self): - constant_(self.sampling_offsets.weight.data, 0.) - thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads) - grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) - grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1) - for i in range(self.n_points): - grid_init[:, :, i, :] *= i + 1 - with torch.no_grad(): - self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1)) - constant_(self.attention_weights.weight.data, 0.) - constant_(self.attention_weights.bias.data, 0.) - xavier_uniform_(self.value_proj.weight.data) - constant_(self.value_proj.bias.data, 0.) - xavier_uniform_(self.output_proj.weight.data) - constant_(self.output_proj.bias.data, 0.) 
- - def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None): - """ - :param query (N, Length_{query}, C) - :param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area - or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes - :param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C) - :param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] - :param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}] - :param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements - - :return output (N, Length_{query}, C) - """ - N, Len_q, _ = query.shape - N, Len_in, _ = input_flatten.shape - assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in - - value = self.value_proj(input_flatten) - if input_padding_mask is not None: - value = value.masked_fill(input_padding_mask[..., None], float(0)) - value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads) - sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2) - attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points) - attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points) - # N, Len_q, n_heads, n_levels, n_points, 2 - if reference_points.shape[-1] == 2: - offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1) - sampling_locations = reference_points[:, :, None, :, None, :] \ - + sampling_offsets / offset_normalizer[None, None, None, :, None, :] - elif reference_points.shape[-1] == 4: - sampling_locations = reference_points[:, :, None, :, None, :2] \ - + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5 - else: - raise ValueError( - 'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1])) - try: - output = MSDeformAttnFunction.apply( - value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step) - except: - # CPU - output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - # # For FLOPs calculation only - # output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights) - output = self.output_proj(output) - return output diff --git a/mm_agents/ops/setup.py b/mm_agents/ops/setup.py deleted file mode 100644 index 3b57ad3..0000000 --- a/mm_agents/ops/setup.py +++ /dev/null @@ -1,78 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. 
-# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -import os -import glob - -import torch - -from torch.utils.cpp_extension import CUDA_HOME -from torch.utils.cpp_extension import CppExtension -from torch.utils.cpp_extension import CUDAExtension - -from setuptools import find_packages -from setuptools import setup - -requirements = ["torch", "torchvision"] - -def get_extensions(): - this_dir = os.path.dirname(os.path.abspath(__file__)) - extensions_dir = os.path.join(this_dir, "src") - - main_file = glob.glob(os.path.join(extensions_dir, "*.cpp")) - source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp")) - source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu")) - - sources = main_file + source_cpu - extension = CppExtension - extra_compile_args = {"cxx": []} - define_macros = [] - - # Force cuda since torch ask for a device, not if cuda is in fact available. - if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None: - extension = CUDAExtension - sources += source_cuda - define_macros += [("WITH_CUDA", None)] - extra_compile_args["nvcc"] = [ - "-DCUDA_HAS_FP16=1", - "-D__CUDA_NO_HALF_OPERATORS__", - "-D__CUDA_NO_HALF_CONVERSIONS__", - "-D__CUDA_NO_HALF2_OPERATORS__", - ] - else: - if CUDA_HOME is None: - raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.') - else: - raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().') - - sources = [os.path.join(extensions_dir, s) for s in sources] - include_dirs = [extensions_dir] - ext_modules = [ - extension( - "MultiScaleDeformableAttention", - sources, - include_dirs=include_dirs, - define_macros=define_macros, - extra_compile_args=extra_compile_args, - ) - ] - return ext_modules - -setup( - name="MultiScaleDeformableAttention", - version="1.0", - author="Weijie Su", - url="https://github.com/fundamentalvision/Deformable-DETR", - description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention", - packages=find_packages(exclude=("configs", "tests",)), - ext_modules=get_extensions(), - cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension}, -) diff --git a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp deleted file mode 100644 index 48757e2..0000000 --- a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.cpp +++ /dev/null @@ -1,46 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. 
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include - -#include -#include - - -at::Tensor -ms_deform_attn_cpu_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - AT_ERROR("Not implement on cpu"); -} - -std::vector -ms_deform_attn_cpu_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step) -{ - AT_ERROR("Not implement on cpu"); -} - diff --git a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h b/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h deleted file mode 100644 index 51bb27e..0000000 --- a/mm_agents/ops/src/cpu/ms_deform_attn_cpu.h +++ /dev/null @@ -1,38 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#pragma once -#include - -at::Tensor -ms_deform_attn_cpu_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step); - -std::vector -ms_deform_attn_cpu_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step); - - diff --git a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu deleted file mode 100644 index 0c465da..0000000 --- a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.cu +++ /dev/null @@ -1,158 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. 
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include -#include "cuda/ms_deform_im2col_cuda.cuh" - -#include -#include -#include -#include - - -at::Tensor ms_deform_attn_cuda_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); - - auto output = at::zeros({batch, num_query, num_heads, channels}, value.options()); - - const int batch_n = im2col_step_; - auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - for (int n = 0; n < batch/im2col_step_; ++n) - { - auto columns = output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] { - ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), - level_start_index.data(), - sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, - columns.data()); - - })); - } - - output = output.view({batch, num_query, num_heads*channels}); - - return output; -} - - -std::vector ms_deform_attn_cuda_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step) -{ - - AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous"); - AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous"); - AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous"); - AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous"); - AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be 
contiguous"); - AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous"); - - AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor"); - AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor"); - AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor"); - AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor"); - AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor"); - AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor"); - - const int batch = value.size(0); - const int spatial_size = value.size(1); - const int num_heads = value.size(2); - const int channels = value.size(3); - - const int num_levels = spatial_shapes.size(0); - - const int num_query = sampling_loc.size(1); - const int num_point = sampling_loc.size(4); - - const int im2col_step_ = std::min(batch, im2col_step); - - AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must divide im2col_step(%d)", batch, im2col_step_); - - auto grad_value = at::zeros_like(value); - auto grad_sampling_loc = at::zeros_like(sampling_loc); - auto grad_attn_weight = at::zeros_like(attn_weight); - - const int batch_n = im2col_step_; - auto per_value_size = spatial_size * num_heads * channels; - auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2; - auto per_attn_weight_size = num_query * num_heads * num_levels * num_point; - auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels}); - - for (int n = 0; n < batch/im2col_step_; ++n) - { - auto grad_output_g = grad_output_n.select(0, n); - AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] { - ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(), - grad_output_g.data(), - value.data() + n * im2col_step_ * per_value_size, - spatial_shapes.data(), - level_start_index.data(), - sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - attn_weight.data() + n * im2col_step_ * per_attn_weight_size, - batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point, - grad_value.data() + n * im2col_step_ * per_value_size, - grad_sampling_loc.data() + n * im2col_step_ * per_sample_loc_size, - grad_attn_weight.data() + n * im2col_step_ * per_attn_weight_size); - - })); - } - - return { - grad_value, grad_sampling_loc, grad_attn_weight - }; -} \ No newline at end of file diff --git a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h b/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h deleted file mode 100644 index 4f0658e..0000000 --- a/mm_agents/ops/src/cuda/ms_deform_attn_cuda.h +++ /dev/null @@ -1,35 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. 
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#pragma once -#include - -at::Tensor ms_deform_attn_cuda_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step); - -std::vector ms_deform_attn_cuda_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step); - diff --git a/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh b/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh deleted file mode 100644 index c04e0d4..0000000 --- a/mm_agents/ops/src/cuda/ms_deform_im2col_cuda.cuh +++ /dev/null @@ -1,1332 +0,0 @@ -/*! -************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************** -* Modified from DCN (https://github.com/msracver/Deformable-ConvNets) -* Copyright (c) 2018 Microsoft -************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include -#include -#include - -#include -#include - -#include - -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); \ - i += blockDim.x * gridDim.x) - -const int CUDA_NUM_THREADS = 1024; -inline int GET_BLOCKS(const int N, const int num_threads) -{ - return (N + num_threads - 1) / num_threads; -} - - -template -__device__ scalar_t ms_deform_attn_im2col_bilinear(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - } - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - return val; -} - - -template -__device__ void 
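For readers who do not want to trace the pointer arithmetic, the zero-padded bilinear lookup performed by the deleted `ms_deform_attn_im2col_bilinear` device function amounts to the following few lines of Python; the array layout and names are illustrative only.

```python
import math
import numpy as np

def bilinear_sample(value: np.ndarray, h: float, w: float, m: int, c: int) -> float:
    """Zero-padded bilinear lookup for head m, channel c at fractional coordinates (h, w)."""
    H, W = value.shape[:2]                        # value laid out as (H, W, num_heads, channels)
    h_low, w_low = math.floor(h), math.floor(w)
    h_high, w_high = h_low + 1, w_low + 1
    lh, lw = h - h_low, w - w_low                 # fractional offsets
    hh, hw = 1.0 - lh, 1.0 - lw

    def at(y: int, x: int) -> float:
        # taps outside the feature map contribute zero, matching the kernel's bounds checks
        return float(value[y, x, m, c]) if 0 <= y < H and 0 <= x < W else 0.0

    return (hh * hw * at(h_low, w_low) + hh * lw * at(h_low, w_high)
            + lh * hw * at(h_high, w_low) + lh * lw * at(h_high, w_high))
```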
ms_deform_attn_col2im_bilinear(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c, - const scalar_t &top_grad, - const scalar_t &attn_weight, - scalar_t* &grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - const scalar_t top_grad_value = top_grad * attn_weight; - scalar_t grad_h_weight = 0, grad_w_weight = 0; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - grad_h_weight -= hw * v1; - grad_w_weight -= hh * v1; - atomicAdd(grad_value+ptr1, w1*top_grad_value); - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - grad_h_weight -= lw * v2; - grad_w_weight += hh * v2; - atomicAdd(grad_value+ptr2, w2*top_grad_value); - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - grad_h_weight += hw * v3; - grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - grad_h_weight += lw * v4; - grad_w_weight += lh * v4; - atomicAdd(grad_value+ptr4, w4*top_grad_value); - } - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - *grad_attn_weight = top_grad * val; - *grad_sampling_loc = width * grad_w_weight * top_grad_value; - *(grad_sampling_loc + 1) = height * grad_h_weight * top_grad_value; -} - - -template -__device__ void ms_deform_attn_col2im_bilinear_gm(const scalar_t* &bottom_data, - const int &height, const int &width, const int &nheads, const int &channels, - const scalar_t &h, const scalar_t &w, const int &m, const int &c, - const scalar_t &top_grad, - const scalar_t &attn_weight, - scalar_t* &grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int h_low = floor(h); - const int w_low = floor(w); - const int h_high = h_low + 1; - const int w_high = w_low + 1; - - const scalar_t lh = h - h_low; - const scalar_t lw = w - w_low; - const scalar_t hh = 1 - lh, hw = 1 - lw; - - const int w_stride = nheads * channels; - const int h_stride = width * w_stride; - const int h_low_ptr_offset = h_low * h_stride; - const int h_high_ptr_offset = h_low_ptr_offset + h_stride; - const int w_low_ptr_offset = w_low * w_stride; - const int w_high_ptr_offset = w_low_ptr_offset + w_stride; - const int base_ptr = m * channels + c; - - const scalar_t w1 = hh * hw, w2 = hh * lw, w3 = lh * hw, w4 = lh * lw; - const scalar_t top_grad_value = top_grad * attn_weight; - scalar_t 
grad_h_weight = 0, grad_w_weight = 0; - - scalar_t v1 = 0; - if (h_low >= 0 && w_low >= 0) - { - const int ptr1 = h_low_ptr_offset + w_low_ptr_offset + base_ptr; - v1 = bottom_data[ptr1]; - grad_h_weight -= hw * v1; - grad_w_weight -= hh * v1; - atomicAdd(grad_value+ptr1, w1*top_grad_value); - } - scalar_t v2 = 0; - if (h_low >= 0 && w_high <= width - 1) - { - const int ptr2 = h_low_ptr_offset + w_high_ptr_offset + base_ptr; - v2 = bottom_data[ptr2]; - grad_h_weight -= lw * v2; - grad_w_weight += hh * v2; - atomicAdd(grad_value+ptr2, w2*top_grad_value); - } - scalar_t v3 = 0; - if (h_high <= height - 1 && w_low >= 0) - { - const int ptr3 = h_high_ptr_offset + w_low_ptr_offset + base_ptr; - v3 = bottom_data[ptr3]; - grad_h_weight += hw * v3; - grad_w_weight -= lh * v3; - atomicAdd(grad_value+ptr3, w3*top_grad_value); - } - scalar_t v4 = 0; - if (h_high <= height - 1 && w_high <= width - 1) - { - const int ptr4 = h_high_ptr_offset + w_high_ptr_offset + base_ptr; - v4 = bottom_data[ptr4]; - grad_h_weight += lw * v4; - grad_w_weight += lh * v4; - atomicAdd(grad_value+ptr4, w4*top_grad_value); - } - - const scalar_t val = (w1 * v1 + w2 * v2 + w3 * v3 + w4 * v4); - atomicAdd(grad_attn_weight, top_grad * val); - atomicAdd(grad_sampling_loc, width * grad_w_weight * top_grad_value); - atomicAdd(grad_sampling_loc + 1, height * grad_h_weight * top_grad_value); -} - - -template -__global__ void ms_deformable_im2col_gpu_kernel(const int n, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *data_col) -{ - CUDA_KERNEL_LOOP(index, n) - { - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - scalar_t *data_col_ptr = data_col + index; - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - scalar_t col = 0; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const scalar_t *data_value_ptr = data_value + (data_value_ptr_init_offset + level_start_id * qid_stride); - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - col += ms_deform_attn_im2col_bilinear(data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col) * weight; - } - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - } - } - *data_col_ptr = col; - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1(const int n, - 
const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - if (tid == 0) - { - scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; - int sid=2; - for (unsigned int tid = 1; tid < blockSize; ++tid) - { - _grad_w += cache_grad_sampling_loc[sid]; - _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; - sid += 2; - } - - - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t 
*data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - __shared__ scalar_t cache_grad_sampling_loc[blockSize * 2]; - __shared__ scalar_t cache_grad_attn_weight[blockSize]; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockSize/2; s>0; s>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - } - __syncthreads(); - } - - if (tid == 0) - { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v1(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t 
*data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - if (tid == 0) - { - scalar_t _grad_w=cache_grad_sampling_loc[0], _grad_h=cache_grad_sampling_loc[1], _grad_a=cache_grad_attn_weight[0]; - int sid=2; - for (unsigned int tid = 1; tid < blockDim.x; ++tid) - { - _grad_w += cache_grad_sampling_loc[sid]; - _grad_h += cache_grad_sampling_loc[sid + 1]; - _grad_a += cache_grad_attn_weight[tid]; - sid += 2; - } - - - *grad_sampling_loc = _grad_w; - *(grad_sampling_loc + 1) = _grad_h; - *grad_attn_weight = _grad_a; - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, 
- const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - if (tid + (s << 1) < spre) - { - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 1)]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; - } - } - __syncthreads(); - } - - if (tid == 0) - { - *grad_sampling_loc = cache_grad_sampling_loc[0]; - *(grad_sampling_loc + 1) = cache_grad_sampling_loc[1]; - *grad_attn_weight = cache_grad_attn_weight[0]; - } - __syncthreads(); - - data_weight_ptr += 1; - 
data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - -template -__global__ void ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - extern __shared__ int _s[]; - scalar_t* cache_grad_sampling_loc = (scalar_t*)_s; - scalar_t* cache_grad_attn_weight = cache_grad_sampling_loc + 2 * blockDim.x; - unsigned int tid = threadIdx.x; - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - *(cache_grad_sampling_loc+(threadIdx.x << 1)) = 0; - *(cache_grad_sampling_loc+((threadIdx.x << 1) + 1)) = 0; - *(cache_grad_attn_weight+threadIdx.x)=0; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - cache_grad_sampling_loc+(threadIdx.x << 1), cache_grad_attn_weight+threadIdx.x); - } - - __syncthreads(); - - for (unsigned int s=blockDim.x/2, spre=blockDim.x; s>0; s>>=1, spre>>=1) - { - if (tid < s) { - const unsigned int xid1 = tid << 1; - const unsigned int xid2 = (tid + s) << 1; - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + s]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1]; - if (tid + (s << 1) < spre) - { - cache_grad_attn_weight[tid] += cache_grad_attn_weight[tid + (s << 1)]; - cache_grad_sampling_loc[xid1] += cache_grad_sampling_loc[xid2 + (s << 
1)]; - cache_grad_sampling_loc[xid1 + 1] += cache_grad_sampling_loc[xid2 + 1 + (s << 1)]; - } - } - __syncthreads(); - } - - if (tid == 0) - { - atomicAdd(grad_sampling_loc, cache_grad_sampling_loc[0]); - atomicAdd(grad_sampling_loc + 1, cache_grad_sampling_loc[1]); - atomicAdd(grad_attn_weight, cache_grad_attn_weight[0]); - } - __syncthreads(); - - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -__global__ void ms_deformable_col2im_gpu_kernel_gm(const int n, - const scalar_t *grad_col, - const scalar_t *data_value, - const int64_t *data_spatial_shapes, - const int64_t *data_level_start_index, - const scalar_t *data_sampling_loc, - const scalar_t *data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t *grad_value, - scalar_t *grad_sampling_loc, - scalar_t *grad_attn_weight) -{ - CUDA_KERNEL_LOOP(index, n) - { - int _temp = index; - const int c_col = _temp % channels; - _temp /= channels; - const int sampling_index = _temp; - const int m_col = _temp % num_heads; - _temp /= num_heads; - const int q_col = _temp % num_query; - _temp /= num_query; - const int b_col = _temp; - - const scalar_t top_grad = grad_col[index]; - - int data_weight_ptr = sampling_index * num_levels * num_point; - int data_loc_w_ptr = data_weight_ptr << 1; - const int grad_sampling_ptr = data_weight_ptr; - grad_sampling_loc += grad_sampling_ptr << 1; - grad_attn_weight += grad_sampling_ptr; - const int grad_weight_stride = 1; - const int grad_loc_stride = 2; - const int qid_stride = num_heads * channels; - const int data_value_ptr_init_offset = b_col * spatial_size * qid_stride; - - for (int l_col=0; l_col < num_levels; ++l_col) - { - const int level_start_id = data_level_start_index[l_col]; - const int spatial_h_ptr = l_col << 1; - const int spatial_h = data_spatial_shapes[spatial_h_ptr]; - const int spatial_w = data_spatial_shapes[spatial_h_ptr + 1]; - const int value_ptr_offset = data_value_ptr_init_offset + level_start_id * qid_stride; - const scalar_t *data_value_ptr = data_value + value_ptr_offset; - scalar_t *grad_value_ptr = grad_value + value_ptr_offset; - - for (int p_col=0; p_col < num_point; ++p_col) - { - const scalar_t loc_w = data_sampling_loc[data_loc_w_ptr]; - const scalar_t loc_h = data_sampling_loc[data_loc_w_ptr + 1]; - const scalar_t weight = data_attn_weight[data_weight_ptr]; - - const scalar_t h_im = loc_h * spatial_h - 0.5; - const scalar_t w_im = loc_w * spatial_w - 0.5; - if (h_im > -1 && w_im > -1 && h_im < spatial_h && w_im < spatial_w) - { - ms_deform_attn_col2im_bilinear_gm( - data_value_ptr, spatial_h, spatial_w, num_heads, channels, h_im, w_im, m_col, c_col, - top_grad, weight, grad_value_ptr, - grad_sampling_loc, grad_attn_weight); - } - data_weight_ptr += 1; - data_loc_w_ptr += 2; - grad_attn_weight += grad_weight_stride; - grad_sampling_loc += grad_loc_stride; - } - } - } -} - - -template -void ms_deformable_im2col_cuda(cudaStream_t stream, - const scalar_t* data_value, - const int64_t* data_spatial_shapes, - const int64_t* data_level_start_index, - const scalar_t* data_sampling_loc, - const scalar_t* data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t* data_col) -{ - const int num_kernels = batch_size * 
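The backward kernels above all follow the same pattern: each thread writes its partial gradients into shared memory, then the block collapses them with a power-of-two tree reduction before a single thread writes (or atomically adds) the result. A minimal sketch of that reduction over a plain NumPy array, assuming a power-of-two "block size":

```python
import numpy as np

def tree_reduce(cache: np.ndarray) -> float:
    """Sum an array the way the kernels' shared-memory reduction does."""
    cache = cache.copy()
    s = len(cache) // 2            # len(cache) plays the role of blockDim.x (power of two)
    while s > 0:
        cache[:s] += cache[s:2 * s]  # "threads" tid < s accumulate their partner at tid + s
        s //= 2
    return float(cache[0])

assert tree_reduce(np.arange(8.0)) == 28.0
```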
num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - const int num_threads = CUDA_NUM_THREADS; - ms_deformable_im2col_gpu_kernel - <<>>( - num_kernels, data_value, data_spatial_shapes, data_level_start_index, data_sampling_loc, data_attn_weight, - batch_size, spatial_size, num_heads, channels, num_levels, num_query, num_point, data_col); - - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in ms_deformable_im2col_cuda: %s\n", cudaGetErrorString(err)); - } - -} - -template -void ms_deformable_col2im_cuda(cudaStream_t stream, - const scalar_t* grad_col, - const scalar_t* data_value, - const int64_t * data_spatial_shapes, - const int64_t * data_level_start_index, - const scalar_t * data_sampling_loc, - const scalar_t * data_attn_weight, - const int batch_size, - const int spatial_size, - const int num_heads, - const int channels, - const int num_levels, - const int num_query, - const int num_point, - scalar_t* grad_value, - scalar_t* grad_sampling_loc, - scalar_t* grad_attn_weight) -{ - const int num_threads = (channels > CUDA_NUM_THREADS)?CUDA_NUM_THREADS:channels; - const int num_kernels = batch_size * num_query * num_heads * channels; - const int num_actual_kernels = batch_size * num_query * num_heads * channels; - if (channels > 1024) - { - if ((channels & 1023) == 0) - { - ms_deformable_col2im_gpu_kernel_shm_reduce_v2_multi_blocks - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - else - { - ms_deformable_col2im_gpu_kernel_gm - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - } - else{ - switch(channels) - { - case 1: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 2: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 4: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 8: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - 
grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 16: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 32: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 64: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 128: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 256: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 512: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - case 1024: - ms_deformable_col2im_gpu_kernel_shm_blocksize_aware_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - break; - default: - if (channels < 64) - { - ms_deformable_col2im_gpu_kernel_shm_reduce_v1 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - else - { - ms_deformable_col2im_gpu_kernel_shm_reduce_v2 - <<>>( - num_kernels, - grad_col, - data_value, - data_spatial_shapes, - data_level_start_index, - data_sampling_loc, - data_attn_weight, - batch_size, - spatial_size, - num_heads, - channels, - num_levels, - num_query, - num_point, - grad_value, - grad_sampling_loc, - grad_attn_weight); - } - } - } - cudaError_t err = cudaGetLastError(); - if (err != cudaSuccess) - { - printf("error in ms_deformable_col2im_cuda: %s\n", 
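The launcher above chooses a backward kernel from the channel count. Restated as plain Python for readability (kernel names shortened; this mirrors the deleted `switch`, it is not code from the repository):

```python
def pick_backward_kernel(channels: int) -> str:
    """Selection policy of ms_deformable_col2im_cuda, condensed."""
    if channels > 1024:
        # whole multiples of 1024 can still use the shared-memory reduction across blocks,
        # anything else falls back to plain global-memory atomics
        return "shm_reduce_v2_multi_blocks" if channels % 1024 == 0 else "gm_atomic"
    if channels in (1, 2, 4, 8, 16, 32):
        return "shm_blocksize_aware_reduce_v1"      # fully unrolled, block size == channels
    if channels in (64, 128, 256, 512, 1024):
        return "shm_blocksize_aware_reduce_v2"      # tree reduction, block size == channels
    return "shm_reduce_v1" if channels < 64 else "shm_reduce_v2"  # generic dynamic-shared-memory paths
```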
cudaGetErrorString(err)); - } - -} \ No newline at end of file diff --git a/mm_agents/ops/src/ms_deform_attn.h b/mm_agents/ops/src/ms_deform_attn.h deleted file mode 100644 index 2f80a1b..0000000 --- a/mm_agents/ops/src/ms_deform_attn.h +++ /dev/null @@ -1,67 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. -* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#pragma once - -#include "cpu/ms_deform_attn_cpu.h" - -#ifdef WITH_CUDA -#include "cuda/ms_deform_attn_cuda.h" -#endif - - -at::Tensor -ms_deform_attn_forward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const int im2col_step) -{ - if (value.type().is_cuda()) - { -#ifdef WITH_CUDA - return ms_deform_attn_cuda_forward( - value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step); -#else - AT_ERROR("Not compiled with GPU support"); -#endif - } - AT_ERROR("Not implemented on the CPU"); -} - -std::vector -ms_deform_attn_backward( - const at::Tensor &value, - const at::Tensor &spatial_shapes, - const at::Tensor &level_start_index, - const at::Tensor &sampling_loc, - const at::Tensor &attn_weight, - const at::Tensor &grad_output, - const int im2col_step) -{ - if (value.type().is_cuda()) - { -#ifdef WITH_CUDA - return ms_deform_attn_cuda_backward( - value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step); -#else - AT_ERROR("Not compiled with GPU support"); -#endif - } - AT_ERROR("Not implemented on the CPU"); -} - diff --git a/mm_agents/ops/src/vision.cpp b/mm_agents/ops/src/vision.cpp deleted file mode 100644 index 4a08821..0000000 --- a/mm_agents/ops/src/vision.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/*! -************************************************************************************************** -* Deformable DETR -* Copyright (c) 2020 SenseTime. All Rights Reserved. -* Licensed under the Apache License, Version 2.0 [see LICENSE for details] -************************************************************************************************** -* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -************************************************************************************************** -*/ - -/*! -* Copyright (c) Facebook, Inc. and its affiliates. 
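The deleted `ms_deform_attn.h` dispatches to the CUDA implementation whenever the value tensor is on the GPU, and `vision.cpp` exposes both entry points through pybind11. On the Python side such an extension is typically wrapped in a `torch.autograd.Function`; a hedged sketch follows, where the module name `MultiScaleDeformableAttention` is an assumption (it depends on what `TORCH_EXTENSION_NAME` resolved to in the removed `setup.py`).

```python
from torch.autograd import Function
from torch.autograd.function import once_differentiable

import MultiScaleDeformableAttention as MSDA  # assumed extension name; adjust to the built module


class MSDeformAttnFunction(Function):
    @staticmethod
    def forward(ctx, value, spatial_shapes, level_start_index,
                sampling_loc, attn_weight, im2col_step):
        ctx.im2col_step = im2col_step
        output = MSDA.ms_deform_attn_forward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step)
        ctx.save_for_backward(value, spatial_shapes, level_start_index, sampling_loc, attn_weight)
        return output

    @staticmethod
    @once_differentiable
    def backward(ctx, grad_output):
        value, spatial_shapes, level_start_index, sampling_loc, attn_weight = ctx.saved_tensors
        grad_value, grad_sampling_loc, grad_attn_weight = MSDA.ms_deform_attn_backward(
            value, spatial_shapes, level_start_index, sampling_loc, attn_weight,
            grad_output.contiguous(), ctx.im2col_step)
        # no gradients for the integer shape/index tensors or for im2col_step
        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
```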
-* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR -*/ - -#include "ms_deform_attn.h" - -PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { - m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward"); - m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward"); -} diff --git a/mm_agents/ops/test.py b/mm_agents/ops/test.py deleted file mode 100644 index 6e1b545..0000000 --- a/mm_agents/ops/test.py +++ /dev/null @@ -1,92 +0,0 @@ -# ------------------------------------------------------------------------------------------------ -# Deformable DETR -# Copyright (c) 2020 SenseTime. All Rights Reserved. -# Licensed under the Apache License, Version 2.0 [see LICENSE for details] -# ------------------------------------------------------------------------------------------------ -# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0 -# ------------------------------------------------------------------------------------------------ - -# Copyright (c) Facebook, Inc. and its affiliates. -# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR - -from __future__ import absolute_import -from __future__ import print_function -from __future__ import division - -import time -import torch -import torch.nn as nn -from torch.autograd import gradcheck - -from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch - - -N, M, D = 1, 2, 2 -Lq, L, P = 2, 2, 2 -shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda() -level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1])) -S = sum([(H*W).item() for H, W in shapes]) - - -torch.manual_seed(3) - - -@torch.no_grad() -def check_forward_equal_with_pytorch_double(): - value = torch.rand(N, S, M, D).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu() - output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu() - fwdok = torch.allclose(output_cuda, output_pytorch) - max_abs_err = (output_cuda - output_pytorch).abs().max() - max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max() - - print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') - - -@torch.no_grad() -def check_forward_equal_with_pytorch_float(): - value = torch.rand(N, S, M, D).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu() - output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu() - fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3) - max_abs_err = (output_cuda - output_pytorch).abs().max() - max_rel_err = ((output_cuda - output_pytorch).abs() / 
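The deleted test script checks the CUDA kernels against the PyTorch reference and runs `gradcheck` over several channel counts. A minimal CPU-only sketch in the same spirit, reusing the `ms_deform_attn_reference` helper sketched earlier in this section (shapes copied from the test; `gradcheck` near integer grid coordinates can be sensitive to the finite-difference step):

```python
import torch
from torch.autograd import gradcheck

N, M, D, Lq, L, P = 1, 2, 2, 2, 2, 2
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
S = int(sum(int(h) * int(w) for h, w in shapes))

value = (torch.rand(N, S, M, D, dtype=torch.double) * 0.01).requires_grad_(True)
loc = torch.rand(N, Lq, M, L, P, 2, dtype=torch.double).requires_grad_(True)
weight = torch.rand(N, Lq, M, L, P, dtype=torch.double) + 1e-5
weight = (weight / weight.sum(dim=(-1, -2), keepdim=True)).requires_grad_(True)

# ms_deform_attn_reference is the pure-PyTorch sketch shown earlier in this section
ok = gradcheck(lambda v, l, a: ms_deform_attn_reference(v, shapes, l, a), (value, loc, weight))
print(f"gradcheck passed: {ok}")
```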
output_pytorch.abs()).max() - - print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}') - - -def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True): - - value = torch.rand(N, S, M, channels).cuda() * 0.01 - sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda() - attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5 - attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True) - im2col_step = 2 - func = MSDeformAttnFunction.apply - - value.requires_grad = grad_value - sampling_locations.requires_grad = grad_sampling_loc - attention_weights.requires_grad = grad_attn_weight - - gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step)) - - print(f'* {gradok} check_gradient_numerical(D={channels})') - - -if __name__ == '__main__': - check_forward_equal_with_pytorch_double() - check_forward_equal_with_pytorch_float() - - for channels in [30, 32, 64, 71, 1025, 2048, 3096]: - check_gradient_numerical(channels, True, True, True) - - - diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py index 15aefeb..462aac7 100644 --- a/mm_agents/prompts.py +++ b/mm_agents/prompts.py @@ -798,10 +798,10 @@ You MUST choose and ONLY CHOOSE from the action space above, otherwise your acti You CAN predict multiple actions at one step, but you should only return one action for each step. """.strip() -SYS_PROMPT_IN_SOM_A11Y_OUT_TAG = """ +SYS_PROMPT_IN_SOM_OUT_TAG = """ You are an agent which follow my instruction and perform desktop computer tasks as instructed. You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. -For each step, you will get an observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library. +For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image. You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot. You can replace x, y in the code with the tag of the element you want to operate with. 
such as: diff --git a/mm_agents/task_adapter/sam/__init__.py b/mm_agents/task_adapter/sam/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/mm_agents/task_adapter/sam/tasks/__Init__.py b/mm_agents/task_adapter/sam/tasks/__Init__.py deleted file mode 100644 index ce45369..0000000 --- a/mm_agents/task_adapter/sam/tasks/__Init__.py +++ /dev/null @@ -1,2 +0,0 @@ -from .inference_sam_m2m_auto import * -from .inference_sam_m2m_interactive import * \ No newline at end of file diff --git a/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_auto.py b/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_auto.py deleted file mode 100644 index d51cf75..0000000 --- a/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_auto.py +++ /dev/null @@ -1,103 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -import matplotlib.pyplot as plt -import cv2 -import io -from segment_anything import SamAutomaticMaskGenerator - -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - - -def inference_sam_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - image_ori = np.asarray(image_ori) - - mask_generator = SamAutomaticMaskGenerator(model) - outputs = mask_generator.generate(image_ori) - - from task_adapter.utils.visualizer import Visualizer - visual = Visualizer(image_ori, metadata=metadata) - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - # for ann in sorted_anns: - # mask = ann['segmentation'] - # color_mask = np.random.random((1, 3)).tolist()[0] - # # color_mask = [int(c*255) for c in color_mask] - # demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # label += 1 - # im = demo.get_image() - - mask_map = np.zeros(image_ori.shape, dtype=np.uint8) - for i, ann in enumerate(sorted_anns): - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # assign the mask to the mask_map - mask_map[mask == 1] = label - label += 1 - im = demo.get_image() - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im, sorted_anns - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. 
- """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_interactive.py b/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_interactive.py deleted file mode 100644 index 5752138..0000000 --- a/mm_agents/task_adapter/sam/tasks/inference_sam_m2m_interactive.py +++ /dev/null @@ -1,221 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import torch.nn.functional as F -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -from kornia.contrib import distance_transform -import matplotlib.pyplot as plt -import cv2 -import io -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -from segment_anything import SamAutomaticMaskGenerator -from segment_anything.utils.amg import ( - MaskData, - area_from_rle, - batch_iterator, - batched_mask_to_box, - box_xyxy_to_xywh, - build_all_layer_point_grids, - calculate_stability_score, - coco_encode_rle, - generate_crop_boxes, - is_box_near_crop_edge, - mask_to_rle_pytorch, - remove_small_regions, - rle_to_mask, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, -) - -def sam_interactive_mask(mask_generator, points, in_points, in_labels, mask_input): - masks, iou_preds, _ = mask_generator.predictor.predict_torch( - in_points, - in_labels, - mask_input=mask_input, - multimask_output=True, - return_logits=True, - ) - nm,_,h,w = masks.shape - - # Serialize predictions and store in MaskData - data = MaskData( - masks=masks.flatten(0, 1), - iou_preds=iou_preds.flatten(0, 1), - points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)), - ) - del masks - - # Calculate stability score - data["stability_score"] = calculate_stability_score( - data["masks"], mask_generator.predictor.model.mask_threshold, mask_generator.stability_score_offset - ) - - masks = data["masks"].reshape(nm, -1, h, w) - scores = (data['iou_preds'] + data['stability_score']).reshape(nm, -1) - - index = torch.stack([torch.arange(nm).cuda(), 
scores.argmax(dim=1)]).tolist() - return masks[index] - -def inference_sam_m2m_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - - orig_size = images.shape[-2:] - orig_h, orig_w = orig_size - crop_box = [0,0,orig_w,orig_h] - - spatial_masks = spatial_masks[:, None].float().cuda() - spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0 - - # generate single center point - # n,_,h,w = spatial_masks.shape - # mask_dt = (distance_transform((~F.pad(spatial_masks, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1) - # max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist() - # next_mask = torch.zeros(spatial_masks.shape, device=torch.cuda.current_device()).bool() - # next_mask = next_mask.view(n,-1) - # next_mask[max_xy_idx] = True - # next_mask = next_mask.reshape((n,1,h,w)) - # points = next_mask.nonzero()[:,2:].flip(dims=[1]).cpu().numpy() - - # stack sampled points - acc_points = [] - for i in range(len(spatial_masks)): - points = spatial_masks[i:i+1].nonzero()[:,2:].flip(dims=[1]).cpu().numpy() - rand_ids = np.random.choice(points.shape[0], size=40, replace=True) - points = points[rand_ids] - acc_points.append(points) - _np = len(acc_points) - points = np.concatenate(acc_points) - - mask_generator = SamAutomaticMaskGenerator(model) - mask_generator.predictor.set_image(image_ori) - im_size = image_ori.shape[:-1] - - transformed_points = mask_generator.predictor.transform.apply_coords(points, im_size) - in_points = torch.as_tensor(transformed_points, device=mask_generator.predictor.device).reshape(_np,-1,2).transpose(0,1) - in_labels = torch.ones((in_points.shape[0], _np), dtype=torch.int, device=mask_generator.predictor.device) - - masks = sam_interactive_mask(mask_generator, points, in_points.transpose(0,1), in_labels.transpose(0,1), None) - - masks = masks > 0.0 - iou_preds = torch.ones(masks.shape[0], dtype=torch.float32) - points = torch.zeros((masks.shape[0], 2), dtype=torch.float32) - - mask_data = MaskData( - masks=masks, - iou_preds=iou_preds, - points=points, - ) - - mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32) - del masks - - mask_data["boxes"] = batched_mask_to_box(mask_data["masks"]) - mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))]) - - # Compress to RLE - mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w) - mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"]) - del mask_data["masks"] - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - - # Write mask records - outputs = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - outputs.append(ann) - - from task_adapter.utils.visualizer import Visualizer - visual 
= Visualizer(image_ori, metadata=metadata) - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - # for ann in sorted_anns: - # mask = ann['segmentation'] - # demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # label += 1 - # im = demo.get_image() - - mask_map = np.zeros(image_ori.shape, dtype=np.uint8) - for i, ann in enumerate(sorted_anns): - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # assign the mask to the mask_map - mask_map[mask == 1] = label - label += 1 - im = demo.get_image() - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im, sorted_anns - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. - """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/seem/__init__.py b/mm_agents/task_adapter/seem/__init__.py deleted file mode 100755 index e69de29..0000000 diff --git a/mm_agents/task_adapter/seem/tasks/__init__.py b/mm_agents/task_adapter/seem/tasks/__init__.py deleted file mode 100644 index bff9514..0000000 --- a/mm_agents/task_adapter/seem/tasks/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -from .interactive_seem_m2m_auto import * -from .inference_seem_pano import * -from .inference_seem_interactive import * \ No newline at end of file diff --git a/mm_agents/task_adapter/seem/tasks/automatic_mask_generator.py b/mm_agents/task_adapter/seem/tasks/automatic_mask_generator.py deleted file mode 100644 index 66e2317..0000000 --- a/mm_agents/task_adapter/seem/tasks/automatic_mask_generator.py +++ /dev/null @@ -1,382 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -import numpy as np -import torch -import torch.nn as nn -from torchvision.ops.boxes import batched_nms, box_area # type: ignore - -from typing import Any, Dict, List, Optional, Tuple - -from segment_anything.modeling import Sam -from segment_anything.utils.amg import ( - MaskData, - area_from_rle, - batch_iterator, - batched_mask_to_box, - box_xyxy_to_xywh, - build_all_layer_point_grids, - calculate_stability_score, - coco_encode_rle, - generate_crop_boxes, - is_box_near_crop_edge, - mask_to_rle_pytorch, - remove_small_regions, - rle_to_mask, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, -) - - -class SeemAutomaticMaskGenerator: - def __init__( - self, - model: Sam, - points_per_side: Optional[int] = 32, - points_per_batch: int = 64, - pred_iou_thresh: float = 0.9, - stability_score_thresh: float = 0.5, - stability_score_offset: float = 1.0, - box_nms_thresh: float = 0.7, - crop_n_layers: int = 0, - crop_nms_thresh: float = 0.7, - crop_overlap_ratio: float = 512 / 1500, - crop_n_points_downscale_factor: int = 1, - point_grids: Optional[List[np.ndarray]] = None, - min_mask_region_area: int = 0, - output_mode: str = "binary_mask", - ) -> None: - """ - Using a SAM model, generates masks for the entire image. - Generates a grid of point prompts over the image, then filters - low quality and duplicate masks. The default settings are chosen - for SAM with a ViT-H backbone. - - Arguments: - model (Sam): The SAM model to use for mask prediction. - points_per_side (int or None): The number of points to be sampled - along one side of the image. The total number of points is - points_per_side**2. If None, 'point_grids' must provide explicit - point sampling. - points_per_batch (int): Sets the number of points run simultaneously - by the model. Higher numbers may be faster but use more GPU memory. - pred_iou_thresh (float): A filtering threshold in [0,1], using the - model's predicted mask quality. - stability_score_thresh (float): A filtering threshold in [0,1], using - the stability of the mask under changes to the cutoff used to binarize - the model's mask predictions. - stability_score_offset (float): The amount to shift the cutoff when - calculated the stability score. - box_nms_thresh (float): The box IoU cutoff used by non-maximal - suppression to filter duplicate masks. - crop_n_layers (int): If >0, mask prediction will be run again on - crops of the image. Sets the number of layers to run, where each - layer has 2**i_layer number of image crops. - crop_nms_thresh (float): The box IoU cutoff used by non-maximal - suppression to filter duplicate masks between different crops. - crop_overlap_ratio (float): Sets the degree to which crops overlap. - In the first crop layer, crops will overlap by this fraction of - the image length. Later layers with more crops scale down this overlap. - crop_n_points_downscale_factor (int): The number of points-per-side - sampled in layer n is scaled down by crop_n_points_downscale_factor**n. - point_grids (list(np.ndarray) or None): A list over explicit grids - of points used for sampling, normalized to [0,1]. The nth grid in the - list is used in the nth crop layer. Exclusive with points_per_side. - min_mask_region_area (int): If >0, postprocessing will be applied - to remove disconnected regions and holes in masks with area smaller - than min_mask_region_area. Requires opencv. - output_mode (str): The form masks are returned in. Can be 'binary_mask', - 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools. 
- For large resolutions, 'binary_mask' may consume large amounts of - memory. - """ - - assert (points_per_side is None) != ( - point_grids is None - ), "Exactly one of points_per_side or point_grid must be provided." - if points_per_side is not None: - self.point_grids = build_all_layer_point_grids( - points_per_side, - crop_n_layers, - crop_n_points_downscale_factor, - ) - elif point_grids is not None: - self.point_grids = point_grids - else: - raise ValueError("Can't have both points_per_side and point_grid be None.") - - assert output_mode in [ - "binary_mask", - "uncompressed_rle", - "coco_rle", - ], f"Unknown output_mode {output_mode}." - if output_mode == "coco_rle": - from pycocotools import mask as mask_utils # type: ignore # noqa: F401 - - if min_mask_region_area > 0: - import cv2 # type: ignore # noqa: F401 - - self.predictor = model - self.points_per_batch = points_per_batch - self.pred_iou_thresh = pred_iou_thresh - self.stability_score_thresh = stability_score_thresh - self.stability_score_offset = stability_score_offset - self.box_nms_thresh = box_nms_thresh - self.crop_n_layers = crop_n_layers - self.crop_nms_thresh = crop_nms_thresh - self.crop_overlap_ratio = crop_overlap_ratio - self.crop_n_points_downscale_factor = crop_n_points_downscale_factor - self.min_mask_region_area = min_mask_region_area - self.output_mode = output_mode - - # dilate conv - self.dilation = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=7, stride=1, padding=3, bias=False) - self.dilation.weight.data.fill_(1.0) - self.dilation.cuda() - - @torch.no_grad() - def generate(self, image: np.ndarray) -> List[Dict[str, Any]]: - """ - Generates masks for the given image. - - Arguments: - image (np.ndarray): The image to generate masks for, in HWC uint8 format. - - Returns: - list(dict(str, any)): A list over records for masks. Each record is - a dict containing the following keys: - segmentation (dict(str, any) or np.ndarray): The mask. If - output_mode='binary_mask', is an array of shape HW. Otherwise, - is a dictionary containing the RLE. - bbox (list(float)): The box around the mask, in XYWH format. - area (int): The area in pixels of the mask. - predicted_iou (float): The model's own prediction of the mask's - quality. This is filtered by the pred_iou_thresh parameter. - point_coords (list(list(float))): The point coordinates input - to the model to generate this mask. - stability_score (float): A measure of the mask's quality. This - is filtered on using the stability_score_thresh parameter. - crop_box (list(float)): The crop of the image used to generate - the mask, given in XYWH format. 
- """ - - # Generate masks - mask_data = self._generate_masks(image) - - # Filter small disconnected regions and holes in masks - if self.min_mask_region_area > 0: - mask_data = self.postprocess_small_regions( - mask_data, - self.min_mask_region_area, - max(self.box_nms_thresh, self.crop_nms_thresh), - ) - # Encode masks - if self.output_mode == "coco_rle": - mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]] - elif self.output_mode == "binary_mask": - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - else: - mask_data["segmentations"] = mask_data["rles"] - - # Write mask records - curr_anns = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - curr_anns.append(ann) - - return curr_anns - - def _generate_masks(self, image: np.ndarray) -> MaskData: - orig_size = image.shape[-2:] - crop_boxes, layer_idxs = generate_crop_boxes( - orig_size, self.crop_n_layers, self.crop_overlap_ratio - ) - - # Iterate over image crops - data = MaskData() - for crop_box, layer_idx in zip(crop_boxes, layer_idxs): - crop_data = self._process_crop(image, crop_box, layer_idx, orig_size) - data.cat(crop_data) - - # Remove duplicate masks between crops - if len(crop_boxes) > 1: - # Prefer masks from smaller crops - scores = 1 / box_area(data["crop_boxes"]) - scores = scores.to(data["boxes"].device) - keep_by_nms = batched_nms( - data["boxes"].float(), - scores, - torch.zeros_like(data["boxes"][:, 0]), # categories - iou_threshold=self.crop_nms_thresh, - ) - data.filter(keep_by_nms) - - data.to_numpy() - return data - - def _process_crop( - self, - image: np.ndarray, - crop_box: List[int], - crop_layer_idx: int, - orig_size: Tuple[int, ...], - ) -> MaskData: - # Crop the image and calculate embeddings - x0, y0, x1, y1 = crop_box - cropped_im = image#[y0:y1, x0:x1, :] - cropped_im_size = cropped_im.shape[-2:] - # self.predictor.set_image(cropped_im) - - # Get points for this crop - points_scale = np.array(cropped_im_size)[None, ::-1] - points_for_image = self.point_grids[crop_layer_idx] #* points_scale - - # Generate masks for this crop in batches - data = MaskData() - self.enc_features=None - - for (points,) in batch_iterator(self.points_per_batch, points_for_image): - batch_data = self._process_batch(cropped_im, points, cropped_im_size, crop_box, orig_size) - data.cat(batch_data) - del batch_data - - # Remove duplicates within this crop. 
- keep_by_nms = batched_nms( - data["boxes"].float(), - data["iou_preds"], - torch.zeros(len(data["boxes"])), # categories - iou_threshold=self.box_nms_thresh, - ) - - data.filter(keep_by_nms) - - # Return to the original image frame - data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box) - data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))]) - - return data - - def _process_batch( - self, - images, - points: np.ndarray, - im_size: Tuple[int, ...], - crop_box: List[int], - orig_size: Tuple[int, ...], - ) -> MaskData: - orig_h, orig_w = orig_size - - data = {"image": images, "height": orig_h, "width": orig_w} - points = torch.tensor(points,dtype=torch.float).to(images.device) - - # prepare interactive mask for seem - abs_points = (points * torch.tensor(orig_size)[None,:].to(points.device)).long() - abs_masks = torch.zeros((len(points), orig_h, orig_w), dtype=torch.bool).to(device=points.device) - abs_masks[torch.arange(0, abs_points.size(0))[:,None], abs_points[:,0:1], abs_points[:,1:2]] = True - abs_masks = self.dilation(abs_masks[:,None].float())[:,0] > 0 - data['spatial_query'] = {'rand_shape': abs_masks[:,None]} - - batch_inputs = [data] - if self.enc_features is None: - masks, iou_preds, mask_features, transformer_encoder_features, multi_scale_features = self.predictor.model.evaluate_demo(batch_inputs, None, None, return_features=True) - self.enc_features = (mask_features, transformer_encoder_features, multi_scale_features) - else: - masks, iou_preds = self.predictor.model.evaluate_demo(batch_inputs, self.enc_features[0], self.enc_features[1], self.enc_features[2]) - - data = MaskData( - masks=masks, - iou_preds=iou_preds, - points=points, - ) - del masks - # Filter by predicted IoU - if self.pred_iou_thresh > 0.0: - keep_mask = data["iou_preds"] > self.pred_iou_thresh - data.filter(keep_mask) - - # Calculate stability score - data["stability_score"] = calculate_stability_score( - data["masks"], 0.0, self.stability_score_offset - ) - if self.stability_score_thresh > 0.0: - keep_mask = data["stability_score"] >= self.stability_score_thresh - data.filter(keep_mask) - - # Threshold masks and calculate boxes - data["masks"] = data["masks"] > 0.0 - data["boxes"] = batched_mask_to_box(data["masks"]) - - # Filter boxes that touch crop boundaries - keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h]) - if not torch.all(keep_mask): - data.filter(keep_mask) - - # Compress to RLE - data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w) - data["rles"] = mask_to_rle_pytorch(data["masks"]) - del data["masks"] - - return data - - @staticmethod - def postprocess_small_regions( - mask_data: MaskData, min_area: int, nms_thresh: float - ) -> MaskData: - """ - Removes small disconnected regions and holes in masks, then reruns - box NMS to remove any new duplicates. - - Edits mask_data in place. - - Requires open-cv as a dependency. 
- """ - if len(mask_data["rles"]) == 0: - return mask_data - - # Filter small disconnected regions and holes - new_masks = [] - scores = [] - for rle in mask_data["rles"]: - mask = rle_to_mask(rle) - - mask, changed = remove_small_regions(mask, min_area, mode="holes") - unchanged = not changed - mask, changed = remove_small_regions(mask, min_area, mode="islands") - unchanged = unchanged and not changed - - new_masks.append(torch.as_tensor(mask).unsqueeze(0)) - # Give score=0 to changed masks and score=1 to unchanged masks - # so NMS will prefer ones that didn't need postprocessing - scores.append(float(unchanged)) - - # Recalculate boxes and remove any new duplicates - masks = torch.cat(new_masks, dim=0) - boxes = batched_mask_to_box(masks) - keep_by_nms = batched_nms( - boxes.float(), - torch.as_tensor(scores), - torch.zeros_like(boxes[:, 0]), # categories - iou_threshold=nms_thresh, - ) - - # Only recalculate RLEs for masks that have changed - for i_mask in keep_by_nms: - if scores[i_mask] == 0.0: - mask_torch = masks[i_mask].unsqueeze(0) - mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0] - mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly - mask_data.filter(keep_by_nms) - - return mask_data \ No newline at end of file diff --git a/mm_agents/task_adapter/seem/tasks/inference_seem_interactive.py b/mm_agents/task_adapter/seem/tasks/inference_seem_interactive.py deleted file mode 100644 index a4b3ce9..0000000 --- a/mm_agents/task_adapter/seem/tasks/inference_seem_interactive.py +++ /dev/null @@ -1,169 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import torch.nn.functional as F -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -import matplotlib.pyplot as plt -import cv2 -import io -from .automatic_mask_generator import SeemAutomaticMaskGenerator -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -from segment_anything.utils.amg import ( - MaskData, - area_from_rle, - batch_iterator, - batched_mask_to_box, - box_xyxy_to_xywh, - build_all_layer_point_grids, - calculate_stability_score, - coco_encode_rle, - generate_crop_boxes, - is_box_near_crop_edge, - mask_to_rle_pytorch, - remove_small_regions, - rle_to_mask, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, -) - - -def inference_seem_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - - orig_size = images.shape[-2:] - orig_h, orig_w = orig_size - crop_box = [0,0,orig_w,orig_h] - - data = {"image": images, "height": orig_h, "width": orig_w} - - spatial_masks = spatial_masks[:, None].float().cuda() - spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0 - data['spatial_query'] = {'rand_shape': spatial_masks} - - model.model.metadata = metadata - masks, _ = 
model.model.evaluate_demo([data]) - masks = masks > 0.0 - iou_preds = torch.ones(masks.shape[0], dtype=torch.float32) - points = torch.zeros((masks.shape[0], 2), dtype=torch.float32) - - mask_data = MaskData( - masks=masks, - iou_preds=iou_preds, - points=points, - ) - - mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32) - del masks - - mask_data["boxes"] = batched_mask_to_box(mask_data["masks"]) - mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))]) - - # Compress to RLE - mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w) - mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"]) - del mask_data["masks"] - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - - # Write mask records - outputs = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - outputs.append(ann) - - from task_adapter.utils.visualizer import Visualizer - visual = Visualizer(image_ori, metadata=metadata) - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - # for ann in sorted_anns: - # mask = ann['segmentation'] - # color_mask = np.random.random((1, 3)).tolist()[0] - # # color_mask = [int(c*255) for c in color_mask] - # demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # label += 1 - # im = demo.get_image() - - mask_map = np.zeros(image_ori.shape, dtype=np.uint8) - for i, ann in enumerate(sorted_anns): - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # assign the mask to the mask_map - mask_map[mask == 1] = label - label += 1 - im = demo.get_image() - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im, sorted_anns - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. 
- """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/seem/tasks/inference_seem_pano.py b/mm_agents/task_adapter/seem/tasks/inference_seem_pano.py deleted file mode 100644 index d75af48..0000000 --- a/mm_agents/task_adapter/seem/tasks/inference_seem_pano.py +++ /dev/null @@ -1,164 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -import matplotlib.pyplot as plt -import cv2 -import io -from .automatic_mask_generator import SeemAutomaticMaskGenerator -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -from segment_anything.utils.amg import ( - MaskData, - area_from_rle, - batch_iterator, - batched_mask_to_box, - box_xyxy_to_xywh, - build_all_layer_point_grids, - calculate_stability_score, - coco_encode_rle, - generate_crop_boxes, - is_box_near_crop_edge, - mask_to_rle_pytorch, - remove_small_regions, - rle_to_mask, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, -) - - -def inference_seem_pano(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - - orig_size = images.shape[-2:] - orig_h, orig_w = orig_size - crop_box = [0,0,orig_w,orig_h] - - data = {"image": images, "height": orig_h, "width": orig_w} - batch_inputs = [data] - - model.model.metadata = metadata - outputs = model.model.evaluate(batch_inputs) - - pano_mask = outputs[0]['panoptic_seg'][0] - pano_info = outputs[0]['panoptic_seg'][1] - - masks = [] - for seg_info in pano_info: - masks += [pano_mask == seg_info['id']] - masks = torch.stack(masks, dim=0) - iou_preds = torch.ones(masks.shape[0], dtype=torch.float32) - points = torch.zeros((masks.shape[0], 2), dtype=torch.float32) - 
- mask_data = MaskData( - masks=masks, - iou_preds=iou_preds, - points=points, - ) - mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32) - del masks - - mask_data["boxes"] = batched_mask_to_box(mask_data["masks"]) - mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))]) - - # Compress to RLE - mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w) - mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"]) - del mask_data["masks"] - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - - # Write mask records - outputs = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - outputs.append(ann) - - from task_adapter.utils.visualizer import Visualizer - visual = Visualizer(image_ori, metadata=metadata) - # create a full zero image as the image_orig - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - mask_map = np.zeros(image_ori.shape, dtype=np.uint8) - for i, ann in enumerate(sorted_anns): - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # assign the mask to the mask_map - mask_map[mask == 1] = label - label += 1 - im = demo.get_image() - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im, sorted_anns - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. 
- """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/seem/tasks/interactive_seem_m2m_auto.py b/mm_agents/task_adapter/seem/tasks/interactive_seem_m2m_auto.py deleted file mode 100644 index f35a6a4..0000000 --- a/mm_agents/task_adapter/seem/tasks/interactive_seem_m2m_auto.py +++ /dev/null @@ -1,93 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -import matplotlib.pyplot as plt -import cv2 -import io -from .automatic_mask_generator import SeemAutomaticMaskGenerator -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -def interactive_seem_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - - mask_generator = SeemAutomaticMaskGenerator(model) - outputs = mask_generator.generate(images) - - from task_adapter.utils.visualizer import Visualizer - visual = Visualizer(image_ori, metadata=metadata) - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - for ann in sorted_anns: - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - label += 1 - im = demo.get_image() - - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small 
disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. - """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/__init__.py b/mm_agents/task_adapter/semantic_sam/tasks/__init__.py deleted file mode 100644 index 08e1951..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -from .interactive_idino_m2m import interactive_infer_image as interactive_infer_image_idino_m2m -from .interactive_idino_m2m import interactive_infer_image_semantic, interactive_infer_image_3l -from .inference_semsam_m2m_auto import inference_semsam_m2m_auto -from .interactive_idino_1o1_box import interactive_infer_image_box as interactive_infer_image_idino_m2m_box -from .automatic_mask_generator import prompt_switch -from .interactive_predictor import SemanticSAMPredictor \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/automatic_mask_generator.py b/mm_agents/task_adapter/semantic_sam/tasks/automatic_mask_generator.py deleted file mode 100644 index fe28899..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/automatic_mask_generator.py +++ /dev/null @@ -1,393 +0,0 @@ -# Copyright (c) Meta Platforms, Inc. and affiliates. -# All rights reserved. - -# This source code is licensed under the license found in the -# LICENSE file in the root directory of this source tree. 
- -import numpy as np -import torch -from torchvision.ops.boxes import batched_nms, box_area # type: ignore - -from typing import Any, Dict, List, Optional, Tuple -# from -# from .modeling import Sam -# from .predictor import SamPredictor -from semantic_sam.utils.sam_utils.amg import ( - MaskData, - area_from_rle, - batch_iterator, - batched_mask_to_box, - box_xyxy_to_xywh, - build_all_layer_point_grids, - calculate_stability_score, - coco_encode_rle, - generate_crop_boxes, - is_box_near_crop_edge, - mask_to_rle_pytorch, - remove_small_regions, - rle_to_mask, - uncrop_boxes_xyxy, - uncrop_masks, - uncrop_points, -) - - -def prompt_switch(p): - p = int(p) - if p == 1: - return 3 - if p == 2: - return 2 - if p == 3: - return 0 - if p == 4: - return 4 - if p == 5: - return 1 - if p == 6: - return 5 - else: - raise NotImplementedError - - -class SemanticSamAutomaticMaskGenerator: - def __init__( - self, - model, - points_per_side: Optional[int] = 32, - points_per_batch: int = 200, - pred_iou_thresh: float = 0.88, - stability_score_thresh: float = 0.92, - stability_score_offset: float = 1.0, - box_nms_thresh: float = 0.7, - crop_n_layers: int = 0, - crop_nms_thresh: float = 0.7, - crop_overlap_ratio: float = 512 / 1500, - crop_n_points_downscale_factor: int = 1, - point_grids: Optional[List[np.ndarray]] = None, - min_mask_region_area: int = 10, - output_mode: str = "binary_mask", - level: list = [1, 2, 3, 4, 5, 6], - ) -> None: - """ - Using a SAM model, generates masks for the entire image. - Generates a grid of point prompts over the image, then filters - low quality and duplicate masks. The default settings are chosen - for SAM with a ViT-H backbone. - - Arguments: - model (Sam): The SAM model to use for mask prediction. - points_per_side (int or None): The number of points to be sampled - along one side of the image. The total number of points is - points_per_side**2. If None, 'point_grids' must provide explicit - point sampling. - points_per_batch (int): Sets the number of points run simultaneously - by the model. Higher numbers may be faster but use more GPU memory. - pred_iou_thresh (float): A filtering threshold in [0,1], using the - model's predicted mask quality. - stability_score_thresh (float): A filtering threshold in [0,1], using - the stability of the mask under changes to the cutoff used to binarize - the model's mask predictions. - stability_score_offset (float): The amount to shift the cutoff when - calculated the stability score. - box_nms_thresh (float): The box IoU cutoff used by non-maximal - suppression to filter duplicate masks. - crops_n_layers (int): If >0, mask prediction will be run again on - crops of the image. Sets the number of layers to run, where each - layer has 2**i_layer number of image crops. - crops_nms_thresh (float): The box IoU cutoff used by non-maximal - suppression to filter duplicate masks between different crops. - crop_overlap_ratio (float): Sets the degree to which crops overlap. - In the first crop layer, crops will overlap by this fraction of - the image length. Later layers with more crops scale down this overlap. - crop_n_points_downscale_factor (int): The number of points-per-side - sampled in layer n is scaled down by crop_n_points_downscale_factor**n. - point_grids (list(np.ndarray) or None): A list over explicit grids - of points used for sampling, normalized to [0,1]. The nth grid in the - list is used in the nth crop layer. Exclusive with points_per_side. 
- min_mask_region_area (int): If >0, postprocessing will be applied - to remove disconnected regions and holes in masks with area smaller - than min_mask_region_area. Requires opencv. - output_mode (str): The form masks are returned in. Can be 'binary_mask', - 'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools. - For large resolutions, 'binary_mask' may consume large amounts of - memory. - """ - self.level = [prompt_switch(l) for l in level] - assert (points_per_side is None) != ( - point_grids is None - ), "Exactly one of points_per_side or point_grid must be provided." - if points_per_side is not None: - self.point_grids = build_all_layer_point_grids( - points_per_side, - crop_n_layers, - crop_n_points_downscale_factor, - ) - elif point_grids is not None: - self.point_grids = point_grids - else: - raise ValueError("Can't have both points_per_side and point_grid be None.") - - assert output_mode in [ - "binary_mask", - "uncompressed_rle", - "coco_rle", - ], f"Unknown output_mode {output_mode}." - if output_mode == "coco_rle": - from pycocotools import mask as mask_utils # type: ignore # noqa: F401 - - if min_mask_region_area > 0: - import cv2 # type: ignore # noqa: F401 - - self.predictor = model - self.points_per_batch = points_per_batch - self.pred_iou_thresh = pred_iou_thresh - self.stability_score_thresh = stability_score_thresh - self.stability_score_offset = stability_score_offset - self.box_nms_thresh = box_nms_thresh - self.crop_n_layers = crop_n_layers - self.crop_nms_thresh = crop_nms_thresh - self.crop_overlap_ratio = crop_overlap_ratio - self.crop_n_points_downscale_factor = crop_n_points_downscale_factor - self.min_mask_region_area = min_mask_region_area - self.output_mode = output_mode - - @torch.no_grad() - def generate(self, image: np.ndarray) -> List[Dict[str, Any]]: - """ - Generates masks for the given image. - - Arguments: - image (np.ndarray): The image to generate masks for, in HWC uint8 format. - - Returns: - list(dict(str, any)): A list over records for masks. Each record is - a dict containing the following keys: - segmentation (dict(str, any) or np.ndarray): The mask. If - output_mode='binary_mask', is an array of shape HW. Otherwise, - is a dictionary containing the RLE. - bbox (list(float)): The box around the mask, in XYWH format. - area (int): The area in pixels of the mask. - predicted_iou (float): The model's own prediction of the mask's - quality. This is filtered by the pred_iou_thresh parameter. - point_coords (list(list(float))): The point coordinates input - to the model to generate this mask. - stability_score (float): A measure of the mask's quality. This - is filtered on using the stability_score_thresh parameter. - crop_box (list(float)): The crop of the image used to generate - the mask, given in XYWH format. 
- """ - - # Generate masks - mask_data = self._generate_masks(image) - - # Filter small disconnected regions and holes in masks - if self.min_mask_region_area > 0: - mask_data = self.postprocess_small_regions( - mask_data, - self.min_mask_region_area, - max(self.box_nms_thresh, self.crop_nms_thresh), - ) - # Encode masks - if self.output_mode == "coco_rle": - mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]] - elif self.output_mode == "binary_mask": - mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]] - else: - mask_data["segmentations"] = mask_data["rles"] - - # Write mask records - curr_anns = [] - for idx in range(len(mask_data["segmentations"])): - ann = { - "segmentation": mask_data["segmentations"][idx], - "area": area_from_rle(mask_data["rles"][idx]), - "bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(), - "predicted_iou": mask_data["iou_preds"][idx].item(), - "point_coords": [mask_data["points"][idx].tolist()], - "stability_score": mask_data["stability_score"][idx].item(), - "crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(), - } - curr_anns.append(ann) - - return curr_anns - - def _generate_masks(self, image: np.ndarray) -> MaskData: - orig_size = image.shape[-2:] - crop_boxes, layer_idxs = generate_crop_boxes( - orig_size, self.crop_n_layers, self.crop_overlap_ratio - ) - - # Iterate over image crops - assert len(crop_boxes)==1 - data = MaskData() - # import ipdb; ipdb.set_trace() - for crop_box, layer_idx in zip(crop_boxes, layer_idxs): - crop_data = self._process_crop(image, crop_box, layer_idx, orig_size) - - data.cat(crop_data) - # import ipdb; ipdb.set_trace() - # Remove duplicate masks between crops - if len(crop_boxes) > 1: - # Prefer masks from smaller crops - scores = 1 / box_area(data["crop_boxes"]) - scores = scores.to(data["boxes"].device) - keep_by_nms = batched_nms( - data["boxes"].float(), - scores, - torch.zeros(len(data["boxes"])), # categories - iou_threshold=self.crop_nms_thresh, - ) - data.filter(keep_by_nms) - - data.to_numpy() - return data - - def _process_crop( - self, - image: np.ndarray, - crop_box: List[int], - crop_layer_idx: int, - orig_size: Tuple[int, ...], - ) -> MaskData: - # Crop the image and calculate embeddings - x0, y0, x1, y1 = crop_box - cropped_im = image#[y0:y1, x0:x1, :] - cropped_im_size = cropped_im.shape[-2:] - # self.predictor.set_image(cropped_im) - - # Get points for this crop - points_scale = np.array(cropped_im_size)[None, ::-1] - points_for_image = self.point_grids[crop_layer_idx] #* points_scale - - # Generate masks for this crop in batches - data = MaskData() - self.enc_features=None - # import ipdb; ipdb.set_trace() - for (points,) in batch_iterator(self.points_per_batch, points_for_image): - batch_data = self._process_batch(cropped_im,points, cropped_im_size, crop_box, orig_size) - data.cat(batch_data) - del batch_data - - keep_by_nms = batched_nms( - data["boxes"].float(), - data["iou_preds"], - torch.zeros(len(data["boxes"])), # categories - iou_threshold=self.box_nms_thresh, - ) - # import ipdb; ipdb.set_trace() - data.filter(keep_by_nms) - # import ipdb; ipdb.set_trace() - # Return to the original image frame - data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box) - data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))]) - - return data - - def _process_batch( - self, - images, - points: np.ndarray, - im_size: Tuple[int, ...], - crop_box: List[int], - orig_size: Tuple[int, ...], - ) -> MaskData: - orig_h, orig_w 
= orig_size - - data = {"image": images, "height": orig_h, "width": orig_w} - points=torch.tensor(points,dtype=torch.float).to(images.device) - points = torch.cat([points, points.new_tensor([[0.005, 0.005]]).repeat(len(points), 1)], dim=-1) - data['targets'] = [dict()] - data['targets'][0]['points']=points - data['targets'][0]['pb']=points.new_tensor([0.]*len(points)) - batch_inputs = [data] - if self.enc_features is None: - masks, iou_preds,mask_features,multi_scale_features= self.predictor.model.evaluate_demo(batch_inputs,None,None,return_features=True, level=self.level) - self.enc_features=(mask_features,multi_scale_features) - else: - masks, iou_preds= self.predictor.model.evaluate_demo(batch_inputs,None,None,self.enc_features[0],self.enc_features[1], level=self.level) - - data = MaskData( - masks=masks, - iou_preds=iou_preds.flatten(), - points=torch.as_tensor(points[:,None].repeat(1,len(self.level), 1).view(-1,4)), - ) - del masks - # Filter by predicted IoU - keep_mask = data["iou_preds"] > self.pred_iou_thresh - data.filter(keep_mask) - - # Calculate stability score - data["stability_score"] = calculate_stability_score( - data["masks"], 0.0, self.stability_score_offset - ) - # if self.stability_score_thresh > 0.0: - keep_mask = data["stability_score"] >= self.stability_score_thresh - data.filter(keep_mask) - - # Threshold masks and calculate boxes - data["masks"] = data["masks"] > 0.0 - data["boxes"] = batched_mask_to_box(data["masks"]) - - # Filter boxes that touch crop boundaries - keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h]) - if not torch.all(keep_mask): - data.filter(keep_mask) - - # Compress to RLE - data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w) - data["rles"] = mask_to_rle_pytorch(data["masks"]) - del data["masks"] - - return data - - @staticmethod - def postprocess_small_regions( - mask_data: MaskData, min_area: int, nms_thresh: float - ) -> MaskData: - """ - Removes small disconnected regions and holes in masks, then reruns - box NMS to remove any new duplicates. - - Edits mask_data in place. - - Requires open-cv as a dependency. 
- """ - if len(mask_data["rles"]) == 0: - return mask_data - - # Filter small disconnected regions and holes - new_masks = [] - scores = [] - for rle in mask_data["rles"]: - mask = rle_to_mask(rle) - - mask, changed = remove_small_regions(mask, min_area, mode="holes") - unchanged = not changed - mask, changed = remove_small_regions(mask, min_area, mode="islands") - unchanged = unchanged and not changed - - new_masks.append(torch.as_tensor(mask).unsqueeze(0)) - # Give score=0 to changed masks and score=1 to unchanged masks - # so NMS will prefer ones that didn't need postprocessing - scores.append(float(unchanged)) - - # Recalculate boxes and remove any new duplicates - masks = torch.cat(new_masks, dim=0) - boxes = batched_mask_to_box(masks) - keep_by_nms = batched_nms( - boxes.float(), - torch.as_tensor(scores), - torch.zeros(len(boxes)), # categories - iou_threshold=nms_thresh, - ) - - # Only recalculate RLEs for masks that have changed - for i_mask in keep_by_nms: - if scores[i_mask] == 0.0: - mask_torch = masks[i_mask].unsqueeze(0) - mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0] - mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly - mask_data.filter(keep_by_nms) - - return mask_data \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py b/mm_agents/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py deleted file mode 100644 index a939a3c..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/inference_semsam_m2m_auto.py +++ /dev/null @@ -1,108 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -import matplotlib.pyplot as plt -import cv2 -import io -from .automatic_mask_generator import SemanticSamAutomaticMaskGenerator -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -def inference_semsam_m2m_auto(model, image, level, all_classes, all_parts, thresh, text_size, hole_scale, island_scale, semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image) - - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - - mask_generator = SemanticSamAutomaticMaskGenerator(model,points_per_side=32, - pred_iou_thresh=0.88, - stability_score_thresh=0.92, - min_mask_region_area=10, - level=level, - ) - outputs = mask_generator.generate(images) - - from task_adapter.utils.visualizer import Visualizer - visual = Visualizer(image_ori, metadata=metadata) - sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True) - label = 1 - # for ann in sorted_anns: - # mask = ann['segmentation'] - # color_mask = np.random.random((1, 3)).tolist()[0] - # # color_mask = [int(c*255) for c in color_mask] - # demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # label += 1 - # im = 
demo.get_image() - - mask_map = np.zeros(image_ori.shape, dtype=np.uint8) - for i, ann in enumerate(sorted_anns): - mask = ann['segmentation'] - color_mask = np.random.random((1, 3)).tolist()[0] - # color_mask = [int(c*255) for c in color_mask] - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - # assign the mask to the mask_map - mask_map[mask == 1] = label - label += 1 - im = demo.get_image() - # fig=plt.figure(figsize=(10, 10)) - # plt.imshow(image_ori) - # show_anns(outputs) - # fig.canvas.draw() - # im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb()) - return im, sorted_anns - - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. - """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True - -def show_anns(anns): - if len(anns) == 0: - return - sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True) - ax = plt.gca() - ax.set_autoscale_on(False) - polygons = [] - color = [] - for ann in sorted_anns: - m = ann['segmentation'] - img = np.ones((m.shape[0], m.shape[1], 3)) - color_mask = np.random.random((1, 3)).tolist()[0] - for i in range(3): - img[:,:,i] = color_mask[i] - ax.imshow(np.dstack((img, m*0.35))) \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py b/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py deleted file mode 100644 index ccfe774..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_1o1_box.py +++ /dev/null @@ -1,144 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -from detectron2.structures import BitMasks -from semantic_sam.utils import box_ops - -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -def interactive_infer_image_box(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image['image']) - mask_ori = transform1(image['mask']) - width = 
image_ori.size[0] - height = image_ori.size[1] - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':') - - - data = {"image": images, "height": height, "width": width} - - mask_ori = np.asarray(mask_ori)[:,:,0:1].copy() - mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0] - flaten_mask = mask_ori.unsqueeze(0) - # import ipdb; ipdb.set_trace() - points=mask_ori.nonzero().float().to(images.device) - if len(points)==0: - point_=point=points.new_tensor([[0.5,0.5,0.5,0.5]]) - else: - mean_point=points.mean(0)[None] - box_xyxy = BitMasks(flaten_mask > 0).get_bounding_boxes().tensor - h = mask_ori.shape[0] - w = mask_ori.shape[1] - box_xywh = (box_ops.box_xyxy_to_cxcywh(box_xyxy) / torch.as_tensor([w, h, w, h])).cuda() - - # point_=points.mean(0)[None] - # point=point_.clone() - # point[0, 0] = point_[0, 0] / mask_ori.shape[0] - # point[0, 1] = point_[0, 1] / mask_ori.shape[1] - # point = point[:, [1, 0]] - point=box_xywh - data['targets'] = [dict()] - data['targets'][0]['points']=point - data['targets'][0]['pb']=point.new_tensor([1.]) - - - batch_inputs = [data] - masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, task='demo_box') - - pred_masks_poses = masks - reses=[] - ious=ious[0,0] - ids=torch.argsort(ious,descending=True) - - text_res='' - try: - thresh=float(thresh) - except Exception: - thresh=0.0 - mask_ls=[] - ious_res=[] - areas=[] - for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])): - iou=round(float(iou),2) - texts=f'{iou}' - mask=(pred_masks_pos>0.0).cpu().numpy() - area=mask.sum() - conti=False - if iou0.95: - conti=True - break - if i == len(pred_masks_poses[ids])-1 and mask_ls==[]: - conti=False - if conti: - continue - ious_res.append(iou) - mask_ls.append(mask) - areas.append(area) - mask,_=remove_small_regions(mask,int(hole_scale),mode="holes") - mask,_=remove_small_regions(mask,int(island_scale),mode="islands") - mask=(mask).astype(np.float) - out_txt = texts - visual = Visualizer(image_ori, metadata=metadata) - color=[0.,0.,1.0] - demo = visual.draw_binary_mask(mask, color=color, text=texts) - demo = visual.draw_box(box_xyxy[0]) - res = demo.get_image() - # point_x0=max(0,int(point_[0, 1])-3) - # point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3) - # point_y0 = max(0, int(point_[0, 0]) - 3) - # point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3) - # res[point_y0:point_y1,point_x0:point_x1,0]=255 - # res[point_y0:point_y1,point_x0:point_x1,1]=0 - # res[point_y0:point_y1,point_x0:point_x1,2]=0 - reses.append(Image.fromarray(res)) - text_res=text_res+';'+out_txt - ids=list(torch.argsort(torch.tensor(areas),descending=False)) - ids = [int(i) for i in ids] - - torch.cuda.empty_cache() - - return reses,[reses[i] for i in ids] - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. 
- """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py b/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py deleted file mode 100644 index 93775c3..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/interactive_idino_m2m.py +++ /dev/null @@ -1,322 +0,0 @@ -# -------------------------------------------------------- -# Semantic-SAM: Segment and Recognize Anything at Any Granularity -# Copyright (c) 2023 Microsoft -# Licensed under The MIT License [see LICENSE for details] -# Written by Hao Zhang (hzhangcx@connect.ust.hk) -# -------------------------------------------------------- - -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - -def interactive_infer_image(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image['image']) - mask_ori = transform1(image['mask']) - width = image_ori.size[0] - height = image_ori.size[1] - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':') - - - data = {"image": images, "height": height, "width": width} - - mask_ori = np.asarray(mask_ori)[:,:,0:1].copy() - mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0] - points=mask_ori.nonzero().float().to(images.device) - if len(points)==0: - point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]]) - else: - point_=points.mean(0)[None] - point=point_.clone() - point[0, 0] = point_[0, 0] / mask_ori.shape[0] - point[0, 1] = point_[0, 1] / mask_ori.shape[1] - point = point[:, [1, 0]] - point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1) - data['targets'] = [dict()] - data['targets'][0]['points']=point - data['targets'][0]['pb']=point.new_tensor([0.]) - - - batch_inputs = [data] - masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts) - - pred_masks_poses = masks - reses=[] - ious=ious[0,0] - ids=torch.argsort(ious,descending=True) - - text_res='' - try: - thresh=float(thresh) - except Exception: - thresh=0.0 - mask_ls=[] - ious_res=[] - areas=[] - for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])): - iou=round(float(iou),2) - texts=f'{iou}' - mask=(pred_masks_pos>0.0).cpu().numpy() - 
area=mask.sum() - conti=False - if iou0.95: - conti=True - break - if i == len(pred_masks_poses[ids])-1 and mask_ls==[]: - conti=False - if conti: - continue - ious_res.append(iou) - mask_ls.append(mask) - areas.append(area) - mask,_=remove_small_regions(mask,int(hole_scale),mode="holes") - mask,_=remove_small_regions(mask,int(island_scale),mode="islands") - mask=(mask).astype(np.float) - out_txt = texts - visual = Visualizer(image_ori, metadata=metadata) - color=[0.,0.,1.0] - # demo = visual.draw_binary_mask(mask, color=color, text=texts) - demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode) - res = demo.get_image() - point_x0=max(0,int(point_[0, 1])-3) - point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3) - point_y0 = max(0, int(point_[0, 0]) - 3) - point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3) - # res[point_y0:point_y1,point_x0:point_x1,0]=255 - # res[point_y0:point_y1,point_x0:point_x1,1]=0 - # res[point_y0:point_y1,point_x0:point_x1,2]=0 - reses.append(Image.fromarray(res)) - text_res=text_res+';'+out_txt - ids=list(torch.argsort(torch.tensor(areas),descending=False)) - ids = [int(i) for i in ids] - - torch.cuda.empty_cache() - - return reses,[reses[i] for i in ids] - -def interactive_infer_image_3l(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image['image']) - mask_ori = transform1(image['mask']) - width = image_ori.size[0] - height = image_ori.size[1] - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':') - - - data = {"image": images, "height": height, "width": width} - - mask_ori = np.asarray(mask_ori)[:,:,0:1].copy() - mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0] - points=mask_ori.nonzero().float().to(images.device) - if len(points)==0: - point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]]) - else: - point_=points.mean(0)[None] - point=point_.clone() - point[0, 0] = point_[0, 0] / mask_ori.shape[0] - point[0, 1] = point_[0, 1] / mask_ori.shape[1] - point = point[:, [1, 0]] - point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1) - data['targets'] = [dict()] - data['targets'][0]['points']=point - data['targets'][0]['pb']=point.new_tensor([0.]) - - - batch_inputs = [data] - masks, ious, pred_class, pred_class_score = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, level=[0,1,2]) - - pred_masks_poses = masks - reses=[] - ious=ious[0,0] - ids=torch.argsort(ious,descending=True) - - text_res='' - try: - thresh=float(thresh) - except Exception: - thresh=0.0 - mask_ls=[] - ious_res=[] - areas=[] - new_pred_class = [] - new_pred_class_score = [] - for i in ids: - new_pred_class_score.append(pred_class_score[i]) - new_pred_class.append(pred_class[i]) - # import ipdb; ipdb.set_trace() - for i,(pred_masks_pos,iou, cls_name, cls_score) in enumerate(zip(pred_masks_poses[ids],ious[ids], new_pred_class, new_pred_class_score)): - iou=round(float(iou),2) - texts=f'{iou}_{cls_name}_{cls_score}' - mask=(pred_masks_pos>0.0).cpu().numpy() - area=mask.sum() - conti=False - if iou0.95: - conti=True - break - if i == len(pred_masks_poses[ids])-1 and mask_ls==[]: - conti=False - if 
conti: - continue - ious_res.append(iou) - mask_ls.append(mask) - areas.append(area) - mask,_=remove_small_regions(mask,int(hole_scale),mode="holes") - mask,_=remove_small_regions(mask,int(island_scale),mode="islands") - mask=(mask).astype(np.float) - out_txt = texts - visual = Visualizer(image_ori, metadata=metadata) - color=[0.,0.,1.0] - demo = visual.draw_binary_mask(mask, color=color, text=texts) - res = demo.get_image() - point_x0=max(0,int(point_[0, 1])-3) - point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3) - point_y0 = max(0, int(point_[0, 0]) - 3) - point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3) - res[point_y0:point_y1,point_x0:point_x1,0]=255 - res[point_y0:point_y1,point_x0:point_x1,1]=0 - res[point_y0:point_y1,point_x0:point_x1,2]=0 - reses.append(Image.fromarray(res)) - text_res=text_res+';'+out_txt - ids=list(torch.argsort(torch.tensor(areas),descending=False)) - ids = [int(i) for i in ids] - - torch.cuda.empty_cache() - - return reses,[reses[i] for i in ids] - -def interactive_infer_image_semantic(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None): - t = [] - t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC)) - transform1 = transforms.Compose(t) - image_ori = transform1(image['image']) - mask_ori = transform1(image['mask']) - width = image_ori.size[0] - height = image_ori.size[1] - image_ori = np.asarray(image_ori) - images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda() - all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':') - - - data = {"image": images, "height": height, "width": width} - - mask_ori = np.asarray(mask_ori)[:,:,0:1].copy() - mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0] - points=mask_ori.nonzero().float().to(images.device) - if len(points)==0: - point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]]) - else: - point_=points.mean(0)[None] - point=point_.clone() - point[0, 0] = point_[0, 0] / mask_ori.shape[0] - point[0, 1] = point_[0, 1] / mask_ori.shape[1] - point = point[:, [1, 0]] - point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1) - data['targets'] = [dict()] - data['targets'][0]['points']=point - data['targets'][0]['pb']=point.new_tensor([0.]) - data['targets'][0]['pb']=point.new_tensor([1.]) - - - batch_inputs = [data] - masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts) - - pred_masks_poses = masks - reses=[] - ious=ious[0,0] - ids=torch.argsort(ious,descending=True) - - text_res='' - try: - thresh=float(thresh) - except Exception: - thresh=0.0 - mask_ls=[] - ious_res=[] - areas=[] - for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])): - iou=round(float(iou),2) - texts=f'{iou}' - mask=(pred_masks_pos>0.0).cpu().numpy() - area=mask.sum() - conti=False - if iou0.95: - conti=True - break - if i == len(pred_masks_poses[ids])-1 and mask_ls==[]: - conti=False - if conti: - continue - ious_res.append(iou) - mask_ls.append(mask) - areas.append(area) - mask,_=remove_small_regions(mask,int(hole_scale),mode="holes") - mask,_=remove_small_regions(mask,int(island_scale),mode="islands") - mask=(mask).astype(np.float) - out_txt = texts - visual = Visualizer(image_ori, metadata=metadata) - color=[0.,0.,1.0] - demo = visual.draw_binary_mask(mask, color=color, text=texts) - res = demo.get_image() - point_x0=max(0,int(point_[0, 1])-3) - point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3) - point_y0 = 
max(0, int(point_[0, 0]) - 3) - point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3) - res[point_y0:point_y1,point_x0:point_x1,0]=255 - res[point_y0:point_y1,point_x0:point_x1,1]=0 - res[point_y0:point_y1,point_x0:point_x1,2]=0 - reses.append(Image.fromarray(res)) - text_res=text_res+';'+out_txt - ids=list(torch.argsort(torch.tensor(areas),descending=False)) - ids = [int(i) for i in ids] - - torch.cuda.empty_cache() - - return reses,[reses[i] for i in ids] - -def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str -) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. - """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True \ No newline at end of file diff --git a/mm_agents/task_adapter/semantic_sam/tasks/interactive_predictor.py b/mm_agents/task_adapter/semantic_sam/tasks/interactive_predictor.py deleted file mode 100644 index 70252a3..0000000 --- a/mm_agents/task_adapter/semantic_sam/tasks/interactive_predictor.py +++ /dev/null @@ -1,139 +0,0 @@ -import torch -import numpy as np -from torchvision import transforms -from task_adapter.utils.visualizer import Visualizer -from typing import Tuple -from PIL import Image -from detectron2.data import MetadataCatalog -metadata = MetadataCatalog.get('coco_2017_train_panoptic') - - -class SemanticSAMPredictor: - def __init__(self, model, thresh=0.5, text_size=640, hole_scale=100, island_scale=100): - """ - thresh: iou thresh to filter low confidence objects - text_size: resize the input image short edge for the model to process - hole_scale: fill in small holes as in SAM - island_scale: remove small regions as in SAM - """ - self.model = model - self.thresh = thresh - self.text_size = hole_scale - self.hole_scale = hole_scale - self.island_scale = island_scale - self.point = None - - def predict(self, image_ori, image, point=None): - """ - produce up to 6 prediction results for each click - """ - width = image_ori.shape[0] - height = image_ori.shape[1] - - data = {"image": image, "height": height, "width": width} - # import ipdb; ipdb.set_trace() - if point is None: - point = torch.tensor([[0.5, 0.5, 0.006, 0.006]]).cuda() - else: - point = torch.tensor(point).cuda() - point_ = point - point = point_.clone() - point[0, 0] = point_[0, 0] - point[0, 1] = point_[0, 1] - # point = point[:, [1, 0]] - point = torch.cat([point, point.new_tensor([[0.005, 0.005]])], dim=-1) - - self.point = point[:, :2].clone()*(torch.tensor([width, height]).to(point)) - - data['targets'] = [dict()] - data['targets'][0]['points'] = point - data['targets'][0]['pb'] = point.new_tensor([0.]) - - batch_inputs = [data] - masks, ious = self.model.model.evaluate_demo(batch_inputs) - - return masks, ious - - def process_multi_mask(self, masks, ious, image_ori): - pred_masks_poses 
= masks - reses = [] - ious = ious[0, 0] - ids = torch.argsort(ious, descending=True) - - text_res = '' - mask_ls = [] - ious_res = [] - areas = [] - for i, (pred_masks_pos, iou) in enumerate(zip(pred_masks_poses[ids], ious[ids])): - iou = round(float(iou), 2) - texts = f'{iou}' - mask = (pred_masks_pos > 0.0).cpu().numpy() - area = mask.sum() - conti = False - if iou < self.thresh: - conti = True - for m in mask_ls: - if np.logical_and(mask, m).sum() / np.logical_or(mask, m).sum() > 0.95: - conti = True - break - if i == len(pred_masks_poses[ids]) - 1 and mask_ls == []: - conti = False - if conti: - continue - ious_res.append(iou) - mask_ls.append(mask) - areas.append(area) - mask, _ = self.remove_small_regions(mask, int(self.hole_scale), mode="holes") - mask, _ = self.remove_small_regions(mask, int(self.island_scale), mode="islands") - mask = (mask).astype(np.float) - out_txt = texts - visual = Visualizer(image_ori, metadata=metadata) - color = [0., 0., 1.0] - demo = visual.draw_binary_mask(mask, color=color, text=texts) - res = demo.get_image() - point_x0 = max(0, int(self.point[0, 0]) - 3) - point_x1 = min(image_ori.shape[1], int(self.point[0, 0]) + 3) - point_y0 = max(0, int(self.point[0, 1]) - 3) - point_y1 = min(image_ori.shape[0], int(self.point[0, 1]) + 3) - res[point_y0:point_y1, point_x0:point_x1, 0] = 255 - res[point_y0:point_y1, point_x0:point_x1, 1] = 0 - res[point_y0:point_y1, point_x0:point_x1, 2] = 0 - reses.append(Image.fromarray(res)) - text_res = text_res + ';' + out_txt - ids = list(torch.argsort(torch.tensor(areas), descending=False)) - ids = [int(i) for i in ids] - - torch.cuda.empty_cache() - - return reses, [reses[i] for i in ids] - - def predict_masks(self, image_ori, image, point=None): - masks, ious = self.predict(image_ori, image, point) - return self.process_multi_mask(masks, ious, image_ori) - - @staticmethod - def remove_small_regions( - mask: np.ndarray, area_thresh: float, mode: str - ) -> Tuple[np.ndarray, bool]: - """ - Removes small disconnected regions and holes in a mask. Returns the - mask and an indicator of if the mask has been modified. - """ - import cv2 # type: ignore - - assert mode in ["holes", "islands"] - correct_holes = mode == "holes" - working_mask = (correct_holes ^ mask).astype(np.uint8) - n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8) - sizes = stats[:, -1][1:] # Row 0 is background label - small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] - if len(small_regions) == 0: - return mask, False - fill_labels = [0] + small_regions - if not correct_holes: - fill_labels = [i for i in range(n_labels) if i not in fill_labels] - # If every region is below threshold, keep largest - if len(fill_labels) == 0: - fill_labels = [int(np.argmax(sizes)) + 1] - mask = np.isin(regions, fill_labels) - return mask, True diff --git a/mm_agents/task_adapter/utils/visualizer.py b/mm_agents/task_adapter/utils/visualizer.py deleted file mode 100644 index bd78a98..0000000 --- a/mm_agents/task_adapter/utils/visualizer.py +++ /dev/null @@ -1,1405 +0,0 @@ -# Copyright (c) Facebook, Inc. and its affiliates. 
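(Editorial aside; the following is not part of the patch.) Every task adapter removed above ships the same `remove_small_regions` helper: it labels connected components with `cv2.connectedComponentsWithStats` and either fills small holes (`mode="holes"`) or deletes small disconnected islands (`mode="islands"`); the adapters always apply it in that order, driven by the `hole_scale` and `island_scale` arguments. A minimal, runnable sketch of that two-pass cleanup follows. The helper body is reproduced from the removed files (docstring shortened); the 64×64 demo mask and the 100 px thresholds are invented purely for illustration.

```python
# Editorial sketch only. `remove_small_regions` is reproduced from the removed
# adapter files above; the demo mask and thresholds are made up for illustration.
from typing import Tuple

import cv2
import numpy as np


def remove_small_regions(mask: np.ndarray, area_thresh: float, mode: str) -> Tuple[np.ndarray, bool]:
    """Remove small disconnected regions ("islands") or holes; return (mask, changed)."""
    assert mode in ["holes", "islands"]
    correct_holes = mode == "holes"
    working_mask = (correct_holes ^ mask).astype(np.uint8)  # invert the mask when hunting holes
    n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
    sizes = stats[:, -1][1:]  # row 0 is the background label
    small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
    if len(small_regions) == 0:
        return mask, False
    fill_labels = [0] + small_regions
    if not correct_holes:
        fill_labels = [i for i in range(n_labels) if i not in fill_labels]
    if len(fill_labels) == 0:  # every region was below the threshold: keep the largest
        fill_labels = [int(np.argmax(sizes)) + 1]
    return np.isin(regions, fill_labels), True


# A blob with a small hole and a stray pixel, cleaned the way the adapters do it:
# fill small holes first, then drop small islands (both thresholds at 100 px here).
demo = np.zeros((64, 64), dtype=bool)
demo[8:40, 8:40] = True
demo[20:22, 20:22] = False  # 2x2 hole, filled by mode="holes"
demo[60, 60] = True         # 1-px island, removed by mode="islands"
demo, _ = remove_small_regions(demo, 100, mode="holes")
demo, _ = remove_small_regions(demo, 100, mode="islands")
assert demo[20, 20] and not demo[60, 60]
```

In the adapters, the cleaned masks are then cast to float and handed to the `Visualizer` (whose deletion begins here) for drawing.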
-import colorsys -import logging -import math -import numpy as np -from enum import Enum, unique -import cv2 -import matplotlib as mpl -import matplotlib.colors as mplc -import matplotlib.figure as mplfigure -import pycocotools.mask as mask_util -import torch -from matplotlib.backends.backend_agg import FigureCanvasAgg -from PIL import Image - -from detectron2.data import MetadataCatalog -from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes -from detectron2.utils.file_io import PathManager - -from detectron2.utils.colormap import random_color -import random - -logger = logging.getLogger(__name__) - -__all__ = ["ColorMode", "VisImage", "Visualizer"] - - -_SMALL_OBJECT_AREA_THRESH = 1000 -_LARGE_MASK_AREA_THRESH = 120000 -_OFF_WHITE = (1.0, 1.0, 240.0 / 255) -_BLACK = (0, 0, 0) -_RED = (1.0, 0, 0) - -_KEYPOINT_THRESHOLD = 0.05 - - -@unique -class ColorMode(Enum): - """ - Enum of different color modes to use for instance visualizations. - """ - - IMAGE = 0 - """ - Picks a random color for every instance and overlay segmentations with low opacity. - """ - SEGMENTATION = 1 - """ - Let instances of the same category have similar colors - (from metadata.thing_colors), and overlay them with - high opacity. This provides more attention on the quality of segmentation. - """ - IMAGE_BW = 2 - """ - Same as IMAGE, but convert all areas without masks to gray-scale. - Only available for drawing per-instance mask predictions. - """ - - -class GenericMask: - """ - Attribute: - polygons (list[ndarray]): list[ndarray]: polygons for this mask. - Each ndarray has format [x, y, x, y, ...] - mask (ndarray): a binary mask - """ - - def __init__(self, mask_or_polygons, height, width): - self._mask = self._polygons = self._has_holes = None - self.height = height - self.width = width - - m = mask_or_polygons - if isinstance(m, dict): - # RLEs - assert "counts" in m and "size" in m - if isinstance(m["counts"], list): # uncompressed RLEs - h, w = m["size"] - assert h == height and w == width - m = mask_util.frPyObjects(m, h, w) - self._mask = mask_util.decode(m)[:, :] - return - - if isinstance(m, list): # list[ndarray] - self._polygons = [np.asarray(x).reshape(-1) for x in m] - return - - if isinstance(m, np.ndarray): # assumed to be a binary mask - assert m.shape[1] != 2, m.shape - assert m.shape == ( - height, - width, - ), f"mask shape: {m.shape}, target dims: {height}, {width}" - self._mask = m.astype("uint8") - return - - raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m))) - - @property - def mask(self): - if self._mask is None: - self._mask = self.polygons_to_mask(self._polygons) - return self._mask - - @property - def polygons(self): - if self._polygons is None: - self._polygons, self._has_holes = self.mask_to_polygons(self._mask) - return self._polygons - - @property - def has_holes(self): - if self._has_holes is None: - if self._mask is not None: - self._polygons, self._has_holes = self.mask_to_polygons(self._mask) - else: - self._has_holes = False # if original format is polygon, does not have holes - return self._has_holes - - def mask_to_polygons(self, mask): - # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level - # hierarchy. External contours (boundary) of the object are placed in hierarchy-1. - # Internal contours (holes) are placed in hierarchy-2. - # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours. 
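(Editorial aside; the following is not part of the patch.) The removed interactive helpers (`interactive_infer_image*` and `SemanticSAMPredictor.process_multi_mask` above) share one selection step for the model's candidate masks: sort by predicted IoU, drop low-confidence candidates, and skip any mask whose overlap with an already kept mask exceeds 0.95. The NumPy sketch below reconstructs only that step; `select_masks`, its default thresholds, and the fallback choice of the single best candidate are illustrative assumptions, not identifiers or behaviour guaranteed by the repository.

```python
# Editorial reconstruction of the candidate-mask selection used by the removed
# helpers; `select_masks` is a hypothetical name, not an identifier from the repo.
from typing import List, Tuple

import numpy as np


def select_masks(
    masks: np.ndarray,         # (N, H, W) boolean candidate masks from the model
    ious: np.ndarray,          # (N,) predicted IoU score per candidate
    iou_thresh: float = 0.5,   # drop low-confidence candidates
    dup_thresh: float = 0.95,  # drop near-duplicates of already kept masks
) -> Tuple[List[np.ndarray], List[float]]:
    """Return the kept masks and their scores, best predicted IoU first."""
    kept: List[np.ndarray] = []
    scores: List[float] = []
    for idx in np.argsort(-ious):  # highest predicted IoU first
        mask, score = masks[idx].astype(bool), float(ious[idx])
        if score < iou_thresh:
            continue
        overlaps = (
            np.logical_and(mask, m).sum() / max(np.logical_or(mask, m).sum(), 1)
            for m in kept
        )
        if any(o > dup_thresh for o in overlaps):
            continue
        kept.append(mask)
        scores.append(score)
    if not kept and len(ious) > 0:  # fallback assumption: keep the single best candidate
        best = int(np.argmax(ious))
        kept, scores = [masks[best].astype(bool)], [float(ious[best])]
    return kept, scores
```

In the removed code each kept mask is then cleaned with `remove_small_regions` (holes, then islands) and rendered through the `Visualizer` being deleted here; the per-mask renderings are also returned a second time, reordered by ascending mask area.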
- mask = np.ascontiguousarray(mask) # some versions of cv2 does not support incontiguous arr - res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE) - hierarchy = res[-1] - if hierarchy is None: # empty mask - return [], False - has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0 - res = res[-2] - res = [x.flatten() for x in res] - # These coordinates from OpenCV are integers in range [0, W-1 or H-1]. - # We add 0.5 to turn them into real-value coordinate space. A better solution - # would be to first +0.5 and then dilate the returned polygon by 0.5. - res = [x + 0.5 for x in res if len(x) >= 6] - return res, has_holes - - def polygons_to_mask(self, polygons): - rle = mask_util.frPyObjects(polygons, self.height, self.width) - rle = mask_util.merge(rle) - return mask_util.decode(rle)[:, :] - - def area(self): - return self.mask.sum() - - def bbox(self): - p = mask_util.frPyObjects(self.polygons, self.height, self.width) - p = mask_util.merge(p) - bbox = mask_util.toBbox(p) - bbox[2] += bbox[0] - bbox[3] += bbox[1] - return bbox - - -class _PanopticPrediction: - """ - Unify different panoptic annotation/prediction formats - """ - - def __init__(self, panoptic_seg, segments_info, metadata=None): - if segments_info is None: - assert metadata is not None - # If "segments_info" is None, we assume "panoptic_img" is a - # H*W int32 image storing the panoptic_id in the format of - # category_id * label_divisor + instance_id. We reserve -1 for - # VOID label. - label_divisor = metadata.label_divisor - segments_info = [] - for panoptic_label in np.unique(panoptic_seg.numpy()): - if panoptic_label == -1: - # VOID region. - continue - pred_class = panoptic_label // label_divisor - isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values() - segments_info.append( - { - "id": int(panoptic_label), - "category_id": int(pred_class), - "isthing": bool(isthing), - } - ) - del metadata - - self._seg = panoptic_seg - - self._sinfo = {s["id"]: s for s in segments_info} # seg id -> seg info - segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True) - areas = areas.numpy() - sorted_idxs = np.argsort(-areas) - self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs] - self._seg_ids = self._seg_ids.tolist() - for sid, area in zip(self._seg_ids, self._seg_areas): - if sid in self._sinfo: - self._sinfo[sid]["area"] = float(area) - - def non_empty_mask(self): - """ - Returns: - (H, W) array, a mask for all pixels that have a prediction - """ - empty_ids = [] - for id in self._seg_ids: - if id not in self._sinfo: - empty_ids.append(id) - if len(empty_ids) == 0: - return np.zeros(self._seg.shape, dtype=np.uint8) - assert ( - len(empty_ids) == 1 - ), ">1 ids corresponds to no labels. This is currently not supported" - return (self._seg != empty_ids[0]).numpy().astype(np.bool) - - def semantic_masks(self): - for sid in self._seg_ids: - sinfo = self._sinfo.get(sid) - if sinfo is None or sinfo["isthing"]: - # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions. 
- continue - yield (self._seg == sid).numpy().astype(np.bool), sinfo - - def instance_masks(self): - for sid in self._seg_ids: - sinfo = self._sinfo.get(sid) - if sinfo is None or not sinfo["isthing"]: - continue - mask = (self._seg == sid).numpy().astype(np.bool) - if mask.sum() > 0: - yield mask, sinfo - - -def _create_text_labels(classes, scores, class_names, is_crowd=None): - """ - Args: - classes (list[int] or None): - scores (list[float] or None): - class_names (list[str] or None): - is_crowd (list[bool] or None): - - Returns: - list[str] or None - """ - labels = None - if classes is not None: - if class_names is not None and len(class_names) > 0: - labels = [class_names[i] for i in classes] - else: - labels = [str(i) for i in classes] - if scores is not None: - if labels is None: - labels = ["{:.0f}%".format(s * 100) for s in scores] - else: - labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)] - if labels is not None and is_crowd is not None: - labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)] - return labels - - -class VisImage: - def __init__(self, img, scale=1.0): - """ - Args: - img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255]. - scale (float): scale the input image - """ - self.img = img - self.scale = scale - self.width, self.height = img.shape[1], img.shape[0] - self._setup_figure(img) - - def _setup_figure(self, img): - """ - Args: - Same as in :meth:`__init__()`. - - Returns: - fig (matplotlib.pyplot.figure): top level container for all the image plot elements. - ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system. - """ - fig = mplfigure.Figure(frameon=False) - self.dpi = fig.get_dpi() - # add a small 1e-2 to avoid precision lost due to matplotlib's truncation - # (https://github.com/matplotlib/matplotlib/issues/15363) - fig.set_size_inches( - (self.width * self.scale + 1e-2) / self.dpi, - (self.height * self.scale + 1e-2) / self.dpi, - ) - self.canvas = FigureCanvasAgg(fig) - # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig) - ax = fig.add_axes([0.0, 0.0, 1.0, 1.0]) - ax.axis("off") - self.fig = fig - self.ax = ax - self.reset_image(img) - - def reset_image(self, img): - """ - Args: - img: same as in __init__ - """ - img = img.astype("uint8") - self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest") - - def save(self, filepath): - """ - Args: - filepath (str): a string that contains the absolute path, including the file name, where - the visualized image will be saved. - """ - self.fig.savefig(filepath) - - def get_image(self): - """ - Returns: - ndarray: - the visualized image of shape (H, W, 3) (RGB) in uint8 type. - The shape is scaled w.r.t the input image using the given `scale` argument. - """ - canvas = self.canvas - s, (width, height) = canvas.print_to_buffer() - # buf = io.BytesIO() # works for cairo backend - # canvas.print_rgba(buf) - # width, height = self.width, self.height - # s = buf.getvalue() - - buffer = np.frombuffer(s, dtype="uint8") - - img_rgba = buffer.reshape(height, width, 4) - rgb, alpha = np.split(img_rgba, [3], axis=2) - return rgb.astype("uint8") - - -class Visualizer: - """ - Visualizer that draws data about detection/segmentation on images. 
- - It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}` - that draw primitive objects to images, as well as high-level wrappers like - `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}` - that draw composite data in some pre-defined style. - - Note that the exact visualization style for the high-level wrappers are subject to change. - Style such as color, opacity, label contents, visibility of labels, or even the visibility - of objects themselves (e.g. when the object is too small) may change according - to different heuristics, as long as the results still look visually reasonable. - - To obtain a consistent style, you can implement custom drawing functions with the - abovementioned primitive methods instead. If you need more customized visualization - styles, you can process the data yourself following their format documented in - tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not - intend to satisfy everyone's preference on drawing styles. - - This visualizer focuses on high rendering quality rather than performance. It is not - designed to be used for real-time applications. - """ - - # TODO implement a fast, rasterized version using OpenCV - - def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE): - """ - Args: - img_rgb: a numpy array of shape (H, W, C), where H and W correspond to - the height and width of the image respectively. C is the number of - color channels. The image is required to be in RGB format since that - is a requirement of the Matplotlib library. The image is also expected - to be in the range [0, 255]. - metadata (Metadata): dataset metadata (e.g. class names and colors) - instance_mode (ColorMode): defines one of the pre-defined style for drawing - instances on an image. - """ - self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8) - if metadata is None: - metadata = MetadataCatalog.get("__nonexist__") - self.metadata = metadata - self.output = VisImage(self.img, scale=scale) - self.cpu_device = torch.device("cpu") - - # too small texts are useless, therefore clamp to 9 - self._default_font_size = max( - np.sqrt(self.output.height * self.output.width) // 90, 10 // scale - ) - self._default_font_size = 18 - self._instance_mode = instance_mode - self.keypoint_threshold = _KEYPOINT_THRESHOLD - - import matplotlib.colors as mcolors - css4_colors = mcolors.CSS4_COLORS - self.color_proposals = [list(mcolors.hex2color(color)) for color in css4_colors.values()] - - def draw_instance_predictions(self, predictions): - """ - Draw instance-level prediction results on an image. - - Args: - predictions (Instances): the output of an instance detection/segmentation - model. Following fields will be used to draw: - "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle"). - - Returns: - output (VisImage): image object with visualizations. 
- """ - boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None - scores = predictions.scores if predictions.has("scores") else None - classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None - labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None)) - keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None - - keep = (scores > 0.5).cpu() - boxes = boxes[keep] - scores = scores[keep] - classes = np.array(classes) - classes = classes[np.array(keep)] - labels = np.array(labels) - labels = labels[np.array(keep)] - - if predictions.has("pred_masks"): - masks = np.asarray(predictions.pred_masks) - masks = masks[np.array(keep)] - masks = [GenericMask(x, self.output.height, self.output.width) for x in masks] - else: - masks = None - - if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): - # if self.metadata.get("thing_colors"): - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes - ] - alpha = 0.4 - else: - colors = None - alpha = 0.4 - - if self._instance_mode == ColorMode.IMAGE_BW: - self.output.reset_image( - self._create_grayscale_image( - (predictions.pred_masks.any(dim=0) > 0).numpy() - if predictions.has("pred_masks") - else None - ) - ) - alpha = 0.3 - - self.overlay_instances( - masks=masks, - boxes=boxes, - labels=labels, - keypoints=keypoints, - assigned_colors=colors, - alpha=alpha, - ) - return self.output - - def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.7): - """ - Draw semantic segmentation predictions/labels. - - Args: - sem_seg (Tensor or ndarray): the segmentation of shape (H, W). - Each value is the integer label of the pixel. - area_threshold (int): segments with less than `area_threshold` are not drawn. - alpha (float): the larger it is, the more opaque the segmentations are. - - Returns: - output (VisImage): image object with visualizations. - """ - if isinstance(sem_seg, torch.Tensor): - sem_seg = sem_seg.numpy() - labels, areas = np.unique(sem_seg, return_counts=True) - sorted_idxs = np.argsort(-areas).tolist() - labels = labels[sorted_idxs] - for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels): - try: - mask_color = [x / 255 for x in self.metadata.stuff_colors[label]] - except (AttributeError, IndexError): - mask_color = None - - binary_mask = (sem_seg == label).astype(np.uint8) - text = self.metadata.stuff_classes[label] - self.draw_binary_mask( - binary_mask, - color=mask_color, - edge_color=_OFF_WHITE, - text=text, - alpha=alpha, - area_threshold=area_threshold, - ) - return self.output - - def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7): - """ - Draw panoptic prediction annotations or results. - - Args: - panoptic_seg (Tensor): of shape (height, width) where the values are ids for each - segment. - segments_info (list[dict] or None): Describe each segment in `panoptic_seg`. - If it is a ``list[dict]``, each dict contains keys "id", "category_id". - If None, category id of each pixel is computed by - ``pixel // metadata.label_divisor``. - area_threshold (int): stuff segments with less than `area_threshold` are not drawn. - - Returns: - output (VisImage): image object with visualizations. 
- """ - pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata) - - if self._instance_mode == ColorMode.IMAGE_BW: - self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask())) - - # draw mask for all semantic segments first i.e. "stuff" - for mask, sinfo in pred.semantic_masks(): - category_idx = sinfo["category_id"] - try: - mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]] - except AttributeError: - mask_color = None - - text = self.metadata.stuff_classes[category_idx].replace('-other','').replace('-merged','') - self.draw_binary_mask( - mask, - color=mask_color, - edge_color=_OFF_WHITE, - text=text, - alpha=alpha, - area_threshold=area_threshold, - ) - - # draw mask for all instances second - all_instances = list(pred.instance_masks()) - if len(all_instances) == 0: - return self.output - masks, sinfo = list(zip(*all_instances)) - category_ids = [x["category_id"] for x in sinfo] - - try: - scores = [x["score"] for x in sinfo] - except KeyError: - scores = None - class_names = [name.replace('-other','').replace('-merged','') for name in self.metadata.thing_classes] - labels = _create_text_labels( - category_ids, scores, class_names, [x.get("iscrowd", 0) for x in sinfo] - ) - - try: - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids - ] - except AttributeError: - colors = None - self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha) - - return self.output - - draw_panoptic_seg_predictions = draw_panoptic_seg # backward compatibility - - def draw_dataset_dict(self, dic): - """ - Draw annotations/segmentaions in Detectron2 Dataset format. - - Args: - dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format. - - Returns: - output (VisImage): image object with visualizations. 
- """ - annos = dic.get("annotations", None) - if annos: - if "segmentation" in annos[0]: - masks = [x["segmentation"] for x in annos] - else: - masks = None - if "keypoints" in annos[0]: - keypts = [x["keypoints"] for x in annos] - keypts = np.array(keypts).reshape(len(annos), -1, 3) - else: - keypts = None - - boxes = [ - BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS) - if len(x["bbox"]) == 4 - else x["bbox"] - for x in annos - ] - - colors = None - category_ids = [x["category_id"] for x in annos] - if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"): - colors = [ - self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) - for c in category_ids - ] - names = self.metadata.get("thing_classes", None) - labels = _create_text_labels( - category_ids, - scores=None, - class_names=names, - is_crowd=[x.get("iscrowd", 0) for x in annos], - ) - self.overlay_instances( - labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors - ) - - sem_seg = dic.get("sem_seg", None) - if sem_seg is None and "sem_seg_file_name" in dic: - with PathManager.open(dic["sem_seg_file_name"], "rb") as f: - sem_seg = Image.open(f) - sem_seg = np.asarray(sem_seg, dtype="uint8") - if sem_seg is not None: - self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.4) - - pan_seg = dic.get("pan_seg", None) - if pan_seg is None and "pan_seg_file_name" in dic: - with PathManager.open(dic["pan_seg_file_name"], "rb") as f: - pan_seg = Image.open(f) - pan_seg = np.asarray(pan_seg) - from panopticapi.utils import rgb2id - - pan_seg = rgb2id(pan_seg) - if pan_seg is not None: - segments_info = dic["segments_info"] - pan_seg = torch.tensor(pan_seg) - self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.7) - return self.output - - def overlay_instances( - self, - *, - boxes=None, - labels=None, - masks=None, - keypoints=None, - assigned_colors=None, - alpha=0.5, - ): - """ - Args: - boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`, - or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image, - or a :class:`RotatedBoxes`, - or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format - for the N objects in a single image, - labels (list[str]): the text to be displayed for each instance. - masks (masks-like object): Supported types are: - - * :class:`detectron2.structures.PolygonMasks`, - :class:`detectron2.structures.BitMasks`. - * list[list[ndarray]]: contains the segmentation masks for all objects in one image. - The first level of the list corresponds to individual instances. The second - level to all the polygon that compose the instance, and the third level - to the polygon coordinates. The third level should have the format of - [x0, y0, x1, y1, ..., xn, yn] (n >= 3). - * list[ndarray]: each ndarray is a binary mask of shape (H, W). - * list[dict]: each dict is a COCO-style RLE. - keypoints (Keypoint or array like): an array-like object of shape (N, K, 3), - where the N is the number of instances and K is the number of keypoints. - The last dimension corresponds to (x, y, visibility or score). - assigned_colors (list[matplotlib.colors]): a list of colors, where each color - corresponds to each mask or box in the image. Refer to 'matplotlib.colors' - for full list of formats that the colors are accepted in. - Returns: - output (VisImage): image object with visualizations. 
- """ - num_instances = 0 - if boxes is not None: - boxes = self._convert_boxes(boxes) - num_instances = len(boxes) - if masks is not None: - masks = self._convert_masks(masks) - if num_instances: - assert len(masks) == num_instances - else: - num_instances = len(masks) - if keypoints is not None: - if num_instances: - assert len(keypoints) == num_instances - else: - num_instances = len(keypoints) - keypoints = self._convert_keypoints(keypoints) - if labels is not None: - assert len(labels) == num_instances - if assigned_colors is None: - assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] - if num_instances == 0: - return self.output - if boxes is not None and boxes.shape[1] == 5: - return self.overlay_rotated_instances( - boxes=boxes, labels=labels, assigned_colors=assigned_colors - ) - - # Display in largest to smallest order to reduce occlusion. - areas = None - if boxes is not None: - areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1) - elif masks is not None: - areas = np.asarray([x.area() for x in masks]) - - if areas is not None: - sorted_idxs = np.argsort(-areas).tolist() - # Re-order overlapped instances in descending order. - boxes = boxes[sorted_idxs] if boxes is not None else None - labels = [labels[k] for k in sorted_idxs] if labels is not None else None - masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None - assigned_colors = [assigned_colors[idx] for idx in sorted_idxs] - keypoints = keypoints[sorted_idxs] if keypoints is not None else None - - for i in range(num_instances): - color = assigned_colors[i] - if boxes is not None: - self.draw_box(boxes[i], edge_color=color) - - if masks is not None: - for segment in masks[i].polygons: - self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha) - - if labels is not None: - # first get a box - if boxes is not None: - x0, y0, x1, y1 = boxes[i] - text_pos = (x0, y0) # if drawing boxes, put text on the box corner. - horiz_align = "left" - elif masks is not None: - # skip small mask without polygon - if len(masks[i].polygons) == 0: - continue - - x0, y0, x1, y1 = masks[i].bbox() - - # draw text in the center (defined by median) when box is not drawn - # median is less sensitive to outliers. - text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1] - horiz_align = "center" - else: - continue # drawing the box confidence for keypoints isn't very useful. - # for small objects, draw text at the side to avoid occlusion - instance_area = (y1 - y0) * (x1 - x0) - if ( - instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale - or y1 - y0 < 40 * self.output.scale - ): - if y1 >= self.output.height - 5: - text_pos = (x1, y0) - else: - text_pos = (x0, y1) - - height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width) - lighter_color = self._change_color_brightness(color, brightness_factor=0.7) - font_size = ( - np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) - * 0.5 - * self._default_font_size - ) - self.draw_text( - labels[i], - text_pos, - color=lighter_color, - horizontal_alignment=horiz_align, - font_size=font_size, - ) - - # draw keypoints - if keypoints is not None: - for keypoints_per_instance in keypoints: - self.draw_and_connect_keypoints(keypoints_per_instance) - - return self.output - - def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None): - """ - Args: - boxes (ndarray): an Nx5 numpy array of - (x_center, y_center, width, height, angle_degrees) format - for the N objects in a single image. 
- labels (list[str]): the text to be displayed for each instance. - assigned_colors (list[matplotlib.colors]): a list of colors, where each color - corresponds to each mask or box in the image. Refer to 'matplotlib.colors' - for full list of formats that the colors are accepted in. - - Returns: - output (VisImage): image object with visualizations. - """ - num_instances = len(boxes) - - if assigned_colors is None: - assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)] - if num_instances == 0: - return self.output - - # Display in largest to smallest order to reduce occlusion. - if boxes is not None: - areas = boxes[:, 2] * boxes[:, 3] - - sorted_idxs = np.argsort(-areas).tolist() - # Re-order overlapped instances in descending order. - boxes = boxes[sorted_idxs] - labels = [labels[k] for k in sorted_idxs] if labels is not None else None - colors = [assigned_colors[idx] for idx in sorted_idxs] - - for i in range(num_instances): - self.draw_rotated_box_with_label( - boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None - ) - - return self.output - - def draw_and_connect_keypoints(self, keypoints): - """ - Draws keypoints of an instance and follows the rules for keypoint connections - to draw lines between appropriate keypoints. This follows color heuristics for - line color. - - Args: - keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints - and the last dimension corresponds to (x, y, probability). - - Returns: - output (VisImage): image object with visualizations. - """ - visible = {} - keypoint_names = self.metadata.get("keypoint_names") - for idx, keypoint in enumerate(keypoints): - - # draw keypoint - x, y, prob = keypoint - if prob > self.keypoint_threshold: - self.draw_circle((x, y), color=_RED) - if keypoint_names: - keypoint_name = keypoint_names[idx] - visible[keypoint_name] = (x, y) - - if self.metadata.get("keypoint_connection_rules"): - for kp0, kp1, color in self.metadata.keypoint_connection_rules: - if kp0 in visible and kp1 in visible: - x0, y0 = visible[kp0] - x1, y1 = visible[kp1] - color = tuple(x / 255.0 for x in color) - self.draw_line([x0, x1], [y0, y1], color=color) - - # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip - # Note that this strategy is specific to person keypoints. - # For other keypoints, it should just do nothing - try: - ls_x, ls_y = visible["left_shoulder"] - rs_x, rs_y = visible["right_shoulder"] - mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2 - except KeyError: - pass - else: - # draw line from nose to mid-shoulder - nose_x, nose_y = visible.get("nose", (None, None)) - if nose_x is not None: - self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED) - - try: - # draw line from mid-shoulder to mid-hip - lh_x, lh_y = visible["left_hip"] - rh_x, rh_y = visible["right_hip"] - except KeyError: - pass - else: - mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2 - self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED) - return self.output - - """ - Primitive drawing functions: - """ - - def draw_text( - self, - text, - position, - *, - font_size=None, - color="g", - horizontal_alignment="center", - rotation=0, - ): - """ - Args: - text (str): class label - position (tuple): a tuple of the x and y coordinates to place text on image. - font_size (int, optional): font of the text. If not provided, a font size - proportional to the image width is calculated and used. 
- color: color of the text. Refer to `matplotlib.colors` for full list - of formats that are accepted. - horizontal_alignment (str): see `matplotlib.text.Text` - rotation: rotation angle in degrees CCW - - Returns: - output (VisImage): image object with text drawn. - """ - if not font_size: - font_size = self._default_font_size - - # since the text background is dark, we don't want the text to be dark - color = np.maximum(list(mplc.to_rgb(color)), 0.15) - color[np.argmax(color)] = max(0.8, np.max(color)) - - def contrasting_color(rgb): - """Returns 'white' or 'black' depending on which color contrasts more with the given RGB value.""" - - # Decompose the RGB tuple - R, G, B = rgb - - # Calculate the Y value - Y = 0.299 * R + 0.587 * G + 0.114 * B - - # If Y value is greater than 128, it's closer to white so return black. Otherwise, return white. - return 'black' if Y > 128 else 'white' - - bbox_background = contrasting_color(color*255) - - x, y = position - self.output.ax.text( - x, - y, - text, - size=font_size * self.output.scale, - family="sans-serif", - bbox={"facecolor": bbox_background, "alpha": 0.8, "pad": 0.7, "edgecolor": "none"}, - verticalalignment="top", - horizontalalignment=horizontal_alignment, - color=color, - zorder=10, - rotation=rotation, - ) - return self.output - - def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"): - """ - Args: - box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0 - are the coordinates of the image's top left corner. x1 and y1 are the - coordinates of the image's bottom right corner. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - edge_color: color of the outline of the box. Refer to `matplotlib.colors` - for full list of formats that are accepted. - line_style (string): the string to use to create the outline of the boxes. - - Returns: - output (VisImage): image object with box drawn. - """ - x0, y0, x1, y1 = box_coord - width = x1 - x0 - height = y1 - y0 - - linewidth = max(self._default_font_size / 12, 1) - - self.output.ax.add_patch( - mpl.patches.Rectangle( - (x0, y0), - width, - height, - fill=False, - edgecolor=edge_color, - linewidth=linewidth * self.output.scale, - alpha=alpha, - linestyle=line_style, - ) - ) - return self.output - - def draw_rotated_box_with_label( - self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None - ): - """ - Draw a rotated box with label on its top-left corner. - - Args: - rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle), - where cnt_x and cnt_y are the center coordinates of the box. - w and h are the width and height of the box. angle represents how - many degrees the box is rotated CCW with regard to the 0-degree box. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - edge_color: color of the outline of the box. Refer to `matplotlib.colors` - for full list of formats that are accepted. - line_style (string): the string to use to create the outline of the boxes. - label (string): label for rotated box. It will not be rendered when set to None. - - Returns: - output (VisImage): image object with box drawn. 
- """ - cnt_x, cnt_y, w, h, angle = rotated_box - area = w * h - # use thinner lines when the box is small - linewidth = self._default_font_size / ( - 6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3 - ) - - theta = angle * math.pi / 180.0 - c = math.cos(theta) - s = math.sin(theta) - rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)] - # x: left->right ; y: top->down - rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect] - for k in range(4): - j = (k + 1) % 4 - self.draw_line( - [rotated_rect[k][0], rotated_rect[j][0]], - [rotated_rect[k][1], rotated_rect[j][1]], - color=edge_color, - linestyle="--" if k == 1 else line_style, - linewidth=linewidth, - ) - - if label is not None: - text_pos = rotated_rect[1] # topleft corner - - height_ratio = h / np.sqrt(self.output.height * self.output.width) - label_color = self._change_color_brightness(edge_color, brightness_factor=0.7) - font_size = ( - np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size - ) - self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle) - - return self.output - - def draw_circle(self, circle_coord, color, radius=3): - """ - Args: - circle_coord (list(int) or tuple(int)): contains the x and y coordinates - of the center of the circle. - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - radius (int): radius of the circle. - - Returns: - output (VisImage): image object with box drawn. - """ - x, y = circle_coord - self.output.ax.add_patch( - mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color) - ) - return self.output - - def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None): - """ - Args: - x_data (list[int]): a list containing x values of all the points being drawn. - Length of list should match the length of y_data. - y_data (list[int]): a list containing y values of all the points being drawn. - Length of list should match the length of x_data. - color: color of the line. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - linestyle: style of the line. Refer to `matplotlib.lines.Line2D` - for a full list of formats that are accepted. - linewidth (float or None): width of the line. When it's None, - a default value will be computed and used. - - Returns: - output (VisImage): image object with line drawn. - """ - if linewidth is None: - linewidth = self._default_font_size / 3 - linewidth = max(linewidth, 1) - self.output.ax.add_line( - mpl.lines.Line2D( - x_data, - y_data, - linewidth=linewidth * self.output.scale, - color=color, - linestyle=linestyle, - ) - ) - return self.output - - def draw_binary_mask( - self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.7, area_threshold=10 - ): - """ - Args: - binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and - W is the image width. Each value in the array is either a 0 or 1 value of uint8 - type. - color: color of the mask. Refer to `matplotlib.colors` for a full list of - formats that are accepted. If None, will pick a random color. - edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a - full list of formats that are accepted. - text (str): if None, will be drawn on the object - alpha (float): blending efficient. Smaller values lead to more transparent masks. - area_threshold (float): a connected component smaller than this area will not be shown. 
- - Returns: - output (VisImage): image object with mask drawn. - """ - if color is None: - color = random_color(rgb=True, maximum=1) - color = mplc.to_rgb(color) - - has_valid_segment = False - binary_mask = binary_mask.astype("uint8") # opencv needs uint8 - mask = GenericMask(binary_mask, self.output.height, self.output.width) - shape2d = (binary_mask.shape[0], binary_mask.shape[1]) - - if not mask.has_holes: - # draw polygons for regular masks - for segment in mask.polygons: - area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) - if area < (area_threshold or 0): - continue - has_valid_segment = True - segment = segment.reshape(-1, 2) - self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) - else: - # TODO: Use Path/PathPatch to draw vector graphics: - # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon - rgba = np.zeros(shape2d + (4,), dtype="float32") - rgba[:, :, :3] = color - rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha - has_valid_segment = True - self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) - - if text is not None and has_valid_segment: - lighter_color = self._change_color_brightness(color, brightness_factor=0.7) - self._draw_text_in_mask(binary_mask, text, lighter_color) - return self.output - - def draw_binary_mask_with_number( - self, binary_mask, color=None, *, edge_color=None, text=None, label_mode='1', alpha=0.1, anno_mode=['Mask'], area_threshold=10 - ): - """ - Args: - binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and - W is the image width. Each value in the array is either a 0 or 1 value of uint8 - type. - color: color of the mask. Refer to `matplotlib.colors` for a full list of - formats that are accepted. If None, will pick a random color. - edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a - full list of formats that are accepted. - text (str): if None, will be drawn on the object - alpha (float): blending efficient. Smaller values lead to more transparent masks. - area_threshold (float): a connected component smaller than this area will not be shown. - - Returns: - output (VisImage): image object with mask drawn. 
- """ - if color is None: - randint = random.randint(0, len(self.color_proposals)-1) - color = self.color_proposals[randint] - color = mplc.to_rgb(color) - - has_valid_segment = True - binary_mask = binary_mask.astype("uint8") # opencv needs uint8 - mask = GenericMask(binary_mask, self.output.height, self.output.width) - shape2d = (binary_mask.shape[0], binary_mask.shape[1]) - bbox = mask.bbox() - - if 'Mask' in anno_mode: - if not mask.has_holes: - # draw polygons for regular masks - for segment in mask.polygons: - area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1])) - if area < (area_threshold or 0): - continue - has_valid_segment = True - segment = segment.reshape(-1, 2) - self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha) - else: - # TODO: Use Path/PathPatch to draw vector graphics: - # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon - rgba = np.zeros(shape2d + (4,), dtype="float32") - rgba[:, :, :3] = color - rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha - has_valid_segment = True - self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) - - if 'Box' in anno_mode: - self.draw_box(bbox, edge_color=color, alpha=0.75) - - if 'Mark' in anno_mode: - has_valid_segment = True - else: - has_valid_segment = False - - if text is not None and has_valid_segment: - # lighter_color = tuple([x*0.2 for x in color]) - lighter_color = [1,1,1] # self._change_color_brightness(color, brightness_factor=0.7) - self._draw_number_in_mask(binary_mask, text, lighter_color, label_mode) - return self.output - - def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5): - """ - Args: - soft_mask (ndarray): float array of shape (H, W), each value in [0, 1]. - color: color of the mask. Refer to `matplotlib.colors` for a full list of - formats that are accepted. If None, will pick a random color. - text (str): if None, will be drawn on the object - alpha (float): blending efficient. Smaller values lead to more transparent masks. - - Returns: - output (VisImage): image object with mask drawn. - """ - if color is None: - color = random_color(rgb=True, maximum=1) - color = mplc.to_rgb(color) - - shape2d = (soft_mask.shape[0], soft_mask.shape[1]) - rgba = np.zeros(shape2d + (4,), dtype="float32") - rgba[:, :, :3] = color - rgba[:, :, 3] = soft_mask * alpha - self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0)) - - if text is not None: - lighter_color = self._change_color_brightness(color, brightness_factor=0.7) - binary_mask = (soft_mask > 0.5).astype("uint8") - self._draw_text_in_mask(binary_mask, text, lighter_color) - return self.output - - def draw_polygon(self, segment, color, edge_color=None, alpha=0.5): - """ - Args: - segment: numpy array of shape Nx2, containing all the points in the polygon. - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a - full list of formats that are accepted. If not provided, a darker shade - of the polygon color will be used instead. - alpha (float): blending efficient. Smaller values lead to more transparent masks. - - Returns: - output (VisImage): image object with polygon drawn. 
- """ - if edge_color is None: - # make edge color darker than the polygon color - if alpha > 0.8: - edge_color = self._change_color_brightness(color, brightness_factor=-0.7) - else: - edge_color = color - edge_color = mplc.to_rgb(edge_color) + (1,) - - polygon = mpl.patches.Polygon( - segment, - fill=True, - facecolor=mplc.to_rgb(color) + (alpha,), - edgecolor=edge_color, - linewidth=max(self._default_font_size // 15 * self.output.scale, 1), - ) - self.output.ax.add_patch(polygon) - return self.output - - """ - Internal methods: - """ - - def _jitter(self, color): - """ - Randomly modifies given color to produce a slightly different color than the color given. - - Args: - color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color - picked. The values in the list are in the [0.0, 1.0] range. - - Returns: - jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the - color after being jittered. The values in the list are in the [0.0, 1.0] range. - """ - color = mplc.to_rgb(color) - # np.random.seed(0) - vec = np.random.rand(3) - # better to do it in another color space - vec = vec / np.linalg.norm(vec) * 0.5 - res = np.clip(vec + color, 0, 1) - return tuple(res) - - def _create_grayscale_image(self, mask=None): - """ - Create a grayscale version of the original image. - The colors in masked area, if given, will be kept. - """ - img_bw = self.img.astype("f4").mean(axis=2) - img_bw = np.stack([img_bw] * 3, axis=2) - if mask is not None: - img_bw[mask] = self.img[mask] - return img_bw - - def _change_color_brightness(self, color, brightness_factor): - """ - Depending on the brightness_factor, gives a lighter or darker color i.e. a color with - less or more saturation than the original color. - - Args: - color: color of the polygon. Refer to `matplotlib.colors` for a full list of - formats that are accepted. - brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of - 0 will correspond to no change, a factor in [-1.0, 0) range will result in - a darker color and a factor in (0, 1.0] range will result in a lighter color. - - Returns: - modified_color (tuple[double]): a tuple containing the RGB values of the - modified color. Each value in the tuple is in the [0.0, 1.0] range. - """ - assert brightness_factor >= -1.0 and brightness_factor <= 1.0 - color = mplc.to_rgb(color) - polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color)) - modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1]) - modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness - modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness - modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2]) - return modified_color - - def _convert_boxes(self, boxes): - """ - Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension. - """ - if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes): - return boxes.tensor.detach().numpy() - else: - return np.asarray(boxes) - - def _convert_masks(self, masks_or_polygons): - """ - Convert different format of masks or polygons to a tuple of masks and polygons. 
- - Returns: - list[GenericMask]: - """ - - m = masks_or_polygons - if isinstance(m, PolygonMasks): - m = m.polygons - if isinstance(m, BitMasks): - m = m.tensor.numpy() - if isinstance(m, torch.Tensor): - m = m.numpy() - ret = [] - for x in m: - if isinstance(x, GenericMask): - ret.append(x) - else: - ret.append(GenericMask(x, self.output.height, self.output.width)) - return ret - - def _draw_number_in_mask(self, binary_mask, text, color, label_mode='1'): - """ - Find proper places to draw text given a binary mask. - """ - - def number_to_string(n): - chars = [] - while n: - n, remainder = divmod(n-1, 26) - chars.append(chr(97 + remainder)) - return ''.join(reversed(chars)) - - binary_mask = np.pad(binary_mask, ((1, 1), (1, 1)), 'constant') - mask_dt = cv2.distanceTransform(binary_mask, cv2.DIST_L2, 0) - mask_dt = mask_dt[1:-1, 1:-1] - max_dist = np.max(mask_dt) - coords_y, coords_x = np.where(mask_dt == max_dist) # coords is [y, x] - - if label_mode == 'a': - text = number_to_string(int(text)) - else: - text = text - - self.draw_text(text, (coords_x[len(coords_x)//2] + 2, coords_y[len(coords_y)//2] - 6), color=color) - - # TODO sometimes drawn on wrong objects. the heuristics here can improve. - # _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) - # if stats[1:, -1].size == 0: - # return - # largest_component_id = np.argmax(stats[1:, -1]) + 1 - - # # draw text on the largest component, as well as other very large components. - # for cid in range(1, _num_cc): - # if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: - # # median is more stable than centroid - # # center = centroids[largest_component_id] - # center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] - # # bottom=np.max((cc_labels == cid).nonzero(), axis=1)[::-1] - # # center[1]=bottom[1]+2 - # self.draw_text(text, center, color=color) - - def _draw_text_in_mask(self, binary_mask, text, color): - """ - Find proper places to draw text given a binary mask. - """ - # TODO sometimes drawn on wrong objects. the heuristics here can improve. - _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8) - if stats[1:, -1].size == 0: - return - largest_component_id = np.argmax(stats[1:, -1]) + 1 - - # draw text on the largest component, as well as other very large components. - for cid in range(1, _num_cc): - if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH: - # median is more stable than centroid - # center = centroids[largest_component_id] - center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1] - bottom=np.max((cc_labels == cid).nonzero(), axis=1)[::-1] - center[1]=bottom[1]+2 - self.draw_text(text, center, color=color) - - def _convert_keypoints(self, keypoints): - if isinstance(keypoints, Keypoints): - keypoints = keypoints.tensor - keypoints = np.asarray(keypoints) - return keypoints - - def get_output(self): - """ - Returns: - output (VisImage): the image output containing the visualizations added - to the image. - """ - return self.output \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 2c595b9..9faae48 100644 --- a/requirements.txt +++ b/requirements.txt @@ -48,4 +48,5 @@ easyocr borb pypdf2 pdfplumber - +wandb +wrapt_timeout_decorator diff --git a/run.py b/run.py new file mode 100644 index 0000000..92e989a --- /dev/null +++ b/run.py @@ -0,0 +1,288 @@ +"""Script to run end-to-end evaluation on the benchmark. 
+Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
+"""
+import argparse
+import datetime
+import json
+import logging
+import os
+import random
+import sys
+# import wandb
+
+from tqdm import tqdm
+
+import lib_run_single
+from desktop_env.envs.desktop_env import DesktopEnv
+from mm_agents.agent import PromptAgent
+
+# Logger Configs {{{ #
+logger = logging.getLogger()
+logger.setLevel(logging.DEBUG)
+
+datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
+
+file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
+debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
+stdout_handler = logging.StreamHandler(sys.stdout)
+sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
+
+file_handler.setLevel(logging.INFO)
+debug_handler.setLevel(logging.DEBUG)
+stdout_handler.setLevel(logging.INFO)
+sdebug_handler.setLevel(logging.DEBUG)
+
+formatter = logging.Formatter(
+    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
+file_handler.setFormatter(formatter)
+debug_handler.setFormatter(formatter)
+stdout_handler.setFormatter(formatter)
+sdebug_handler.setFormatter(formatter)
+
+stdout_handler.addFilter(logging.Filter("desktopenv"))
+sdebug_handler.addFilter(logging.Filter("desktopenv"))
+
+logger.addHandler(file_handler)
+logger.addHandler(debug_handler)
+logger.addHandler(stdout_handler)
+logger.addHandler(sdebug_handler)
+# }}} Logger Configs #
+
+logger = logging.getLogger("desktopenv.experiment")
+
+# wandb config
+### set your wandb api key here; never commit a real key
+# os.environ["WANDB_API_KEY"] = "<your_wandb_api_key>"
+# wandb.login(key=os.environ.get("WANDB_API_KEY", None))
+
+
+def config() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        description="Run end-to-end evaluation on the benchmark"
+    )
+
+    # environment config
+    parser.add_argument("--path_to_vm", type=str,
+                        default=r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx")
+    parser.add_argument(
+        "--headless", action="store_true", help="Run on a headless machine"
+    )
+    parser.add_argument("--action_space", type=str, default="pyautogui", help="Action type")
+    parser.add_argument(
+        "--observation_type",
+        choices=[
+            "screenshot",
+            "a11y_tree",
+            "screenshot_a11y_tree",
+            "som"
+        ],
+        default="a11y_tree",
+        help="Observation type",
+    )
+    parser.add_argument("--screen_width", type=int, default=1920)
+    parser.add_argument("--screen_height", type=int, default=1080)
+    parser.add_argument("--sleep_after_execution", type=float, default=0.0)
+    parser.add_argument("--max_steps", type=int, default=15)
+
+    # agent config
+    parser.add_argument("--max_trajectory_length", type=int, default=3)
+    parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples")
+
+    # lm config
+    parser.add_argument("--model", type=str, default="gpt-4-0125-preview")
+    parser.add_argument("--temperature", type=float, default=1.0)
+    parser.add_argument("--top_p", type=float, default=0.9)
+    parser.add_argument("--max_tokens", type=int, default=1500)
+    parser.add_argument("--stop_token", type=str, default=None)
+
+    # logging related
+    parser.add_argument("--result_dir", type=str, default="./results")
+    args = parser.parse_args()
+
+    return args
+
+
+def test(
+        args: argparse.Namespace,
+        test_all_meta: dict
+) -> None:
+    scores = []
+    max_steps = args.max_steps
+
+    # log args
+    logger.info("Args: %s", args)
+    # set wandb project
+    cfg_args = \
+        {
+            "path_to_vm": args.path_to_vm,
+            "headless": args.headless,
+            "action_space": args.action_space,
+            "observation_type": args.observation_type,
+            "screen_width": args.screen_width,
+            "screen_height": args.screen_height,
+            "sleep_after_execution": args.sleep_after_execution,
+            "max_steps": args.max_steps,
+            "max_trajectory_length": args.max_trajectory_length,
+            "model": args.model,
+            "temperature": args.temperature,
+            "top_p": args.top_p,
+            "max_tokens": args.max_tokens,
+            "stop_token": args.stop_token,
+            "result_dir": args.result_dir
+        }
+
+    agent = PromptAgent(
+        model=args.model,
+        max_tokens=args.max_tokens,
+        action_space=args.action_space,
+        observation_type=args.observation_type,
+        max_trajectory_length=args.max_trajectory_length,
+    )
+
+    env = DesktopEnv(
+        path_to_vm=args.path_to_vm,
+        action_space=agent.action_space,
+        screen_size=(args.screen_width, args.screen_height),
+        headless=args.headless,
+    )
+
+    for domain in tqdm(test_all_meta, desc="Domain"):
+        for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
+            # run = wandb.init(project=f"OSworld-{args.action_space}-{args.observation_type}-{args.model}", group=f"{domain}",
+            #                  name=f"{example_id}")
+            # example setting
+            config_file = os.path.join(args.test_config_base_dir, f"examples/{domain}/{example_id}.json")
+            with open(config_file, "r", encoding="utf-8") as f:
+                example = json.load(f)
+
+            logger.info(f"[Domain]: {domain}")
+            logger.info(f"[Example ID]: {example_id}")
+
+            instruction = example["instruction"]
+
+            logger.info(f"[Instruction]: {instruction}")
+            # wandb per-example config settings
+            cfg_args["instruction"] = instruction
+            cfg_args["start_time"] = datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S")
+            # run.config.update(cfg_args)
+
+            example_result_dir = os.path.join(
+                args.result_dir,
+                args.action_space,
+                args.observation_type,
+                args.model,
+                domain,
+                example_id
+            )
+            os.makedirs(example_result_dir, exist_ok=True)
+            # example start running
+            try:
+                lib_run_single.run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir,
+                                                  scores)
+            except Exception as e:
+                logger.error(f"Exception in {domain}/{example_id}: {e}")
+                # wandb.log({"Exception": wandb.Table(data=[[f"Exception in {domain}/{example_id}: {e}"]], columns=["Error"])})
+                env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+                with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
+                    f.write(json.dumps({
+                        "Error": f"Exception in {domain}/{example_id}: {e}"
+                    }))
+                    f.write("\n")
+            # wandb settings
+            # os.mkdir(os.path.join(wandb.run.dir, "results/"))
+            # for file in os.listdir(example_result_dir):
+            #     # move file to just under the root dir
+            #     os.rename(os.path.join(example_result_dir, file), os.path.join(wandb.run.dir, f"./results/{file}"))
+            # wandb.finish()
+
+    env.close()
+    logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0.0}")  # guard against an empty score list
+
+
+def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+
+    if not os.path.exists(target_dir):
+        return total_file_json
+
+    finished = {}
+    for domain in os.listdir(target_dir):
+        finished[domain] = []
+        domain_path = os.path.join(target_dir, domain)
+        if os.path.isdir(domain_path):
+            for example_id in os.listdir(domain_path):
+                example_path = os.path.join(domain_path, example_id)
+                if os.path.isdir(example_path):
+                    if "result.txt" not in os.listdir(example_path):
+                        # remove partial outputs so the unfinished example is rerun
+                        for file in os.listdir(example_path):
+                            os.remove(os.path.join(example_path, file))
+                    else:
+                        finished[domain].append(example_id)
+
+    if not finished:
+        return total_file_json
+
+    for domain, examples in finished.items():
+        if domain in total_file_json:
+            total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
+
+    return total_file_json
+
+
+def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
+    target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
+    if not os.path.exists(target_dir):
+        print("New experiment, no result yet.")
+        return None
+
+    all_result = []
+
+    for domain in os.listdir(target_dir):
+        domain_path = os.path.join(target_dir, domain)
+        if os.path.isdir(domain_path):
+            for example_id in os.listdir(domain_path):
+                example_path = os.path.join(domain_path, example_id)
+                if os.path.isdir(example_path):
+                    if "result.txt" in os.listdir(example_path):
+                        # read the recorded score; treat an unreadable result as 0.0
+                        try:
+                            all_result.append(float(open(os.path.join(example_path, "result.txt"), "r").read()))
+                        except (ValueError, OSError):
+                            all_result.append(0.0)
+
+    if not all_result:
+        print("New experiment, no result yet.")
+        return None
+    else:
+        print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
+        return all_result
+
+
+if __name__ == '__main__':
+    ####### The complete version of the list of examples #######
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    args = config()
+
+    with open("evaluation_examples/test_all.json", "r", encoding="utf-8") as f:
+        test_all_meta = json.load(f)
+
+    test_file_list = get_unfinished(
+        args.action_space,
+        args.model,
+        args.observation_type,
+        args.result_dir,
+        test_all_meta
+    )
+    left_info = ""
+    for domain in test_file_list:
+        left_info += f"{domain}: {len(test_file_list[domain])}\n"
+    logger.info(f"Left tasks:\n{left_info}")
+
+    get_result(args.action_space,
+               args.model,
+               args.observation_type,
+               args.result_dir,
+               test_all_meta
+               )
+    test(args, test_file_list)
diff --git a/settings.json b/settings.json
new file mode 100644
index 0000000..7ee7a21
--- /dev/null
+++ b/settings.json
@@ -0,0 +1,3 @@
+{
+    "time_limit": "600"
+}
\ No newline at end of file
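Note on the new `settings.json` and the `wrapt_timeout_decorator` entry in `requirements.txt`: the change ships a `"time_limit"` of 600 seconds and a timeout library, but the module that would consume them (`lib_run_single`, imported by `run.py`) is not part of this diff. The sketch below only illustrates how such a per-example wall-clock limit could be wired up; the helper `_load_time_limit`, the nested `_run`, and the body of `run_single_example` are hypothetical and not code from the repository.

```python
# Hypothetical sketch (not code from this repository): enforcing the per-example
# wall-clock limit from settings.json with wrapt_timeout_decorator.
import json

from wrapt_timeout_decorator import timeout  # raises TimeoutError when the limit is exceeded


def _load_time_limit(settings_path="settings.json", default=600.0):
    """Read "time_limit" (stored as a string, in seconds) from settings.json."""
    try:
        with open(settings_path, "r", encoding="utf-8") as f:
            return float(json.load(f).get("time_limit", default))
    except (OSError, ValueError):
        return default


def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    """Illustrative stand-in for lib_run_single.run_single_example."""
    # use_signals=True keeps the timeout in-process (SIGALRM, Unix main thread only),
    # so the agent/env objects never have to be pickled into a subprocess.
    @timeout(_load_time_limit(), use_signals=True)
    def _run():
        # reset the environment with `example`, step the agent up to `max_steps`,
        # save observations under `example_result_dir`, and append the final score
        ...

    _run()
```

If `_run()` exceeds the limit, the resulting `TimeoutError` propagates into the `except Exception as e` block in `run.py`, which stops the screen recording and appends the error entry to `traj.jsonl` for that example.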