Merge branch 'main' into zdy

David Chang
2024-01-02 21:40:54 +08:00
63 changed files with 7664 additions and 151 deletions

View File

@@ -78,7 +78,8 @@ class DesktopEnv(gym.Env):
self.evaluator = task_config["evaluator"]
self.metric: Metric = getattr(metrics, self.evaluator["func"])
self.result_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(
self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
self.metric_options: Dict[str, Any] = self.evaluator.get("options", {})
# Initialize emulator and controller
@@ -95,7 +96,7 @@ class DesktopEnv(gym.Env):
# episodic stuffs, like tmp dir and counters, will be updated or reset
# when calling self.reset()
self.tmp_dir: str = self.tmp_dir_base # just an init value, updated during reset
self.tmp_dir: str = self.tmp_dir_base # just an init value, updated during reset
self._traj_no: int = -1
self._step_no: int = 0
self.action_history: List[Dict[str, any]] = []
@@ -165,7 +166,8 @@ class DesktopEnv(gym.Env):
self.evaluator = task_config["evaluator"]
self.metric: Metric = getattr(metrics, self.evaluator["func"])
self.result_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(
self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
self.metric_options = self.evaluator.get("options", {})
self.setup_controller.reset_cache_dir(self.cache_dir)
@@ -226,20 +228,12 @@ class DesktopEnv(gym.Env):
"""
Evaluate whether the task is successfully completed.
"""
result_state = self.result_getter(self, self.evaluator["result"])
expected_state = self.expected_getter(self, self.evaluator["expected"]) if "expected" in self.evaluator \
else None
# todo: make this more flexible by refactoring
# eval_func = eval_funcs[self.evaluator["func"]]
# eval_func_vars = {}
#
# for var_name, file_info in self.evaluator["paths"].items():
# path = copy_file_to_local(file_info)
# eval_func_vars[var_name] = path
#
# return eval_func(**eval_func_vars)
result = self.result_getter(self, self.evaluator["result"])
expected = self.expected_getter(self, self.evaluator["expected"])
metric: float = self.metric(result, expected, **self.metric_options)
metric: float = self.metric(result_state, expected_state, **self.metric_options) if expected_state is not None \
else self.metric(result_state, **self.metric_options)
return metric

View File

@@ -41,3 +41,4 @@ def get_vm_file(env, config: Dict[str, str]) -> str:
f.write(file)
return _path

View File

@@ -5,4 +5,4 @@ def get_rule(env, config: R) -> R:
"""
Returns the rule as-is.
"""
return config
return config["rules"]

View File

@@ -0,0 +1,138 @@
# Setup Instructions
## LibreOffice Writer
### Setting Up the python-docx Library
```shell
pip install python-docx
```
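As a quick sanity check after installing (a minimal sketch; `example.docx` is just a placeholder file name), you can open a document and print its paragraphs:
```python
from docx import Document

# Open an existing .docx file and print the text of each paragraph.
doc = Document("example.docx")
for paragraph in doc.paragraphs:
    print(paragraph.text)
```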
## Chrome
### Starting Chrome with Remote Debugging for Python
To enable remote debugging in Chrome, which allows tools like Playwright for Python to connect to and control an existing Chrome instance, follow these steps:
#### Manually Enabling Remote Debugging in Chrome
1. **Locate the Chrome Shortcut**:
- Find the Chrome shortcut that you usually use to open the browser. This could be on your desktop, start menu, or taskbar.
2. **Edit Shortcut Properties**:
- Right-click on the Chrome shortcut and select `Properties`.
3. **Modify the Target Field**:
- In the `Target` field, add `--remote-debugging-port=9222` at the end of the path. Ensure there is a space between the path and the flag you add.
- It should look something like this: `"C:\Path\To\Chrome.exe" --remote-debugging-port=9222`.
4. **Apply and Close**:
- Click `Apply` and then `OK` to close the dialog.
5. **Start Chrome**:
- Use this modified shortcut to start Chrome. Chrome will now start with remote debugging enabled on port 9222.
6. **Confirm Remote Debugging**:
- Open a browser and navigate to `http://localhost:9222`. If you see a webpage with information about active tabs, remote debugging is working; see the sketch below for a programmatic check.
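For that programmatic check (a minimal sketch, assuming the `requests` package is installed and Chrome is listening on port 9222), you can query the DevTools HTTP endpoint directly:
```python
import requests

# Chrome started with --remote-debugging-port=9222 serves DevTools metadata
# over HTTP; /json/version includes the browser build and the WebSocket URL.
response = requests.get("http://localhost:9222/json/version", timeout=5)
response.raise_for_status()
print(response.json())
```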
---
### Setting Up Playwright for Python
Playwright for Python is a browser automation library to control Chromium, Firefox, and WebKit with a single API.
#### Installing Playwright
- Ensure you have Python installed on your system. If not, download and install it from the [Python official website](https://www.python.org/).
- Install Playwright using pip (Python Package Installer). Open a command line or terminal and run:
```bash
pip install playwright
```
- After installing Playwright, you need to run the install command to download the necessary browser binaries:
```bash
playwright install
```
#### Writing a Playwright Script in Python
- Create a Python file for your automation script.
- Import the Playwright module at the beginning of your script:
```python
from playwright.sync_api import sync_playwright
```
- You can now use Playwright's API to control browsers.
#### Example Playwright Script
Here is a simple example to open a page using Playwright:
```python
from playwright.sync_api import sync_playwright

def run(playwright):
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto("http://example.com")
    # other actions...
    browser.close()

with sync_playwright() as playwright:
    run(playwright)
```
- This script launches Chromium, opens a new page, navigates to `example.com`, and then closes the browser.
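Instead of launching a new browser, Playwright can also attach to the Chrome instance started with remote debugging in the earlier section. A minimal sketch (assuming Chrome is already listening on port 9222):
```python
from playwright.sync_api import sync_playwright

# Attach to the running Chrome via the Chrome DevTools Protocol and list
# the title and URL of every open page.
with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp("http://localhost:9222")
    for context in browser.contexts:
        for page in context.pages:
            print(page.title(), page.url)
    browser.close()
```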
#### Troubleshooting
- If you encounter issues with Playwright, ensure that your Python environment is correctly set up and that you have installed Playwright and its dependencies correctly.
- For detailed documentation, visit the [Playwright for Python Documentation](https://playwright.dev/python/docs/intro).
## VLC Media Player
### Setting Up VLC's HTTP Interface
To enable and use the HTTP interface in VLC Media Player for remote control and status checks, follow these steps:
#### 1. Open VLC Preferences
- Open VLC Media Player.
- Go to `Tools` > `Preferences` from the menu.
#### 2. Show All Settings
- In the Preferences window, at the bottom left corner, select `All` under `Show settings` to display advanced settings.
#### 3. Enable Main Interfaces
- In the advanced preferences, expand the `Interface` section.
- Click on `Main interfaces`.
- Check the box for `Web` to enable the HTTP interface.
#### 4. Configure Lua HTTP
- Expand the `Main interfaces` node and select `Lua`.
- In the `Lua HTTP` section, set a password. This password will be required to access the HTTP interface.
#### 5. Save and Restart VLC
- Click `Save` to apply the changes.
- Restart VLC Media Player for the changes to take effect.
#### 6. Accessing the HTTP Interface
- Open a web browser and go to `http://localhost:8080`.
- You will be prompted for a password. Enter the password you set in the Lua HTTP settings.
- Once logged in, you will have access to VLC's HTTP interface for remote control; a programmatic status check is sketched below.
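The following is a minimal sketch of that check (the `requests` package and the placeholder password `password` are assumptions), reading the player state from `status.xml`:
```python
import requests
from xml.etree import ElementTree

# VLC's Lua HTTP interface authenticates with an empty username and the
# Lua HTTP password; status.xml reports the current player state.
response = requests.get("http://localhost:8080/requests/status.xml",
                        auth=("", "password"))  # replace "password" with your Lua HTTP password
response.raise_for_status()
state = ElementTree.fromstring(response.content).find("state").text
print(f"VLC state: {state}")
```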
#### Troubleshooting
- If you cannot access the HTTP interface, check if your firewall or security software is blocking the connection.
- Ensure VLC is running and the correct port (default is 8080) is being used.
- If the port is in use by another application, you may change the port number in VLC's settings.

View File

@@ -1,5 +1,7 @@
from .table import compare_table
from .table import check_sheet_list, check_xlsx_freeze, check_xlsx_zoom
from .docs import find_default_font, contains_page_break, compare_docx_files
from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, compare_insert_equation
from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers
from .docs import is_first_line_centered, check_file_exists, compare_contains_image
from .pdf import check_pdf_pages
from .libreoffice import check_libre_locale

View File

@@ -3,6 +3,8 @@ import os
import platform
import sqlite3
from playwright.sync_api import sync_playwright
"""
WARNING:
1. Functions from this script assume that no account is registered on Chrome, otherwise the default file path needs to be changed.
@@ -12,6 +14,7 @@ WARNING:
# todo: move to getter module
# The following ones just need to load info from the files of software, no need to connect to the software
def get_default_search_engine():
if platform.system() == 'Windows':
preference_file_path = os.path.join(os.getenv('LOCALAPPDATA'),
@@ -19,8 +22,10 @@ def get_default_search_engine():
elif platform.system() == 'Darwin':
preference_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Preferences')
else:
elif platform.system() == 'Linux':
preference_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences')
else:
raise Exception('Unsupported operating system')
try:
with open(preference_file_path, 'r', encoding='utf-8') as file:
@@ -41,8 +46,10 @@ def get_cookie_data():
elif platform.system() == 'Darwin':
chrome_cookie_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Cookies')
else:
elif platform.system() == 'Linux':
chrome_cookie_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Cookies')
else:
raise Exception('Unsupported operating system')
try:
conn = sqlite3.connect(chrome_cookie_file_path)
@@ -65,8 +72,10 @@ def get_bookmarks():
elif platform.system() == 'Darwin':
preference_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Bookmarks')
else:
elif platform.system() == 'Linux':
preference_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Bookmarks')
else:
raise Exception('Unsupported operating system')
try:
with open(preference_file_path, 'r', encoding='utf-8') as file:
@@ -78,3 +87,75 @@ def get_bookmarks():
except Exception as e:
print(f"Error: {e}")
return None
def get_extensions_installed_from_shop():
"""Find the Chrome extensions directory based on the operating system."""
os_name = platform.system()
if os_name == 'Windows':
chrome_extension_dir = os.path.expanduser(
'~') + '\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Extensions\\'
elif os_name == 'Darwin': # macOS
chrome_extension_dir = os.path.expanduser(
'~') + '/Library/Application Support/Google/Chrome/Default/Extensions/'
elif os_name == 'Linux':
chrome_extension_dir = os.path.expanduser('~') + '/.config/google-chrome/Default/Extensions/'
else:
raise Exception('Unsupported operating system')
manifests = []
for extension_id in os.listdir(chrome_extension_dir):
extension_path = os.path.join(chrome_extension_dir, extension_id)
if os.path.isdir(extension_path):
# Iterate through version-named subdirectories
for version_dir in os.listdir(extension_path):
version_path = os.path.join(extension_path, version_dir)
manifest_path = os.path.join(version_path, 'manifest.json')
if os.path.isfile(manifest_path):
with open(manifest_path, 'r') as file:
try:
manifest = json.load(file)
manifests.append(manifest)
except json.JSONDecodeError:
print(f"Error reading {manifest_path}")
return manifests
# The following ones require Playwright to be installed on the target machine, and Chrome needs to be pre-configured with a remote debugging port to allow remote debugging; see README.md for details
def get_open_tabs_info(remote_debugging_url):
with sync_playwright() as p:
# connect to remote Chrome instance
browser = p.chromium.connect_over_cdp(remote_debugging_url)
tabs_info = []
for context in browser.contexts:
for page in context.pages:
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
browser.close()
return tabs_info
def get_active_tab_info(remote_debugging_url):
with sync_playwright() as p:
# connect to remote Chrome instance
browser = p.chromium.connect_over_cdp(remote_debugging_url)
active_tab_info = {}
for context in browser.contexts:
for page in context.pages:
if page.is_visible("body"): # check the visibility of the page body to determine the active status
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
break
if active_tab_info:
break
browser.close()
return active_tab_info

View File

@@ -1,15 +1,20 @@
import xml.etree.ElementTree as ET
import os
from typing import List, Dict, Any
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
def find_default_font(expected, config_file_path):
def find_default_font(config_file_path, rules):
"""Find the default font in LibreOffice Writer."""
default_font = None
expected_font = rules["font_name"]
try:
tree = ET.parse(config_file_path)
root = tree.getroot()
# Define the XML namespace used in the file
# Define the XML namespace used in the file
namespace = {'oor': 'http://openoffice.org/2001/registry'}
# Search for the node containing the default font setting for LibreOffice Writer
@@ -19,24 +24,26 @@ def find_default_font(expected, config_file_path):
default_font = value.text
except Exception as e:
print(f"Error: {e}")
return 1 if default_font == expected else 0
return 1 if default_font == expected_font else 0
def contains_page_break(docx_file):
doc = Document(docx_file)
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
for paragraph in doc.paragraphs:
for run in paragraph.runs:
br_elems = run.element.findall('.//w:br', namespaces)
for br in br_elems:
if br is not None and '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type' in br.attrib and br.attrib['{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type'] == 'page':
if br is not None and '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type' in br.attrib and \
br.attrib['{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type'] == 'page':
return 1
return 0
def compare_docx_files(file1, file2):
def compare_docx_files(file1, file2):
doc1 = Document(file1)
doc2 = Document(file2)
@@ -53,6 +60,129 @@ def compare_docx_files(file1, file2):
return 1
def compare_docx_tables(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
# get list of tables in docx
tables1 = doc1.tables
tables2 = doc2.tables
if len(tables1) != len(tables2):
return 0
# Compare each table content
for table1, table2 in zip(tables1, tables2):
if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
return 0
# Compare each cell
for i in range(len(table1.rows)):
for j in range(len(table1.columns)):
if table1.cell(i, j).text != table2.cell(i, j).text:
return 0
return 1
def compare_line_spacing(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if len(doc1.paragraphs) != len(doc2.paragraphs):
return 0
# Compare each paragraph line spacing
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
spacing1 = para1.paragraph_format.line_spacing
spacing2 = para2.paragraph_format.line_spacing
if spacing1 != spacing2:
return 0
return 1
def compare_insert_equation(docx_file1, docx_file2):
if not compare_docx_files(docx_file1, docx_file2):
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
# Compare each paragraph if it contains equation
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
if run1.element.xpath('.//w:object') and run2.element.xpath('.//w:object'):
return 1
return 0
def compare_font_names(docx_file, rules: Dict[str, Any]):
doc = Document(docx_file)
expected_font = rules["font_name"]
for paragraph in doc.paragraphs:
for run in paragraph.runs:
font_name = run.font.name
if font_name != expected_font:
return 0
return 1
def compare_subscript_contains(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
# check if two paras both contain subscript
if run1.font.subscript and run2.font.subscript:
return 1
return 0
def has_page_numbers_in_footers(docx_file):
doc = Document(docx_file)
for section in doc.sections:
footer = section.footer
if footer is None:
return 0
footer_text = footer.paragraphs[0].text if footer.paragraphs else ''
if not any(char.isdigit() for char in footer_text):
# if no digit in footer, then no page number
return 0
return 1
def is_first_line_centered(docx_file):
doc = Document(docx_file)
first_paragraph = doc.paragraphs[0]
# check if the first line is center justified
return 1 if first_paragraph.paragraph_format.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER else 0
def check_file_exists(directory, filename):
file_path = os.path.join(directory, filename)
return 1 if os.path.isfile(file_path) else 0
def compare_contains_image(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
if ('graphicData' in run1._element.xml and 'graphicData' not in run2._element.xml) or (
'graphicData' not in run1._element.xml and 'graphicData' in run2._element.xml):
return 0
return 1
# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
@@ -60,6 +190,6 @@ def compare_docx_files(file1, file2):
# Replace 'your_document.docx' with the path to your document
# result = contains_page_break('your_document.docx')
# print(result)
#config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
#print(find_default_font("Ani", config_path))
# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
# print(find_default_font("Ani", config_path))

View File

@@ -1,5 +1,7 @@
import os
import platform
import subprocess
import ctypes
import os
# todo: move to getter module
@@ -13,3 +15,43 @@ def get_desktop_path():
return os.path.join("/home", username, "Desktop")
else:
raise Exception("Unsupported operating system")
def get_wallpaper():
def get_wallpaper_windows():
SPI_GETDESKWALLPAPER = 0x73
MAX_PATH = 260
buffer = ctypes.create_unicode_buffer(MAX_PATH)
ctypes.windll.user32.SystemParametersInfoW(SPI_GETDESKWALLPAPER, MAX_PATH, buffer, 0)
return buffer.value
def get_wallpaper_macos():
script = """
tell application "System Events" to tell every desktop to get picture
"""
process = subprocess.Popen(['osascript', '-e', script], stdout=subprocess.PIPE)
output, error = process.communicate()
if error:
print("Error:", error)
else:
return output.strip().decode('utf-8')
def get_wallpaper_linux():
try:
output = subprocess.check_output(["gsettings", "get", "org.gnome.desktop.background", "picture-uri"])
return output.decode('utf-8').strip().replace('file://', '').replace("'", "")
except Exception as e:
print("Error:", e)
return None
os_name = platform.system()
if os_name == 'Windows':
return get_wallpaper_windows()
elif os_name == 'Darwin':
return get_wallpaper_macos()
elif os_name == 'Linux':
return get_wallpaper_linux()
else:
return "Unsupported OS"

View File

@@ -0,0 +1,87 @@
import os
import platform
import requests
from xml.etree import ElementTree
import pygetwindow as gw
import pyautogui
def read_vlc_config(setting_name):
"""
Reads the VLC configuration file to check for a specific setting.
# Example usage
setting_name = 'recordings_folder='
setting = read_vlc_config(setting_name)
"""
# Common paths for VLC config file on different operating systems
paths = {
'Windows': os.path.expanduser('~\\AppData\\Roaming\\vlc\\vlcrc'),
'Darwin': os.path.expanduser('~/Library/Preferences/org.videolan.vlc/vlcrc'),
'Linux': os.path.expanduser('~/.config/vlc/vlcrc')
}
os_type = platform.system()
config_path = paths.get(os_type)
if not config_path or not os.path.exists(config_path):
print("VLC config file not found for this operating system.")
return None
try:
with open(config_path, 'r', encoding="utf-8") as file:
for line in file:
if line.startswith(setting_name):
return line.strip()
except IOError as e:
print(f"Error reading config file: {e}")
return None
def get_vlc_playing_info(host='localhost', port=8080, password='password'):
"""
Gets the current playing information from VLC's HTTP interface.
"""
url = f'http://{host}:{port}/requests/status.xml'
try:
response = requests.get(url, auth=('', password))
if response.status_code == 200:
tree = ElementTree.fromstring(response.content)
status = tree.find('state').text
if status == 'playing':
file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text
return status, file_info
return status, None
except Exception as e:
print(f"Error: {e}")
return None, None
def is_vlc_fullscreen():
"""
Checks if the VLC window is in full-screen mode.
When VLC is in full-screen mode, its window size matches the screen size with no borders.
"""
try:
# Get the VLC window; adjust the title as per your VLC window's title
vlc_window = gw.getWindowsWithTitle('VLC media player')[0] # Adjust title if needed
if not vlc_window:
return False
# Get screen size
screen_width, screen_height = pyautogui.size()
# Check if VLC window size matches the screen size
return (vlc_window.width == screen_width and vlc_window.height == screen_height)
except IndexError:
# VLC window not found
print("VLC window not found.")
return False
except Exception as e:
print(f"An error occurred: {e}")
return False

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Make the line spacing of first two paragraph into double line spacing",
"source": "https://www.youtube.com/watch?v=Q_AaL6ljudU",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1-svVsH-l2ofufEKuN-cYrIrvXNobtATE&export=download&authuser=0&confirm=t&uuid=be7f891a-f858-48f5-a72d-4e42bbfb8b65&at=APZUnTXzBnaeSJjmxeh4zG03pzA0:1704179807785",
"path": "Desktop/Double_Line_Spacing.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Double_Line_Spacing.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_line_spacing",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1-svVsH-l2ofufEKuN-cYrIrvXNobtATE&export=download&authuser=0&confirm=t&uuid=be7f891a-f858-48f5-a72d-4e42bbfb8b65&at=APZUnTXzBnaeSJjmxeh4zG03pzA0:1704179807785",
"dest": "Double_Line_Spacing_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Double_Line_Spacing.docx",
"dest": "Double_Line_Spacing.docx"
}
}
}

View File

@@ -1,12 +1,42 @@
{
"id": "0b17a146-2934-46c7-8727-73ff6b6483e8",
"snapshot": "libreoffice_writer",
"instruction": "Enter subscript",
"source": "https://ask.libreoffice.org/t/how-to-enter-superscript-and-subscript-in-libreoffice-base-forms-reports/23413",
"config": [],
"instruction": "Change the 2 in H2O to a subscript.",
"source": "https://askubuntu.com/questions/245695/how-do-you-insert-subscripts-and-superscripts-into-ordinary-non-formula-text-i",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1Nx5AoKNM7tcDRE6y_qjNIDrPOKqhNyfm&export=download&authuser=0&confirm=t&uuid=bb4de348-3bbf-46a2-95b2-e2719c67547a&at=APZUnTUeA-BW7mkQsEw7NGm272zx:1704172916742",
"path": "Desktop/Enter_Subscript.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Enter_Subscript.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_files",
"result": {
"type": "vm_file",
"path": "Desktop/Enter_Subscript.docx",
"dest": "Enter_Subscript.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1AaKXeD9ZgfMykgijZ4G8MEzUjmMJElkq&export=download&authuser=0&confirm=t&uuid=5e347f0d-4efc-4478-878e-d89455d1593b&at=APZUnTWCYWfsD4eCeG52VJiK8-xB:1704172886196",
"dest": "Enter_Subscript_Gold.docx"
}
}
}

View File

@@ -3,10 +3,35 @@
"snapshot": "libreoffice_writer",
"instruction": "Add page number for every page at the bottom left",
"source": "https://ask.libreoffice.org/t/how-to-start-page-numbering-on-a-certain-page/39931/4",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1aDWe-vAmcfQSgtPjFfrncq8ZFnCy4uUK&export=download&authuser=0&confirm=t&uuid=788af72a-ddaf-4ba3-aedb-96f34cc4d815&at=APZUnTVSRSSfMGcjXqLzvMixnkp6:1704179663299",
"path": "Desktop/Add_Page_Number_Bottom_Left.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Add_Page_Number_Bottom_Left.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "has_page_numbers_in_footers",
"result": {
"type": "vm_file",
"path": "Desktop/Add_Page_Number_Bottom_Left.docx",
"dest": "Add_Page_Number_Bottom_Left.docx"
}
}
}

View File

@@ -3,10 +3,41 @@
"snapshot": "libreoffice_writer",
"instruction": "Change the font to \"Times New Roman\" throughout the text.",
"source": "https://ask.libreoffice.org/t/how-do-i-change-the-font-for-the-whole-document-in-writer/9220",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1fIHNzFm8JabWoLKOnxrFM722fQ1d_huK&export=download&authuser=0&confirm=t&uuid=d11a8dda-1e4e-4dc9-b05c-e6b47624dbf0&at=APZUnTVG0ViFnKJa00314wVr3uP9:1704185871014",
"path": "Desktop/Change_Font_Through_File.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Change_Font_Through_File.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_font_names",
"expected": {
"type": "rule",
"rules": {
"font_name": "Times New Roman"
}
},
"result": {
"type": "vm_file",
"path": "Desktop/Change_Font_Through_File.docx",
"dest": "Change_Font_Through_File.docx"
}
}
}

View File

@@ -3,10 +3,35 @@
"snapshot": "libreoffice_writer",
"instruction": "center-justify the first line",
"source": "https://askubuntu.com/questions/1066351/how-do-you-center-align-in-libreoffice#:~:text=Ctrl%20%2B%20e%20will%20Center%20align%20the%20cursor%20for%20you.",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1IQ4rKyHMOui71YlyL7huggpLYFYtj923&export=download&authuser=0&confirm=t&uuid=014c2335-c0c6-4712-9d5a-ca8d3217e07f&at=APZUnTVrM698NQgSh4hqYXR8cjDc:1704185072996",
"path": "Desktop/Centering_First_Line.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Centering_First_Line.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "is_first_line_centered",
"result": {
"type": "vm_file",
"path": "Desktop/Centering_First_Line.docx",
"dest": "Centering_First_Line.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Replace all newlines with paragraph marks in LibreOffice Write",
"source": "https://stackoverflow.com/questions/71685737/how-to-replace-all-newlines-with-paragraph-marks-in-libreoffice-write",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=18XFjPVUnLG_-KOM5sn-Sk74HP_JHivMy&export=download&authuser=0&confirm=t&uuid=d23041bc-2ddd-42c4-84ae-481b953f021c&at=APZUnTVYh0AK0245qsDOCol7SdMB:1704185512767",
"path": "Desktop/Replace_Newlines_with_Paragraph_Marks.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Replace_Newlines_with_Paragraph_Marks.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
"evaluator": {
"func": "compare_line_spacing",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1bP_noic02MuzrM8CdJIQN7F1gN4N8sel&export=download&authuser=0&confirm=t&uuid=657e0e4f-7b96-4d7e-83f4-99b79c68708f&at=APZUnTX7HsmefsMlzQaCGK2fg5Em:1704185514197",
"dest": "Replace_Newlines_with_Paragraph_Marks_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Double_Line_Spacing.docx",
"dest": "Replace_Newlines_with_Paragraph_Marks.docx"
}
}
}

View File

@@ -3,10 +3,32 @@
"snapshot": "libreoffice_writer",
"instruction": "Export the current document into PDF, keep the file name",
"source": "https://www.libreofficehelp.com/save-export-writer-documents-in-pdf-epub-format/",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1czb_13MshoiM-yCxvUGYD8OnIIrf-3VX&export=download&authuser=0&confirm=t&uuid=e7c30b67-7fac-4b64-a222-d04bc7c82842&at=APZUnTUA1te5vt7L__zJ7xuMs48e:1704177347643",
"path": "Desktop/Save_Writer_PDF.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Save_Writer_PDF.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "check_file_exists",
"file_name": "Save_Writer_PDF.pdf",
"directory": "/home/user/Downloads/"
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Insert the equation \"(a + b)^2 = a^2 + 2 a b + b^2\"",
"source": "https://askubuntu.com/questions/319593/how-to-type-science-equations-in-libre-office",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1FgMp7Ny63eXzeF23qHYhqQux31djlkah&export=download&authuser=0&confirm=t&uuid=d6b5208d-3b3a-4972-a641-ed738a419fdb&at=APZUnTX16Fz8Qg-B0NWpWgC-3Dyu:1704184410221",
"path": "Desktop/Insert_Equation.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Equation.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_insert_equation",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1hMFJnwHs7Iaexz3b9O2LJQUfsJ2wiwZ9&export=download&authuser=0&confirm=t&uuid=2abb49fb-d9c7-46cf-bc21-e69ecb9cefc6&at=APZUnTVzEZjChcUb4MIoxuq4cGea:1704184411805",
"dest": "Insert_Equation_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Equation.docx",
"dest": "Insert_Equation.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Insert a 7*5 empty table",
"source": "https://www.youtube.com/watch?v=l25Evu4ohKg",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1kXBP0jMxTeVahzFLYbYHJtjjgmuzrA8R&export=download&authuser=0&confirm=t&uuid=f8b9bad3-415d-4d39-a4fb-05a4cf881cf0&at=APZUnTXaohwzl8_2RDF_tgUsP9cH:1704181463579",
"path": "Desktop/Insert_Empty_Table.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Empty_Table.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=14JfHsW9GvbhORdtVAtvEbOi00MqEyHfb&export=download&authuser=0&confirm=t&uuid=3dba2459-ac37-4cad-a982-adecd406382a&at=APZUnTVQUqUPq_WacgY2xu4PvAKB:1704181465512",
"dest": "Insert_Empty_Table_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Empty_Table.docx",
"dest": "Insert_Empty_Table.docx"
}
}
}

View File

@@ -1,12 +1,53 @@
{
"id": "6ada715d-3aae-4a32-a6a7-429b2e43fb93",
"snapshot": "libreoffice_writer",
"instruction": "Insert the image which is in IMAGE_PATH where my cursor is",
"instruction": "Copy the screenshot 1.jpg from the desktop to where my cursor is locatedInsert the image which is in IMAGE_PATH where my cursor is",
"source": "https://www.quora.com/How-do-you-insert-images-into-a-LibreOffice-Writer-document",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1ebLG0gmqYv41ek4UmKWhFsxBnoUSGjKp&export=download&authuser=0&confirm=t&uuid=8f7d7bee-1fe4-4c4c-8b69-8aaf47199c57&at=APZUnTVYUvYTopUXCVs69QWWwPbq:1704173993139",
"path": "Desktop/Insert_Image_At_Cursor.docx"
}
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1QfjQ4SKtjKDXWpqa2u6mC_KtB3ASEK5O&export=download&authuser=0&confirm=t&uuid=06af00b9-58f3-4691-a6a3-34309c80cbbb&at=APZUnTVZpE1lMxcvGG0cdt5zuxZ_:1704174003198",
"path": "Desktop/1.jpg"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Image_At_Cursor.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_contains_image",
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Image_At_Cursor.docx",
"dest": "Insert_Image_At_Cursor.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1xbhlfqGrPutHHi2aHg66jwXD-yaZpe9j&export=download&authuser=0&confirm=t&uuid=427765e0-3f97-4a72-92db-a1fe7cdde73b&at=APZUnTUhNLh2PDu4OGkCVQW-LPCd:1704173991269",
"dest": "Insert_Image_At_Cursor_Gold.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Convert the content seperated by commas to a table",
"source": "https://www.youtube.com/watch?v=l25Evu4ohKg",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=139-NmslBR_9qlD7hUO08xj_VFffV95eM&export=download&authuser=0&confirm=t&uuid=64a6c35d-f3ce-4c25-9f83-4a952e24c5ad&at=APZUnTUL1GMR_QbpFQnC9fPwkdqa:1704183959196",
"path": "Desktop/Convert_Text_To_Table.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Convert_Text_To_Table.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1fs2msYaxnEZL9XASENQMZIag2MTxIBJs&export=download&authuser=0&confirm=t&uuid=6c71f008-082c-4f0c-9ffc-0a802f5cbfe6&at=APZUnTVDpucMDfk5P2T-0dZx_KVV:1704183960360",
"dest": "Convert_Text_To_Table_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Convert_Text_To_Table.docx",
"dest": "Convert_Text_To_Table.docx"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
"snapshot": "libreoffice_writer",
"instruction": "Helping me adding CITATION_TEXT to my reference list, and add a cross reference after the word \"WHERE_WE_ADD_REFERENCE\"",
"instruction": "Helping me adding \"C. Luo and M. J. Carey, \"LSM-based storage techniques: a survey,\" The VLDB Journal, vol. 29, no. 1, pp. 393418, 2020.\" to my reference list, and add a cross reference at the end of the first paragraph",
"source": "https://seekstar.github.io/2022/04/11/libreoffice%E5%BC%95%E7%94%A8%E6%96%87%E7%8C%AE/",
"config": [
{
@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?export=download&id=1boNZ3JuqUx2apMqExL3UX9A4Sopi77ke7yKnIO3cpbg",
"url": "https://drive.usercontent.google.com/download?id=1xOfwImgkPzdmjQomj-MCFd8nQS75OjaH&export=download&authuser=0&confirm=t&uuid=7eb91c26-dad5-4480-b1ec-35e506cde1e4&at=APZUnTW01MvBI_gkC8yoiyAVs7yi:1704188254979",
"path": "Desktop/Add_Citation_Cross_Reference.docx"
}
]
@@ -30,13 +30,13 @@
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://doc-14-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/erhjsu6vpod00o7ruf95hlnqkk/1703931360000/108888117743638485671/108888117743638485671/1tiN6A9-zQ2gfDPTWAYK5mv5JbUe2Y_PqPIyuvsu7PgA?format=docx&dat=AOBvIb1uRgycIK4pNup7ZrnJqwNjYOgTUlrhxAc8DnBWzUt9zDxLm3e4s0KQytzQ1qvFZaBr8-ymrVv7Mmb7ovpVk4k8sgS_2MRD1m-tMUDiUEGFtoxrECd4Xoaspuwb-BZttyU1cCdY3U12qcNWy5Cts_uys6ouKZok01Z7s1J233udfrMbXvDt_X-HeNo_7e6Bh64ZC4ohHOKZddsuayKYxPTKpgnho_8FPuWXqZDKyfYRDoTXxGWv-WrZSVqRSHP6GMtBdWc1-QBuWzH_iRTM64joeveSDppMjMeB5bjdJQ7EXf-EjA8MjSxtvQQGBmun7PoZ-W7fLmQ1E3fZKJ5BwQDOIJHDCBar83iHHoXOUJ1Q5UbkKcCS0nJ_pprCzRYXLSeVfN0_bdGuY2lSE8GhX-yGlyGIjAIZK-YulOFXwV0--4aD10rh43A5GLmSLeNZe6maUU33j1V-zUtp1qPgRk3SnPJENNOXf-sOYAvQqSgROSBvAwElqgHUMD_ROK692M7_7OtFe4sjs0eVnBzROEHy-ZznXqdSXJj6-2vloXHWfswPfE-Mq5kc7F1zX4CY6H1kQ-zgHzeLX-qQA6YmgZPJ0pLzFkAkBiMAjPigA_2dy7jk-niePSbZ9DcgYoX6iv6MkJ0y6JE_HQF7Gr6kDBiOjOyDp7gFoMj35F41Fac1wpSJmoiUEGLg0qGRBZ6BPc54m-AAFuy-2s4BUUtPgk-FlTD1jSpHDXLbJ-VQFglx1CYpfqFAnmIE8yseQPh3GqQYyCtCfD-zzO-CRTT9A-XOQVuH27npfk2gMDKtGwJr7XhNL8lL9b8540uTjt9nFnmNfDZCFK01VULdHZesSBedqM4iApgVVnjok8nmYw14e9WSgJOdjeiYAwI",
"path": "https://drive.usercontent.google.com/download?id=1wFQU7hkAT2wmSHTgM22F9Ep4WXmymEMW&export=download&authuser=0&confirm=t&uuid=a7ea1eec-678b-4407-b023-df13cc6f8c54&at=APZUnTW3WoqOfS9A1BW79XfV8jKh:1704188260410",
"dest": "Add_Citation_Cross_Reference_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Add_Citation_Cross_Reference.xlsx",
"dest": "Add_Citation_Cross_Reference.xlsx"
"path": "Desktop/Add_Citation_Cross_Reference.docx",
"dest": "Add_Citation_Cross_Reference.docx"
}
}
}

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-08-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/50aih0mm4k09m25qfa3qrhiuk4/1703929965000/108888117743638485671/108888117743638485671/1NcJF7vVondy_r7toxlB9hLPAZa_SE2J8Q8jwMXS8VUA?format=docx&dat=AOBvIb2I6t_SHXA1k8jP6BwfHmYlD9vYHbrSA2-mYotTRWVDp45o4YRrwJXoy21qBnq92d646pP7IQH6-gXTi7oDIcGqD0iYR6CLD1s2PyYX2F8wQ703_cw61GoVYGf8BoorXDY6Y44dfXY0j2RigWDdbimS1rSLUy3TGEmTl8jZq71zrNUKiS25zCsfuXONsexH0tGI1d8LfnVKrFLCQvIlrVF7lV9lgi4lJhuUwIIKF1JdziNoNBohbCuhv-h4iGPRyoFxC4hZAOVJEHy1wIvBA64rNsw1N_nplLxx42I7MC9-3F24Lkxi3xfJ81nEYSx8ma5D9V_AHLLRLmIrpPKYk1s47qPQxbSGrcO1362WJeMxb8lys71APnPwfWbodnxZLdJR2x2WfdYiQWpZGRzBf3-CeQmORrMQDSwWOHsEGMiCw8qTKevzhY1s4aZWBxpQO7ocCoL1gLOxxEvj4eSLvxp2S1u_dFjjr-dcMxt9-Xu210BGd-1Q6kUYzexRuI6I1vkWxDFn7GHgkVf-RhbMT52W_FFOo2Um4rXIfV62W5_nZrmJjz6KNOGdAbIJdkmTrS_lESb6GDmkOFwNarmTlZVOCDN-On7HGaYF1KvX0hobR0559-wKetJj2diqCDOlDXFemtkzvX-CDCRBnDmQxq1ZaQEsjAHhu7sE9jlZT0ywUHV3VpKBcepolqaCRAhX0gCf1cSht7LDHODeX9Hbn3tz810aYlETCJQo9QScbN87i4IV3qFbezwymMi0ZDLgWW0BtEa1gFMY89om6YGscnCHUHGCGy_GW2XscOiQq3rJngCmuu4Ivfta_7GB0e9NeflOIO3wlCpTlw6aQVh9sIB0MMTpDaZ6V1SXSnPInj15tLbCiAPXzNxfg-8",
"url": "https://drive.usercontent.google.com/download?id=1LZ1_U9CyR8oOkqfbY6HMLWr4wBVYhCLR&export=download&authuser=0&confirm=t&uuid=574d533a-8df9-4f33-bebd-689b623f27a9&at=APZUnTVruCDRxY661_PVT9BA3839:1704180420603",
"path": "Desktop/Capitalize_First_Letter.docx"
}
]
@@ -30,13 +30,13 @@
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://doc-10-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/tj4rm8e6bt50lht09qgev340nk/1703930035000/108888117743638485671/108888117743638485671/1vmb6LphTM8jKf2K7fkrmeWhy_wEgMtvthJx7KFtBwhE?format=docx&dat=AOBvIb1sqCYIZBDWtevTgWhVtsxSMJl1RuUy-MFBRUZszTYkE1Y_qhzC_RfgEvMx5kJz_GECf3mFQkPQq6Re7HFsKVaDU4_KXQE13ZM1cCIEtAEJ7UKZPcI55xO_dMh7ig_1wArgt_jviaEmA5RJcdLHu-omc7W23lcTfPZQpZTextkU7vtgQJGceYeC25JIdRPpsTYVXvqhD5Bjtq-ArRQO4f3huW-TfFKdiVh3MFjkuMx8fMg3l60l8JH_lUqw2BqCqzQDVeD_ajYrmzrFQMP5rFXj353S5HtCiAdSlClI1I6nRMLAELwtsgkqEIc5pwNxcOUKZU1lHkCl1wljzOkrLxRSPlQ1Hb2h0YbRVbPARBB6ywe5QooHn9HatQr_4hkzMTRug4Qv-fo39-F5Uy5bNeGPlK4tDtOUPUDUQs0Kbnn_gT9zzSUCXj4BW85tmNtCNc-Akt4_tPXGyEqlNELpFeBaK27EETF6S93N7C5OU9SfYbL8u29YgTLq1229JmJ3dcUr8yDv2oFLx9x_PNbAStSYABZaDCi1B5B2gPSUvxdQ7CtkoFodD0e7XwBWqDi3jC1N2LdBa8mUsIkFVJvI3PmixODcgzJb5MTkKBwWKHw0UqV-Zsl2whtWEEMeeu6HdgsIiuzSs56dUDsOIJXhu2PfIojjyoX91-NeffGEVQ5-w9l3_EfNpOUHLli3_Ju8w5YvjNoS9gU-g2HTdljnWydN0j0jiz1otjiE0oQxMzVqvWNMa3Qap2vPvQMVoOB_7SwBzcEVmi-SnitWvrXIXs3o585Qc6MBeDQ20D0VhJGsFJ8vVqxtDI8AOIC-t8NaYatFoKXuQLJckJ1wcqA7NmFxWa2hWU79l6dwPztsK9w0VJQyMSwJOMFPXWU",
"path": "https://drive.usercontent.google.com/download?id=1ykIGg48GjYUOSw8t44evShJE2HX7R5w3&export=download&authuser=0&confirm=t&uuid=ed81d6cd-6044-49f1-be86-35adeaeeea00&at=APZUnTUxW8WLyPr-_smA2Mnwpuuv:1704180422490https://drive.usercontent.google.com/download?id=1ykIGg48GjYUOSw8t44evShJE2HX7R5w3&export=download&authuser=0&confirm=t&uuid=ed81d6cd-6044-49f1-be86-35adeaeeea00&at=APZUnTUxW8WLyPr-_smA2Mnwpuuv:1704180422490",
"dest": "Capitalize_First_Letter_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Capitalize_First_Letter.xlsx",
"dest": "Capitalize_First_Letter.xlsx"
"path": "Desktop/Capitalize_First_Letter.docx",
"dest": "Capitalize_First_Letter.docx"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "ecc2413d-8a48-416e-a3a2-d30106ca36cb",
"snapshot": "libreoffice_writer",
"instruction": "Insert a blank page here",
"instruction": "Insert a blank page",
"source": "https://www.quora.com/How-can-I-insert-a-blank-page-on-libreoffice",
"config": [
{
@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-0s-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/nqen29nemcpds1sk8t0ck303a8/1703926705000/108888117743638485671/108888117743638485671/10KfmCo_A5M04w4TA_qzPkd6vw04K3xzBzluIgeRCXuo?format=docx&dat=AOBvIb208vnoN2v6hNgA1FJ6jqegOR6oyVuKcoeroXXtmMiXWTao5vsteKCqxz2lnj5rioDVslkXyDQPCKYBnAJhtvKsIE8cJ7V2DtYuL9jFKYtJgClq2RsfpGE0hgvlLuoLZmeaRLaEJv10NTPGRTULPYwgN-RqLdnyG4EojXxrTKxk4TfVM7xAqob7uE24vFyGMH4uMctC2obdXNsDKnZ4dM1eao-3ABtlyPzSVpT891ziY_SMclP7l7y7cLqw4S11zbbnteVYpM5ytRlgpWKFrKvCmX8gRKpT0ENcNlL4ILJdi3KOAhU97X3vQNkS0LyqmAzKjBeuxz5tD3CuAj7LN1xu2t-DaLVvvrZPm_-XHFGvbHdy5wY66HtMqqPmaRb9_898Tl5ODZxfd5XP1CCRw-c8ohA2Jmdl86Scr8XDA7C_mAT8m_E1FLvJLJhJ_TyL74H0-TRiAPA2noaX2PUmOt4G1qFF_aIOn56iPPt3hB3eHvgthD0bVuW3TZUyr6cP4ZM_TF7g9awhXa1xWusltHItieNfNaJOPiI4Lacon_uICbbpSvEhuq5-apCsnwXpKIvK18UKP5u1Fw1Zb8AhAocJpHLxej87mInzYfFr7XAdf1kiPPxh1zRL2yW_Qe-J4YxWn0oBRrNrf_IgfQK_z9QRXgzzS3xaby2AsmA0qMNHIbMT73Uvha0TvO5UxozPc-aejeuaoi7ot27uSAwd0Cd4Yi-d4e7qfqVgNqvLl-psT9ZZ7cWq8vhU2lPiHrlmhVIwiWjf-s57gRNyXN99SY7MLG-b_JhOI43JgzZzfhjMc0EG2UrCxEEiOOGCp57BwH9FjqM1SQSenAlPmy28e8wCShBwZba_WUbwStumKQakIkwYqeoc0VoJN38",
"url": "https://drive.usercontent.google.com/download?id=1sDufDSC4foI379-Jikya9WK7FBUSqgrt&export=download&authuser=0&confirm=t&uuid=0abd82d6-2b2c-49bc-af5e-49bfe1c99278&at=APZUnTURIqTNJcIHBcMP2BxEaGXr:1704174850900",
"path": "Desktop/Insert_Blank_Page.docx"
}
]

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-10-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/jq5b3etcrfdc25il9orjsk8jgo/1703926500000/108888117743638485671/108888117743638485671/1ufVZd-Uibt9pVClmK9BceqMR6iQNSekH5ECxysnPehY?format=docx&dat=AOBvIb3K-ByHFQ8OY7SbFlbA41gbWygryhR0tjcDhZuUWmdje6d2VxzZsK00RoorX_LOOjpnln1zFpw9-W1PLbjKMx1-cOGZfuVpqBiL3mOiYLdQPxqqPgrRKjzJzeD0SZOCK96nu8wIGoY-tDVwAoGzf98-lxjDOO1Z3slrW4YeTUPZQ17EusYw75S8FzBIMxW9UGzMPMtubUK_JVrHQOU-ghu8bz0atPRrkB44ysWeF0W063sg03ysAnb1557Ie0p3RgrcMc9aeGtKvQFCo0Tr7BkR93D2klp6M5pDMJekgtUGxurwiEmNeZ6nRhp-bYoev1uesAhGzZONVi_1DtaHvGzL6MGMIzfV5rWtMXbFI1CBwtP00AuF5qFOD6l2wkRVogas48MWOxBCX-bcUHOxezVDmxb0ohfCveIDMq0s8ebY5HggfrE9I8pMs-2GNPABUSr4S7MkRO-2yzy-j8pgTtzO3QRc146gd9Hci6aYoAnBIludK31AsLckcVba-OrEyB7Lx31sfzvdITS8nZ4Cg_JWMV9CugNgF_8w0SprvDMw9vsoEjYaJpY2Z_K445GGENY7dGRQbGmBhLeP9wJBXHsNhObWKV71BrPm2wSOJLrFU2iLa5jLY7mkz7xKhq3e9dDttus9c6A0KPj1f54YAsvZ_SEPbE1WBVzMYPD3MV-6yw2KbKgZxYQ9A0lf87KoffIbA24Y2S97FBuOWJ5ZVN2rz02PbpXyuMf1fcnUb8JpAm6ewwArKqtmIJg20hySiYOtZUgfQvjwBaDrMhQjKGKYiLXIEdGTWVQuuTGQhG8pqd4StbxUsCwdMiFOFVXV0mNNncz3QZEOPF5fgW564KuE9qFClhq620ve61mgg6_3S2kQ9RhHYaShvuI",
"url": "https://drive.usercontent.google.com/download?id=1X2XTU2ZFuMXOhm7T400e6AOe6eBYxWzD&export=download&authuser=0&confirm=t&uuid=1318923f-6d54-4148-aa80-a454b9963cec&at=APZUnTU-h1nmcjBO_ytWVxXuh8l9:1704187013730",
"path": "Desktop/Set_Default_Font.docx"
}
]
@@ -28,10 +28,15 @@
],
"evaluator": {
"func": "find_default_font",
"expected": "Times New Roman",
"expected": {
"type": "rule",
"rules": {
"font_name": "Times New Roman"
}
},
"result": {
"type": "vm_file",
"path": "/home/[your-username]/.config/libreoffice/4/user/registrymodifications.xcu",
"path": "/home/user/.config/libreoffice/4/user/registrymodifications.xcu",
"dest": "registrymodifications.xcu"
}
}

View File

@@ -3,10 +3,9 @@
import os
import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import PIL.Image
import json
import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
import torch
import argparse
@@ -15,7 +14,7 @@ import argparse
from seem.modeling.BaseModel import BaseModel as BaseModel_Seem
from seem.utils.distributed import init_distributed as init_distributed_seem
from seem.modeling import build_model as build_model_seem
from task_adapter.seem.tasks import interactive_seem_m2m_auto, inference_seem_pano, inference_seem_interactive
from task_adapter.seem.tasks import inference_seem_pano
# semantic sam
from semantic_sam.BaseModel import BaseModel
@@ -28,14 +27,42 @@ from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_sw
# sam
from segment_anything import sam_model_registry
from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto
from task_adapter.sam.tasks.inference_sam_m2m_interactive import inference_sam_m2m_interactive
from scipy.ndimage import label
from io import BytesIO
import numpy as np
SYS_PROMPT = '''
You will act as an agent which follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
Firstly you need to predict the class of your action, select from one below:
- **CLICK**: click on the screen with the specified integer label
- **TYPE**: type a string on the keyboard
- For CLICK, you need to predict the correct integer label shown on the screenshot
for example, format as:
```
{
"action_type": "CLICK",
"label": 7
}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
You can predict multiple actions at one step, but you should only return one action for each step.
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
'''
build args
'''
# build args
semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml"
seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml"
@@ -47,9 +74,7 @@ opt_semsam = load_opt_from_config_file(semsam_cfg)
opt_seem = load_opt_from_config_file(seem_cfg)
opt_seem = init_distributed_seem(opt_seem)
'''
build model
'''
# build model
model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda()
model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda()
model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda()
@@ -65,65 +90,54 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs
elif slider > 2.5:
model_name = 'sam'
else:
if mode == 'Automatic':
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
elif slider < 1.5 + 0.42:
level = [3]
elif slider < 1.5 + 0.56:
level = [4]
elif slider < 1.5 + 0.70:
level = [5]
elif slider < 1.5 + 0.84:
level = [6]
else:
level = [6, 1, 2, 3, 4, 5]
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
elif slider < 1.5 + 0.42:
level = [3]
elif slider < 1.5 + 0.56:
level = [4]
elif slider < 1.5 + 0.70:
level = [5]
elif slider < 1.5 + 0.84:
level = [6]
else:
model_name = 'sam'
level = [6, 1, 2, 3, 4, 5]
if label_mode == 'Alphabet':
label_mode = 'a'
else:
label_mode = '1'
text_size, hole_scale, island_scale = 640, 100, 100
text_size, hole_scale, island_scale = 1280, 100, 100
text, text_part, text_thresh = '', '', '0.0'
with torch.autocast(device_type='cuda', dtype=torch.float16):
semantic = False
if mode == "Interactive":
labeled_array, num_features = label(np.asarray(image['mask'].convert('L')))
spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])
if model_name == 'semantic-sam':
model = model_semsam
output, mask = inference_semsam_m2m_auto(model, image['image'], level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
output, mask = inference_semsam_m2m_auto(model, image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
elif model_name == 'sam':
model = model_sam
if mode == "Automatic":
output, mask = inference_sam_m2m_auto(model, image['image'], text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_sam_m2m_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_auto(model, image, text_size, label_mode, alpha, anno_mode)
elif model_name == 'seem':
model = model_seem
if mode == "Automatic":
output, mask = inference_seem_pano(model, image['image'], text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_seem_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_pano(model, image, text_size, label_mode, alpha, anno_mode)
return output
return output, mask
# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def encode_image(image):
pil_img = PIL.Image.fromarray(image)
buff = BytesIO()
pil_img.save(buff, format="JPEG")
new_image_string = base64.b64encode(buff.getvalue()).decode("utf-8")
return new_image_string
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
@@ -156,7 +170,6 @@ def parse_actions_from_string(input_string):
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -181,7 +194,8 @@ class GPT4v_Agent:
]
def predict(self, obs):
obs = inference(obs, slider=2.0, mode="Automatic", alpha=0.1, label_mode="Alphabet", anno_mode=["Mask", "Mark"])
obs, mask = inference(obs, slider=3.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"])
PIL.Image.fromarray(obs).save("desktop.jpeg")
base64_image = encode_image(obs)
self.trajectory.append({
"role": "user",
@@ -212,14 +226,14 @@ class GPT4v_Agent:
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
actions = self.parse_actions(response.json()['choices'][0]['message']['content'], mask)
except:
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None
return actions
def parse_actions(self, response: str):
def parse_actions(self, response: str, mask):
# response example
"""
```json
@@ -232,6 +246,7 @@ class GPT4v_Agent:
# parse from the response
actions = parse_actions_from_string(response)
print(actions)
# add action into the trajectory
self.trajectory.append({
@@ -247,24 +262,14 @@ class GPT4v_Agent:
# parse action
parsed_actions = []
for action in actions:
parsed_action = {}
action_type = Action[action['action_type']].value
parsed_action["action_type"] = action_type
action_type = action['action_type']
if action_type == "CLICK":
label = int(action['label'])
x, y, w, h = mask[label-1]['bbox']
parsed_actions.append({"action_type": action_type, "x": int(x + w//2) , "y": int(y + h//2)})
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
parsed_action["click_type"] = MouseClick[action['click_type']].value
if action_type == Action.MOUSE_MOVE.value:
parsed_action["x"] = action["x"]
parsed_action["y"] = action["y"]
if action_type == Action.KEY.value:
parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys
if action_type == Action.TYPE.value:
parsed_action["text"] = action["text"]
parsed_actions.append(parsed_action)
if action_type == "TYPE":
parsed_actions.append({"action_type": action_type, "text": action["text"]})
return parsed_actions
@@ -273,5 +278,6 @@ if __name__ == '__main__':
# OpenAI API Key
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
print(agent.predict(obs="stackoverflow.png"))
agent = GPT4v_Agent(api_key=api_key, instruction="Open Firefox")
obs = PIL.Image.open('desktop.png')
print(agent.predict(obs=obs))

Binary file not shown.


View File

@@ -0,0 +1,401 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"
# Resume Logistic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: False
# Logging and Debug
WANDB: False
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false
# Speed up training
FP16: false
PORT: '36873'
# misc
LOADER:
JOINT: False
KEY_DATASET: 'coco'
##################
# Task settings
##################
VERBOSE: true
MODEL:
NAME: seem_model_v1
HEAD: xdecoder_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
TEXT:
ARCH: vlpencoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 77 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: focal
PRETRAINED: ''
LOAD_PRETRAINED: false
FOCAL:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [2, 2, 18, 2]
FOCAL_LEVELS: [4, 4, 4, 4]
FOCAL_WINDOWS: [3, 3, 3, 3]
DROP_PATH_RATE: 0.3
MLP_RATIO: 4.0
DROP_RATE: 0.0
PATCH_NORM: True
USE_CONV_EMBED: True
SCALING_MODULATOR: True
USE_CHECKPOINT: False
USE_POSTLN: true
USE_POSTLN_IN_MODULATION: false
USE_LAYERSCALE: True
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
OUT_INDICES: [0, 1, 2, 3]
ENCODER:
NAME: transformer_encoder_fpn
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 512
MASK_DIM: 512
NORM: "GN"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
DECODER:
NAME: seem_v1
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK:
ENABLED: True
DETECTION: False
SPATIAL:
ENABLED: True
MAX_ITER: 1
GROUNDING:
ENABLED: True
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
RETRIEVAL:
ENABLED: False
LVIS:
ENABLED: True
THRES: 0.7
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.5
SIM_THRES: 0.95
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
GCLASS_WEIGHT: 0.4
GMASK_WEIGHT: 1.0
GDICE_WEIGHT: 1.0
SCLASS_WEIGHT: 0.4
SMASK_WEIGHT: 1.0
SDICE_WEIGHT: 1.0
OCLASS_WEIGHT: 0.4
OMASK_WEIGHT: 1.0
ODICE_WEIGHT: 1.0
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BBOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
CAPTION_WEIGHT: 2.0
COST_SPATIAL:
CLASS_WEIGHT: 5.0
MASK_WEIGHT: 2.0
DICE_WEIGHT: 2.0
HIDDEN_DIM: 512
NUM_OBJECT_QUERIES: 101
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
MAX_SPATIAL_LEN: [512, 512, 512, 512]
# ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TOP_GROUNDING_LAYERS: 10
TOP_CAPTION_LAYERS: 10
TOP_SPATIAL_LAYERS: 10
TOP_OPENIMAGE_LAYERS: 10
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
# Spatial sampler
STROKE_SAMPLER:
MAX_CANDIDATE: 1
CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
DILATION: 3
CIRCLE:
NUM_STROKES: 5
STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
STROKE_PROB: [0.33, 0.33, 0.33]
SCRIBBLE:
NUM_STROKES: 5
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
STROKE_PROB: [0.5, 0.5]
POINT:
NUM_POINTS: 20
POLYGON:
MAX_POINTS: 9
EVAL:
MODE: 'best' # best/random/best_random
NEGATIVE: False
MAX_ITER: 20
IOU_ITER: 1
GROUNDING: False
# Multi-modal Architecture, order matters
ATTENTION_ARCH:
VARIABLE:
queries: ['object', 'grounding', 'spatial']
tokens: ['grounding', 'spatial']
memories: ['spatial']
SELF_ATTENTION:
queries:
object: ['queries_object']
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
tokens:
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['tokens_spatial']
memories:
spatial: ['memories_spatial']
CROSS_ATTENTION:
queries:
object: True
grounding: True
spatial: True
memories:
spatial: True
tokens:
grounding: False
spatial: False
MASKING: ['tokens_spatial', 'tokens_grounding']
DUPLICATION:
queries:
grounding: 'queries_object'
spatial: 'queries_object'
SPATIAL_MEMORIES: 32
QUERY_NUMBER: 3
DATASETS:
TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",]
# TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well
# TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 4
BATCH_SIZE_PER_GPU: 4
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TRAIN_SAMPLING: 'choice'
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
# Validation dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: 640
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: (640, 640)
SINGLE_CATEGORY_MAX_AREA: 1.0
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
SBD:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
VOC:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
DAVIS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
VOS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
SPATIAL: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR: 0.0001
STEPS: [0.88889, 0.96296]
MAX_ITER: 1
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
FIX_PARAM:
backbone: True
lang_encoder: True
pixel_decoder: True
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 5.0 # 0.01
NORM_TYPE: 2.0
MAX_NUM_EPOCHS: 50
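
The file above is plain YAML, so it can be inspected outside the trainer. Below is a minimal sketch, assuming PyYAML and a hypothetical filename `seem_focall_v1.yaml`; the training pipeline presumably uses its own config loader rather than this.

```python
import yaml

# Load the config above and read back a few of its fields; the filename is an assumption.
with open("seem_focall_v1.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["SOLVER"]["BASE_LR"])                       # 0.0001
print(cfg["MODEL"]["DECODER"]["NUM_OBJECT_QUERIES"])  # 101
print(cfg["DATASETS"]["TRAIN"])                       # training dataset names
```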

View File

@@ -0,0 +1,524 @@
# ------------------------------------------------------------------------
# Semantic SAM
# Copyright (c) MicroSoft, Inc. and its affiliates.
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
# ------------------------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: interactive_mask_dino
HEAD: general_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: noencoder # no language encoder when training only on SA-1B data
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 384
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [ 2, 2, 18, 2 ]
NUM_HEADS: [ 6, 12, 24, 48 ]
WINDOW_SIZE: 12
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 1
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: interactive_mask_dino
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
PART: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
IOU_WEIGHT: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 0
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: False
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: False
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: False
O365: False
SAM: True
PASCAL: False
RE_POINT: True
NUM_INTERACTIVE_TOKENS: 6
MAX_NUM_INSTANCE: 60
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
BOX_INTERACTIVE: False
CLASSIFICATION_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["sam_train"]
# interactive segmentation evaluation.
TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
# TEST: ["sam_minival"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
#ADE20K:
# INPUT:
# MIN_SIZE_TRAIN: 640
# MIN_SIZE_TRAIN_SAMPLING: "choice"
# MIN_SIZE_TEST: 640
# MAX_SIZE_TRAIN: 2560
# MAX_SIZE_TEST: 2560
# MASK_FORMAT: "polygon"
# CROP:
# ENABLED: True
# TYPE: "absolute"
# SIZE: (640, 640)
# SINGLE_CATEGORY_MAX_AREA: 1.0
# COLOR_AUG_SSD: True
# SIZE_DIVISIBILITY: 640 # used in dataset mapper
# DATASET_MAPPER_NAME: "mask_former_panoptic"
# FORMAT: "RGB"
# DATASET:
# DATASET: 'ade'
# TEST:
# BATCH_SIZE_TOTAL: 8
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True

BIN
mm_agents/desktop.png Normal file

Binary file not shown.

After

Width: | Height: | Size: 1.5 MiB

View File

@@ -0,0 +1,3 @@
wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth
wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
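
A quick sanity check, offered only as a sketch, that a downloaded checkpoint deserializes on CPU; nothing about each file's internal layout is assumed here.

```python
import torch

# Peek at one of the downloaded checkpoints without moving it to a GPU.
ckpt = torch.load("sam_vit_h_4b8939.pth", map_location="cpu")
print(type(ckpt))
if isinstance(ckpt, dict):
    print(list(ckpt.keys())[:10])  # first few top-level keys
```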

19
mm_agents/gemini_test.py Normal file
View File

@@ -0,0 +1,19 @@
import os
import PIL.Image
import google.generativeai as genai
# Read the API key from the environment instead of committing it to the repository;
# the variable name GOOGLE_API_KEY is a convention chosen here.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# for m in genai.list_models():
#     if 'generateContent' in m.supported_generation_methods:
#         print(m.name)
model = genai.GenerativeModel('gemini-pro-vision')
img = PIL.Image.open('image.jpg')
messages = [
{'role': 'user',
'parts': ["Explain this image.", img]}
]
response = model.generate_content(messages)
print(response.text)

View File

@@ -0,0 +1,13 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn_func import MSDeformAttnFunction

View File

@@ -0,0 +1,72 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
try:
import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError as e:
info_string = (
"\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
"\t`cd mask2former/modeling/pixel_decoder/ops`\n"
"\t`sh make.sh`\n"
)
raise ModuleNotFoundError(info_string)
class MSDeformAttnFunction(Function):
@staticmethod
def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
ctx.im2col_step = im2col_step
output = MSDA.ms_deform_attn_forward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
grad_value, grad_sampling_loc, grad_attn_weight = \
MSDA.ms_deform_attn_backward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# Pure-PyTorch reference implementation: intended for debugging and testing only;
# prefer the CUDA version for real workloads.
N_, S_, M_, D_ = value.shape
_, Lq_, M_, L_, P_, _ = sampling_locations.shape
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for lid_, (H_, W_) in enumerate(value_spatial_shapes):
# N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
# N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
# N_*M_, D_, Lq_, P_
sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
mode='bilinear', padding_mode='zeros', align_corners=False)
sampling_value_list.append(sampling_value_l_)
# (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
return output.transpose(1, 2).contiguous()
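
The fallback `ms_deform_attn_core_pytorch` operates on plain CPU tensors, although importing this module still requires the compiled `MultiScaleDeformableAttention` op because of the module-level import above. A sketch mirroring the tensor shapes used in `mm_agents/ops/test.py` (the concrete sizes are arbitrary):

```python
import torch
from functions.ms_deform_attn_func import ms_deform_attn_core_pytorch  # import path as in test.py

N, M, D = 1, 2, 4      # batch, attention heads, channels per head
Lq, L, P = 3, 2, 2     # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
S = int(shapes.prod(1).sum())  # total number of spatial positions across levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)        # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)  # (N, Lq, M*D) -> torch.Size([1, 3, 8])
```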

13
mm_agents/ops/make.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
python setup.py build install

View File

@@ -0,0 +1,12 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn import MSDeformAttn

View File

@@ -0,0 +1,125 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n-1) == 0) and n != 0
class MSDeformAttn(nn.Module):
def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
"""
Multi-Scale Deformable Attention Module
:param d_model hidden dimension
:param n_levels number of feature levels
:param n_heads number of attention heads
:param n_points number of sampling points per attention head per feature level
"""
super().__init__()
if d_model % n_heads != 0:
raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
_d_per_head = d_model // n_heads
# _d_per_head should ideally be a power of 2, which is more efficient in the CUDA implementation
if not _is_power_of_2(_d_per_head):
warnings.warn("Set d_model in MSDeformAttn so that the dimension of each attention head is a power of 2, "
"which is more efficient in the CUDA implementation.")
self.im2col_step = 128
self.d_model = d_model
self.n_levels = n_levels
self.n_heads = n_heads
self.n_points = n_points
self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
self.value_proj = nn.Linear(d_model, d_model)
self.output_proj = nn.Linear(d_model, d_model)
self._reset_parameters()
def _reset_parameters(self):
constant_(self.sampling_offsets.weight.data, 0.)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.)
constant_(self.attention_weights.bias.data, 0.)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
"""
:param query (N, Length_{query}, C)
:param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
:param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
:param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
:param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
:param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
:return output (N, Length_{query}, C)
"""
N, Len_q, _ = query.shape
N, Len_in, _ = input_flatten.shape
assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
value = self.value_proj(input_flatten)
if input_padding_mask is not None:
value = value.masked_fill(input_padding_mask[..., None], float(0))
value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
# N, Len_q, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
else:
raise ValueError(
'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
try:
output = MSDeformAttnFunction.apply(
value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
except Exception:
# CUDA op unavailable: fall back to the pure-PyTorch reference implementation (CPU path).
output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
# # For FLOPs calculation only
# output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output
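
A usage sketch for `MSDeformAttn`, assuming the CUDA op has been built with `mm_agents/ops/make.sh` and a GPU is available; the import path is relative to the `ops` directory.

```python
import torch
from modules import MSDeformAttn  # exported by modules/__init__.py

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()

N, Lq = 1, 5
spatial_shapes = torch.as_tensor([(16, 16), (8, 8)], dtype=torch.long).cuda()
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))
Len_in = int(spatial_shapes.prod(1).sum())

query = torch.rand(N, Lq, 256).cuda()
input_flatten = torch.rand(N, Len_in, 256).cuda()
reference_points = torch.rand(N, Lq, 2, 2).cuda()  # (N, Length_query, n_levels, 2) in [0, 1]

out = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(out.shape)  # torch.Size([1, 5, 256])
```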

78
mm_agents/ops/setup.py Normal file
View File

@@ -0,0 +1,78 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
# FORCE_CUDA builds the CUDA extension even when torch.cuda.is_available() is False (e.g. on build machines without a visible GPU).
if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
if CUDA_HOME is None:
raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
else:
raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"MultiScaleDeformableAttention",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="MultiScaleDeformableAttention",
version="1.0",
author="Weijie Su",
url="https://github.com/fundamentalvision/Deformable-DETR",
description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
packages=find_packages(exclude=("configs", "tests",)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

View File

@@ -0,0 +1,46 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}

View File

@@ -0,0 +1,38 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include <torch/extension.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);

View File

@@ -0,0 +1,158 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include <vector>
#include "cuda/ms_deform_im2col_cuda.cuh"
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must be divisible by im2col_step(%d)", batch, im2col_step_);
auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
columns.data<scalar_t>());
}));
}
output = output.view({batch, num_query, num_heads*channels});
return output;
}
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must be divisible by im2col_step(%d)", batch, im2col_step_);
auto grad_value = at::zeros_like(value);
auto grad_sampling_loc = at::zeros_like(sampling_loc);
auto grad_attn_weight = at::zeros_like(attn_weight);
const int batch_n = im2col_step_;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
}));
}
return {
grad_value, grad_sampling_loc, grad_attn_weight
};
}

View File

@@ -0,0 +1,35 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include <torch/extension.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);

File diff suppressed because it is too large.

View File

@@ -0,0 +1,67 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif
at::Tensor
ms_deform_attn_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_forward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
ms_deform_attn_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}

View File

@@ -0,0 +1,21 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include "ms_deform_attn.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}

92
mm_agents/ops/test.py Normal file
View File

@@ -0,0 +1,92 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
S = sum([(H*W).item() for H, W in shapes])
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
value = torch.rand(N, S, M, channels).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
func = MSDeformAttnFunction.apply
value.requires_grad = grad_value
sampling_locations.requires_grad = grad_sampling_loc
attention_weights.requires_grad = grad_attn_weight
gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
print(f'* {gradok} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
check_forward_equal_with_pytorch_double()
check_forward_equal_with_pytorch_float()
for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
check_gradient_numerical(channels, True, True, True)

Binary file not shown.

Before

Width: | Height: | Size: 1.0 MiB

View File

View File

@@ -0,0 +1,2 @@
from .inference_sam_m2m_auto import *
from .inference_sam_m2m_interactive import *

View File

@@ -0,0 +1,103 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from segment_anything import SamAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def inference_sam_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
mask_generator = SamAutomaticMaskGenerator(model)
outputs = mask_generator.generate(image_ori)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
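
A sketch of driving `inference_sam_m2m_auto` with the SAM checkpoint from the download script earlier in this change; the image path and output filename are placeholders.

```python
from PIL import Image
from segment_anything import sam_model_registry

# "vit_h" matches the sam_vit_h_4b8939.pth checkpoint downloaded above.
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth").cuda()
image = Image.open("desktop.png").convert("RGB")  # placeholder input image

annotated, anns = inference_sam_m2m_auto(sam, image, text_size=640)
Image.fromarray(annotated).save("desktop_annotated.png")
print(len(anns), "masks")
```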

View File

@@ -0,0 +1,221 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import torch.nn.functional as F
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
from kornia.contrib import distance_transform
import matplotlib.pyplot as plt
import cv2
import io
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything import SamAutomaticMaskGenerator
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def sam_interactive_mask(mask_generator, points, in_points, in_labels, mask_input):
masks, iou_preds, _ = mask_generator.predictor.predict_torch(
in_points,
in_labels,
mask_input=mask_input,
multimask_output=True,
return_logits=True,
)
nm,_,h,w = masks.shape
# Serialize predictions and store in MaskData
data = MaskData(
masks=masks.flatten(0, 1),
iou_preds=iou_preds.flatten(0, 1),
points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
)
del masks
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], mask_generator.predictor.model.mask_threshold, mask_generator.stability_score_offset
)
masks = data["masks"].reshape(nm, -1, h, w)
scores = (data['iou_preds'] + data['stability_score']).reshape(nm, -1)
index = torch.stack([torch.arange(nm).cuda(), scores.argmax(dim=1)]).tolist()
return masks[index]
def inference_sam_m2m_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
spatial_masks = spatial_masks[:, None].float().cuda()
spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
# generate single center point
# n,_,h,w = spatial_masks.shape
# mask_dt = (distance_transform((~F.pad(spatial_masks, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1)
# max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
# next_mask = torch.zeros(spatial_masks.shape, device=torch.cuda.current_device()).bool()
# next_mask = next_mask.view(n,-1)
# next_mask[max_xy_idx] = True
# next_mask = next_mask.reshape((n,1,h,w))
# points = next_mask.nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
# stack sampled points
acc_points = []
for i in range(len(spatial_masks)):
points = spatial_masks[i:i+1].nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
rand_ids = np.random.choice(points.shape[0], size=40, replace=True)
points = points[rand_ids]
acc_points.append(points)
_np = len(acc_points)
points = np.concatenate(acc_points)
mask_generator = SamAutomaticMaskGenerator(model)
mask_generator.predictor.set_image(image_ori)
im_size = image_ori.shape[:-1]
transformed_points = mask_generator.predictor.transform.apply_coords(points, im_size)
in_points = torch.as_tensor(transformed_points, device=mask_generator.predictor.device).reshape(_np,-1,2).transpose(0,1)
in_labels = torch.ones((in_points.shape[0], _np), dtype=torch.int, device=mask_generator.predictor.device)
masks = sam_interactive_mask(mask_generator, points, in_points.transpose(0,1), in_labels.transpose(0,1), None)
masks = masks > 0.0
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
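
The interactive path above turns each user-supplied spatial mask into point prompts by sampling pixel coordinates inside the mask. A minimal, self-contained sketch of that sampling step on synthetic masks (the 40-point count mirrors the loop above; the SAM predictor call is omitted since it depends on the loaded model):

```python
import numpy as np
import torch

# Two made-up boolean "spatial masks" standing in for user strokes on a 64x64 image.
spatial_masks = torch.zeros(2, 64, 64, dtype=torch.bool)
spatial_masks[0, 10:20, 10:20] = True
spatial_masks[1, 40:60, 5:25] = True

num_points = 40  # same count the function above samples per mask
acc_points = []
for i in range(len(spatial_masks)):
    # nonzero() yields (mask_idx, y, x); keep (y, x) and flip to (x, y) pixel coords
    points = spatial_masks[i:i + 1].nonzero()[:, 1:].flip(dims=[1]).numpy()
    rand_ids = np.random.choice(points.shape[0], size=num_points, replace=True)
    acc_points.append(points[rand_ids])

points = np.concatenate(acc_points)  # shape (80, 2), ready to use as point prompts
print(points.shape)
```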

View File

View File

@@ -0,0 +1,3 @@
from .interactive_seem_m2m_auto import *
from .inference_seem_pano import *
from .inference_seem_interactive import *

View File

@@ -0,0 +1,382 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops.boxes import batched_nms, box_area # type: ignore
from typing import Any, Dict, List, Optional, Tuple
from segment_anything.modeling import Sam
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
class SeemAutomaticMaskGenerator:
def __init__(
self,
model: Sam,
points_per_side: Optional[int] = 32,
points_per_batch: int = 64,
pred_iou_thresh: float = 0.9,
stability_score_thresh: float = 0.5,
stability_score_offset: float = 1.0,
box_nms_thresh: float = 0.7,
crop_n_layers: int = 0,
crop_nms_thresh: float = 0.7,
crop_overlap_ratio: float = 512 / 1500,
crop_n_points_downscale_factor: int = 1,
point_grids: Optional[List[np.ndarray]] = None,
min_mask_region_area: int = 0,
output_mode: str = "binary_mask",
) -> None:
"""
Using a SAM model, generates masks for the entire image.
Generates a grid of point prompts over the image, then filters
low quality and duplicate masks. The default settings are chosen
for SAM with a ViT-H backbone.
Arguments:
model (Sam): The SAM model to use for mask prediction.
points_per_side (int or None): The number of points to be sampled
along one side of the image. The total number of points is
points_per_side**2. If None, 'point_grids' must provide explicit
point sampling.
points_per_batch (int): Sets the number of points run simultaneously
by the model. Higher numbers may be faster but use more GPU memory.
pred_iou_thresh (float): A filtering threshold in [0,1], using the
model's predicted mask quality.
stability_score_thresh (float): A filtering threshold in [0,1], using
the stability of the mask under changes to the cutoff used to binarize
the model's mask predictions.
stability_score_offset (float): The amount to shift the cutoff when
calculating the stability score.
box_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks.
crop_n_layers (int): If >0, mask prediction will be run again on
crops of the image. Sets the number of layers to run, where each
layer has 2**i_layer number of image crops.
crop_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks between different crops.
crop_overlap_ratio (float): Sets the degree to which crops overlap.
In the first crop layer, crops will overlap by this fraction of
the image length. Later layers with more crops scale down this overlap.
crop_n_points_downscale_factor (int): The number of points-per-side
sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
point_grids (list(np.ndarray) or None): A list over explicit grids
of points used for sampling, normalized to [0,1]. The nth grid in the
list is used in the nth crop layer. Exclusive with points_per_side.
min_mask_region_area (int): If >0, postprocessing will be applied
to remove disconnected regions and holes in masks with area smaller
than min_mask_region_area. Requires opencv.
output_mode (str): The form masks are returned in. Can be 'binary_mask',
'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
For large resolutions, 'binary_mask' may consume large amounts of
memory.
"""
assert (points_per_side is None) != (
point_grids is None
), "Exactly one of points_per_side or point_grid must be provided."
if points_per_side is not None:
self.point_grids = build_all_layer_point_grids(
points_per_side,
crop_n_layers,
crop_n_points_downscale_factor,
)
elif point_grids is not None:
self.point_grids = point_grids
else:
raise ValueError("Can't have both points_per_side and point_grid be None.")
assert output_mode in [
"binary_mask",
"uncompressed_rle",
"coco_rle",
], f"Unknown output_mode {output_mode}."
if output_mode == "coco_rle":
from pycocotools import mask as mask_utils # type: ignore # noqa: F401
if min_mask_region_area > 0:
import cv2 # type: ignore # noqa: F401
self.predictor = model
self.points_per_batch = points_per_batch
self.pred_iou_thresh = pred_iou_thresh
self.stability_score_thresh = stability_score_thresh
self.stability_score_offset = stability_score_offset
self.box_nms_thresh = box_nms_thresh
self.crop_n_layers = crop_n_layers
self.crop_nms_thresh = crop_nms_thresh
self.crop_overlap_ratio = crop_overlap_ratio
self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
self.min_mask_region_area = min_mask_region_area
self.output_mode = output_mode
# dilate conv
self.dilation = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=7, stride=1, padding=3, bias=False)
self.dilation.weight.data.fill_(1.0)
self.dilation.cuda()
@torch.no_grad()
def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
"""
Generates masks for the given image.
Arguments:
image (np.ndarray): The image to generate masks for, in HWC uint8 format.
Returns:
list(dict(str, any)): A list over records for masks. Each record is
a dict containing the following keys:
segmentation (dict(str, any) or np.ndarray): The mask. If
output_mode='binary_mask', is an array of shape HW. Otherwise,
is a dictionary containing the RLE.
bbox (list(float)): The box around the mask, in XYWH format.
area (int): The area in pixels of the mask.
predicted_iou (float): The model's own prediction of the mask's
quality. This is filtered by the pred_iou_thresh parameter.
point_coords (list(list(float))): The point coordinates input
to the model to generate this mask.
stability_score (float): A measure of the mask's quality. This
is filtered on using the stability_score_thresh parameter.
crop_box (list(float)): The crop of the image used to generate
the mask, given in XYWH format.
"""
# Generate masks
mask_data = self._generate_masks(image)
# Filter small disconnected regions and holes in masks
if self.min_mask_region_area > 0:
mask_data = self.postprocess_small_regions(
mask_data,
self.min_mask_region_area,
max(self.box_nms_thresh, self.crop_nms_thresh),
)
# Encode masks
if self.output_mode == "coco_rle":
mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
elif self.output_mode == "binary_mask":
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
else:
mask_data["segmentations"] = mask_data["rles"]
# Write mask records
curr_anns = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
curr_anns.append(ann)
return curr_anns
def _generate_masks(self, image: np.ndarray) -> MaskData:
orig_size = image.shape[-2:]
crop_boxes, layer_idxs = generate_crop_boxes(
orig_size, self.crop_n_layers, self.crop_overlap_ratio
)
# Iterate over image crops
data = MaskData()
for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
data.cat(crop_data)
# Remove duplicate masks between crops
if len(crop_boxes) > 1:
# Prefer masks from smaller crops
scores = 1 / box_area(data["crop_boxes"])
scores = scores.to(data["boxes"].device)
keep_by_nms = batched_nms(
data["boxes"].float(),
scores,
torch.zeros_like(data["boxes"][:, 0]), # categories
iou_threshold=self.crop_nms_thresh,
)
data.filter(keep_by_nms)
data.to_numpy()
return data
def _process_crop(
self,
image: np.ndarray,
crop_box: List[int],
crop_layer_idx: int,
orig_size: Tuple[int, ...],
) -> MaskData:
# Crop the image and calculate embeddings
x0, y0, x1, y1 = crop_box
cropped_im = image#[y0:y1, x0:x1, :]
cropped_im_size = cropped_im.shape[-2:]
# self.predictor.set_image(cropped_im)
# Get points for this crop
points_scale = np.array(cropped_im_size)[None, ::-1]
points_for_image = self.point_grids[crop_layer_idx] #* points_scale
# Generate masks for this crop in batches
data = MaskData()
self.enc_features=None
for (points,) in batch_iterator(self.points_per_batch, points_for_image):
batch_data = self._process_batch(cropped_im, points, cropped_im_size, crop_box, orig_size)
data.cat(batch_data)
del batch_data
# Remove duplicates within this crop.
keep_by_nms = batched_nms(
data["boxes"].float(),
data["iou_preds"],
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.box_nms_thresh,
)
data.filter(keep_by_nms)
# Return to the original image frame
data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
return data
def _process_batch(
self,
images,
points: np.ndarray,
im_size: Tuple[int, ...],
crop_box: List[int],
orig_size: Tuple[int, ...],
) -> MaskData:
orig_h, orig_w = orig_size
data = {"image": images, "height": orig_h, "width": orig_w}
points = torch.tensor(points,dtype=torch.float).to(images.device)
# prepare interactive mask for seem
abs_points = (points * torch.tensor(orig_size)[None,:].to(points.device)).long()
abs_masks = torch.zeros((len(points), orig_h, orig_w), dtype=torch.bool).to(device=points.device)
abs_masks[torch.arange(0, abs_points.size(0))[:,None], abs_points[:,0:1], abs_points[:,1:2]] = True
abs_masks = self.dilation(abs_masks[:,None].float())[:,0] > 0
data['spatial_query'] = {'rand_shape': abs_masks[:,None]}
batch_inputs = [data]
if self.enc_features is None:
masks, iou_preds, mask_features, transformer_encoder_features, multi_scale_features = self.predictor.model.evaluate_demo(batch_inputs, None, None, return_features=True)
self.enc_features = (mask_features, transformer_encoder_features, multi_scale_features)
else:
masks, iou_preds = self.predictor.model.evaluate_demo(batch_inputs, self.enc_features[0], self.enc_features[1], self.enc_features[2])
data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
del masks
# Filter by predicted IoU
if self.pred_iou_thresh > 0.0:
keep_mask = data["iou_preds"] > self.pred_iou_thresh
data.filter(keep_mask)
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], 0.0, self.stability_score_offset
)
if self.stability_score_thresh > 0.0:
keep_mask = data["stability_score"] >= self.stability_score_thresh
data.filter(keep_mask)
# Threshold masks and calculate boxes
data["masks"] = data["masks"] > 0.0
data["boxes"] = batched_mask_to_box(data["masks"])
# Filter boxes that touch crop boundaries
keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
if not torch.all(keep_mask):
data.filter(keep_mask)
# Compress to RLE
data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
data["rles"] = mask_to_rle_pytorch(data["masks"])
del data["masks"]
return data
@staticmethod
def postprocess_small_regions(
mask_data: MaskData, min_area: int, nms_thresh: float
) -> MaskData:
"""
Removes small disconnected regions and holes in masks, then reruns
box NMS to remove any new duplicates.
Edits mask_data in place.
Requires open-cv as a dependency.
"""
if len(mask_data["rles"]) == 0:
return mask_data
# Filter small disconnected regions and holes
new_masks = []
scores = []
for rle in mask_data["rles"]:
mask = rle_to_mask(rle)
mask, changed = remove_small_regions(mask, min_area, mode="holes")
unchanged = not changed
mask, changed = remove_small_regions(mask, min_area, mode="islands")
unchanged = unchanged and not changed
new_masks.append(torch.as_tensor(mask).unsqueeze(0))
# Give score=0 to changed masks and score=1 to unchanged masks
# so NMS will prefer ones that didn't need postprocessing
scores.append(float(unchanged))
# Recalculate boxes and remove any new duplicates
masks = torch.cat(new_masks, dim=0)
boxes = batched_mask_to_box(masks)
keep_by_nms = batched_nms(
boxes.float(),
torch.as_tensor(scores),
torch.zeros_like(boxes[:, 0]), # categories
iou_threshold=nms_thresh,
)
# Only recalculate RLEs for masks that have changed
for i_mask in keep_by_nms:
if scores[i_mask] == 0.0:
mask_torch = masks[i_mask].unsqueeze(0)
mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
mask_data.filter(keep_by_nms)
return mask_data
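
The generator above never inspects image content when choosing prompts: generate() walks a normalized point grid and feeds it to the model in fixed-size batches. A minimal sketch of that grid and batching machinery, assuming only that the segment_anything package is installed (no SEEM weights are needed for this part):

```python
from segment_anything.utils.amg import batch_iterator, build_all_layer_point_grids

# 32 points per side on the full image, no extra crop layers -> one grid of 1024 points
point_grids = build_all_layer_point_grids(32, 0, 1)
points_for_image = point_grids[0]          # (1024, 2), normalized to [0, 1]
print(points_for_image.shape, points_for_image.min(), points_for_image.max())

# The generator feeds these prompts to the model in chunks of points_per_batch
for (points,) in batch_iterator(64, points_for_image):
    print(points.shape)                    # (64, 2) per batch, 16 batches in total
    break
```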

View File

@@ -0,0 +1,169 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import torch.nn.functional as F
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def inference_seem_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
data = {"image": images, "height": orig_h, "width": orig_w}
spatial_masks = spatial_masks[:, None].float().cuda()
spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
data['spatial_query'] = {'rand_shape': spatial_masks}
model.model.metadata = metadata
masks, _ = model.model.evaluate_demo([data])
masks = masks > 0.0
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
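
After the model call, the function above converts raw boolean masks into the record format handed back to the caller: boxes, areas, and uncompressed RLEs. A minimal, self-contained sketch of that conversion on synthetic masks, assuming the segment_anything package is installed (field names mirror the records built above):

```python
import torch
from segment_anything.utils.amg import (
    area_from_rle, batched_mask_to_box, box_xyxy_to_xywh,
    mask_to_rle_pytorch, rle_to_mask,
)

# Three synthetic boolean masks standing in for model output on a 96x128 image.
masks = torch.zeros(3, 96, 128, dtype=torch.bool)
masks[0, 10:30, 20:60] = True
masks[1, 50:90, 5:40] = True
masks[2, 40:60, 80:120] = True

boxes = batched_mask_to_box(masks)              # XYXY boxes, one per mask
rles = mask_to_rle_pytorch(masks)               # uncompressed RLE dicts
records = [
    {
        "segmentation": rle_to_mask(rle),       # back to an HxW binary array
        "area": area_from_rle(rle),
        "bbox": box_xyxy_to_xywh(box).tolist(), # XYWH, as in the records above
    }
    for rle, box in zip(rles, boxes)
]
print([r["area"] for r in records], records[0]["bbox"])
```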

View File

@@ -0,0 +1,164 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def inference_seem_pano(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
data = {"image": images, "height": orig_h, "width": orig_w}
batch_inputs = [data]
model.model.metadata = metadata
outputs = model.model.evaluate(batch_inputs)
pano_mask = outputs[0]['panoptic_seg'][0]
pano_info = outputs[0]['panoptic_seg'][1]
masks = []
for seg_info in pano_info:
masks += [pano_mask == seg_info['id']]
masks = torch.stack(masks, dim=0)
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
# create a full zero image as the image_orig
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
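
The panoptic path above splits the single id map returned by evaluate() into one boolean mask per segment before reusing the same record-building code. A minimal sketch of that split on a toy id map (the 'id' field mirrors the panoptic metadata entries above; no model is involved):

```python
import torch

# A toy panoptic id map: 0 = background, 1 and 2 are two segments.
pano_mask = torch.zeros(64, 64, dtype=torch.long)
pano_mask[5:25, 5:25] = 1
pano_mask[30:60, 30:50] = 2
pano_info = [{"id": 1}, {"id": 2}]   # stand-in for the per-segment metadata above

masks = torch.stack([pano_mask == seg_info["id"] for seg_info in pano_info], dim=0)
print(masks.shape, masks.sum(dim=(1, 2)))   # (2, 64, 64) and the per-segment areas
```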

View File

@@ -0,0 +1,93 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_seem_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
mask_generator = SeemAutomaticMaskGenerator(model)
outputs = mask_generator.generate(images)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
for ann in sorted_anns:
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))

View File

@@ -0,0 +1,6 @@
from .interactive_idino_m2m import interactive_infer_image as interactive_infer_image_idino_m2m
from .interactive_idino_m2m import interactive_infer_image_semantic, interactive_infer_image_3l
from .inference_semsam_m2m_auto import inference_semsam_m2m_auto
from .interactive_idino_1o1_box import interactive_infer_image_box as interactive_infer_image_idino_m2m_box
from .automatic_mask_generator import prompt_switch
from .interactive_predictor import SemanticSAMPredictor

View File

@@ -0,0 +1,393 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from torchvision.ops.boxes import batched_nms, box_area # type: ignore
from typing import Any, Dict, List, Optional, Tuple
# from
# from .modeling import Sam
# from .predictor import SamPredictor
from semantic_sam.utils.sam_utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def prompt_switch(p):
p = int(p)
if p == 1:
return 3
if p == 2:
return 2
if p == 3:
return 0
if p == 4:
return 4
if p == 5:
return 1
if p == 6:
return 5
else:
raise NotImplementedError
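# The mapping above translates the user-facing granularity levels 1..6 into the internal
# level indices passed to model.evaluate_demo, i.e. [1, 2, 3, 4, 5, 6] -> [3, 2, 0, 4, 1, 5].
# SemanticSamAutomaticMaskGenerator applies it once per requested level
# (self.level = [prompt_switch(l) for l in level]); any other value raises NotImplementedError.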
class SemanticSamAutomaticMaskGenerator:
def __init__(
self,
model,
points_per_side: Optional[int] = 32,
points_per_batch: int = 200,
pred_iou_thresh: float = 0.88,
stability_score_thresh: float = 0.92,
stability_score_offset: float = 1.0,
box_nms_thresh: float = 0.7,
crop_n_layers: int = 0,
crop_nms_thresh: float = 0.7,
crop_overlap_ratio: float = 512 / 1500,
crop_n_points_downscale_factor: int = 1,
point_grids: Optional[List[np.ndarray]] = None,
min_mask_region_area: int = 10,
output_mode: str = "binary_mask",
level: list = [1, 2, 3, 4, 5, 6],
) -> None:
"""
Using a SAM model, generates masks for the entire image.
Generates a grid of point prompts over the image, then filters
low quality and duplicate masks. The default settings are chosen
for SAM with a ViT-H backbone.
Arguments:
model (Sam): The SAM model to use for mask prediction.
points_per_side (int or None): The number of points to be sampled
along one side of the image. The total number of points is
points_per_side**2. If None, 'point_grids' must provide explicit
point sampling.
points_per_batch (int): Sets the number of points run simultaneously
by the model. Higher numbers may be faster but use more GPU memory.
pred_iou_thresh (float): A filtering threshold in [0,1], using the
model's predicted mask quality.
stability_score_thresh (float): A filtering threshold in [0,1], using
the stability of the mask under changes to the cutoff used to binarize
the model's mask predictions.
stability_score_offset (float): The amount to shift the cutoff when
calculating the stability score.
box_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks.
crop_n_layers (int): If >0, mask prediction will be run again on
crops of the image. Sets the number of layers to run, where each
layer has 2**i_layer number of image crops.
crop_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks between different crops.
crop_overlap_ratio (float): Sets the degree to which crops overlap.
In the first crop layer, crops will overlap by this fraction of
the image length. Later layers with more crops scale down this overlap.
crop_n_points_downscale_factor (int): The number of points-per-side
sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
point_grids (list(np.ndarray) or None): A list over explicit grids
of points used for sampling, normalized to [0,1]. The nth grid in the
list is used in the nth crop layer. Exclusive with points_per_side.
min_mask_region_area (int): If >0, postprocessing will be applied
to remove disconnected regions and holes in masks with area smaller
than min_mask_region_area. Requires opencv.
output_mode (str): The form masks are returned in. Can be 'binary_mask',
'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
For large resolutions, 'binary_mask' may consume large amounts of
memory.
"""
self.level = [prompt_switch(l) for l in level]
assert (points_per_side is None) != (
point_grids is None
), "Exactly one of points_per_side or point_grid must be provided."
if points_per_side is not None:
self.point_grids = build_all_layer_point_grids(
points_per_side,
crop_n_layers,
crop_n_points_downscale_factor,
)
elif point_grids is not None:
self.point_grids = point_grids
else:
raise ValueError("Can't have both points_per_side and point_grid be None.")
assert output_mode in [
"binary_mask",
"uncompressed_rle",
"coco_rle",
], f"Unknown output_mode {output_mode}."
if output_mode == "coco_rle":
from pycocotools import mask as mask_utils # type: ignore # noqa: F401
if min_mask_region_area > 0:
import cv2 # type: ignore # noqa: F401
self.predictor = model
self.points_per_batch = points_per_batch
self.pred_iou_thresh = pred_iou_thresh
self.stability_score_thresh = stability_score_thresh
self.stability_score_offset = stability_score_offset
self.box_nms_thresh = box_nms_thresh
self.crop_n_layers = crop_n_layers
self.crop_nms_thresh = crop_nms_thresh
self.crop_overlap_ratio = crop_overlap_ratio
self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
self.min_mask_region_area = min_mask_region_area
self.output_mode = output_mode
@torch.no_grad()
def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
"""
Generates masks for the given image.
Arguments:
image (np.ndarray): The image to generate masks for, in HWC uint8 format.
Returns:
list(dict(str, any)): A list over records for masks. Each record is
a dict containing the following keys:
segmentation (dict(str, any) or np.ndarray): The mask. If
output_mode='binary_mask', is an array of shape HW. Otherwise,
is a dictionary containing the RLE.
bbox (list(float)): The box around the mask, in XYWH format.
area (int): The area in pixels of the mask.
predicted_iou (float): The model's own prediction of the mask's
quality. This is filtered by the pred_iou_thresh parameter.
point_coords (list(list(float))): The point coordinates input
to the model to generate this mask.
stability_score (float): A measure of the mask's quality. This
is filtered on using the stability_score_thresh parameter.
crop_box (list(float)): The crop of the image used to generate
the mask, given in XYWH format.
"""
# Generate masks
mask_data = self._generate_masks(image)
# Filter small disconnected regions and holes in masks
if self.min_mask_region_area > 0:
mask_data = self.postprocess_small_regions(
mask_data,
self.min_mask_region_area,
max(self.box_nms_thresh, self.crop_nms_thresh),
)
# Encode masks
if self.output_mode == "coco_rle":
mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
elif self.output_mode == "binary_mask":
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
else:
mask_data["segmentations"] = mask_data["rles"]
# Write mask records
curr_anns = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
curr_anns.append(ann)
return curr_anns
def _generate_masks(self, image: np.ndarray) -> MaskData:
orig_size = image.shape[-2:]
crop_boxes, layer_idxs = generate_crop_boxes(
orig_size, self.crop_n_layers, self.crop_overlap_ratio
)
# Iterate over image crops
assert len(crop_boxes)==1
data = MaskData()
# import ipdb; ipdb.set_trace()
for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
data.cat(crop_data)
# import ipdb; ipdb.set_trace()
# Remove duplicate masks between crops
if len(crop_boxes) > 1:
# Prefer masks from smaller crops
scores = 1 / box_area(data["crop_boxes"])
scores = scores.to(data["boxes"].device)
keep_by_nms = batched_nms(
data["boxes"].float(),
scores,
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.crop_nms_thresh,
)
data.filter(keep_by_nms)
data.to_numpy()
return data
def _process_crop(
self,
image: np.ndarray,
crop_box: List[int],
crop_layer_idx: int,
orig_size: Tuple[int, ...],
) -> MaskData:
# Crop the image and calculate embeddings
x0, y0, x1, y1 = crop_box
cropped_im = image#[y0:y1, x0:x1, :]
cropped_im_size = cropped_im.shape[-2:]
# self.predictor.set_image(cropped_im)
# Get points for this crop
points_scale = np.array(cropped_im_size)[None, ::-1]
points_for_image = self.point_grids[crop_layer_idx] #* points_scale
# Generate masks for this crop in batches
data = MaskData()
self.enc_features=None
# import ipdb; ipdb.set_trace()
for (points,) in batch_iterator(self.points_per_batch, points_for_image):
batch_data = self._process_batch(cropped_im,points, cropped_im_size, crop_box, orig_size)
data.cat(batch_data)
del batch_data
keep_by_nms = batched_nms(
data["boxes"].float(),
data["iou_preds"],
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.box_nms_thresh,
)
# import ipdb; ipdb.set_trace()
data.filter(keep_by_nms)
# import ipdb; ipdb.set_trace()
# Return to the original image frame
data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
return data
def _process_batch(
self,
images,
points: np.ndarray,
im_size: Tuple[int, ...],
crop_box: List[int],
orig_size: Tuple[int, ...],
) -> MaskData:
orig_h, orig_w = orig_size
data = {"image": images, "height": orig_h, "width": orig_w}
points=torch.tensor(points,dtype=torch.float).to(images.device)
points = torch.cat([points, points.new_tensor([[0.005, 0.005]]).repeat(len(points), 1)], dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=points
data['targets'][0]['pb']=points.new_tensor([0.]*len(points))
batch_inputs = [data]
if self.enc_features is None:
masks, iou_preds,mask_features,multi_scale_features= self.predictor.model.evaluate_demo(batch_inputs,None,None,return_features=True, level=self.level)
self.enc_features=(mask_features,multi_scale_features)
else:
masks, iou_preds= self.predictor.model.evaluate_demo(batch_inputs,None,None,self.enc_features[0],self.enc_features[1], level=self.level)
data = MaskData(
masks=masks,
iou_preds=iou_preds.flatten(),
points=torch.as_tensor(points[:,None].repeat(1,len(self.level), 1).view(-1,4)),
)
del masks
# Filter by predicted IoU
keep_mask = data["iou_preds"] > self.pred_iou_thresh
data.filter(keep_mask)
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], 0.0, self.stability_score_offset
)
# if self.stability_score_thresh > 0.0:
keep_mask = data["stability_score"] >= self.stability_score_thresh
data.filter(keep_mask)
# Threshold masks and calculate boxes
data["masks"] = data["masks"] > 0.0
data["boxes"] = batched_mask_to_box(data["masks"])
# Filter boxes that touch crop boundaries
keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
if not torch.all(keep_mask):
data.filter(keep_mask)
# Compress to RLE
data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
data["rles"] = mask_to_rle_pytorch(data["masks"])
del data["masks"]
return data
@staticmethod
def postprocess_small_regions(
mask_data: MaskData, min_area: int, nms_thresh: float
) -> MaskData:
"""
Removes small disconnected regions and holes in masks, then reruns
box NMS to remove any new duplicates.
Edits mask_data in place.
Requires open-cv as a dependency.
"""
if len(mask_data["rles"]) == 0:
return mask_data
# Filter small disconnected regions and holes
new_masks = []
scores = []
for rle in mask_data["rles"]:
mask = rle_to_mask(rle)
mask, changed = remove_small_regions(mask, min_area, mode="holes")
unchanged = not changed
mask, changed = remove_small_regions(mask, min_area, mode="islands")
unchanged = unchanged and not changed
new_masks.append(torch.as_tensor(mask).unsqueeze(0))
# Give score=0 to changed masks and score=1 to unchanged masks
# so NMS will prefer ones that didn't need postprocessing
scores.append(float(unchanged))
# Recalculate boxes and remove any new duplicates
masks = torch.cat(new_masks, dim=0)
boxes = batched_mask_to_box(masks)
keep_by_nms = batched_nms(
boxes.float(),
torch.as_tensor(scores),
torch.zeros(len(boxes)), # categories
iou_threshold=nms_thresh,
)
# Only recalculate RLEs for masks that have changed
for i_mask in keep_by_nms:
if scores[i_mask] == 0.0:
mask_torch = masks[i_mask].unsqueeze(0)
mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
mask_data.filter(keep_by_nms)
return mask_data
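
Within each crop, duplicate proposals are removed with class-agnostic box NMS: every box is given category 0, so the IoU threshold is the only criterion. A minimal sketch of that deduplication with made-up boxes, assuming torch and torchvision are installed:

```python
import torch
from torchvision.ops.boxes import batched_nms

# Three candidate boxes; the first two overlap heavily and should be deduplicated.
boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 11., 52., 49.],
                      [60., 60., 90., 90.]])
iou_preds = torch.tensor([0.95, 0.80, 0.90])

keep = batched_nms(
    boxes.float(),
    iou_preds,
    torch.zeros(len(boxes)),   # a single category, exactly as in _process_crop
    iou_threshold=0.7,
)
print(keep)   # indices of the surviving boxes, e.g. tensor([0, 2])
```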

View File

@@ -0,0 +1,108 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SemanticSamAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def inference_semsam_m2m_auto(model, image, level, all_classes, all_parts, thresh, text_size, hole_scale, island_scale, semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
mask_generator = SemanticSamAutomaticMaskGenerator(model,points_per_side=32,
pred_iou_thresh=0.88,
stability_score_thresh=0.92,
min_mask_region_area=10,
level=level,
)
outputs = mask_generator.generate(images)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
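
The visualization loop above sorts annotations by area (largest first) and burns sequential labels into a label map, so smaller masks overwrite larger ones and stay visible. A minimal sketch of that bookkeeping on synthetic annotations, simplified to a single-channel label map (the original writes into an image-shaped uint8 array):

```python
import numpy as np

# Two synthetic annotations standing in for mask_generator.generate() output.
h, w = 64, 64
seg_a = np.zeros((h, w), dtype=bool); seg_a[5:60, 5:60] = True     # large mask
seg_b = np.zeros((h, w), dtype=bool); seg_b[10:20, 10:20] = True   # small mask
outputs = [{"segmentation": seg_b, "area": int(seg_b.sum())},
           {"segmentation": seg_a, "area": int(seg_a.sum())}]

# Largest mask first, so later (smaller) masks overwrite it in the label map.
sorted_anns = sorted(outputs, key=(lambda x: x["area"]), reverse=True)
mask_map = np.zeros((h, w), dtype=np.uint8)
for label, ann in enumerate(sorted_anns, start=1):
    mask_map[ann["segmentation"]] = label
print(np.unique(mask_map))   # [0 1 2]: background plus the two numbered regions
```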

View File

@@ -0,0 +1,144 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.structures import BitMasks
from semantic_sam.utils import box_ops
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_infer_image_box(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
flaten_mask = mask_ori.unsqueeze(0)
# import ipdb; ipdb.set_trace()
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.5,0.5]])
else:
mean_point=points.mean(0)[None]
box_xyxy = BitMasks(flaten_mask > 0).get_bounding_boxes().tensor
h = mask_ori.shape[0]
w = mask_ori.shape[1]
box_xywh = (box_ops.box_xyxy_to_cxcywh(box_xyxy) / torch.as_tensor([w, h, w, h])).cuda()
# point_=points.mean(0)[None]
# point=point_.clone()
# point[0, 0] = point_[0, 0] / mask_ori.shape[0]
# point[0, 1] = point_[0, 1] / mask_ori.shape[1]
# point = point[:, [1, 0]]
point=box_xywh
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([1.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, task='demo_box')
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
mask = mask.astype(float)
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
demo = visual.draw_box(box_xyxy[0])
res = demo.get_image()
# point_x0=max(0,int(point_[0, 1])-3)
# point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
# point_y0 = max(0, int(point_[0, 0]) - 3)
# point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
# res[point_y0:point_y1,point_x0:point_x1,0]=255
# res[point_y0:point_y1,point_x0:point_x1,1]=0
# res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
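
The box-prompt path above derives one bounding box from the user's brush mask (via detectron2's BitMasks) and normalizes it to center-x/center-y/width/height before passing it to the model. A dependency-light sketch of the same geometry in plain torch; BitMasks and semantic_sam's box_ops are replaced by explicit arithmetic purely for illustration, so edge conventions may differ by a pixel:

```python
import torch

h, w = 120, 160
mask = torch.zeros(h, w, dtype=torch.bool)
mask[30:70, 40:100] = True                  # a synthetic brush stroke

ys, xs = mask.nonzero(as_tuple=True)
box_xyxy = torch.tensor([xs.min(), ys.min(), xs.max(), ys.max()], dtype=torch.float)

# xyxy -> normalized cxcywh, mirroring box_ops.box_xyxy_to_cxcywh(...) / [w, h, w, h]
cx = (box_xyxy[0] + box_xyxy[2]) / 2 / w
cy = (box_xyxy[1] + box_xyxy[3]) / 2 / h
bw = (box_xyxy[2] - box_xyxy[0]) / w
bh = (box_xyxy[3] - box_xyxy[1]) / h
box_cxcywh = torch.stack([cx, cy, bw, bh])
print(box_cxcywh)                           # roughly [0.43, 0.41, 0.37, 0.33]
```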

View File

@@ -0,0 +1,322 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_infer_image(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([0.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts)
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
mask = mask.astype(float)
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
# demo = visual.draw_binary_mask(mask, color=color, text=texts)
demo = visual.draw_binary_mask_with_number(mask, text=str(len(mask_ls)), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)  # number each kept mask sequentially
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
# res[point_y0:point_y1,point_x0:point_x1,0]=255
# res[point_y0:point_y1,point_x0:point_x1,1]=0
# res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def interactive_infer_image_3l(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([0.])
batch_inputs = [data]
masks, ious, pred_class, pred_class_score = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, level=[0,1,2])
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
new_pred_class = []
new_pred_class_score = []
for i in ids:
new_pred_class_score.append(pred_class_score[i])
new_pred_class.append(pred_class[i])
for i,(pred_masks_pos,iou, cls_name, cls_score) in enumerate(zip(pred_masks_poses[ids],ious[ids], new_pred_class, new_pred_class_score)):
iou=round(float(iou),2)
texts=f'{iou}_{cls_name}_{cls_score}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
        mask=mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
res[point_y0:point_y1,point_x0:point_x1,0]=255
res[point_y0:point_y1,point_x0:point_x1,1]=0
res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def interactive_infer_image_semantic(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
    data['targets'][0]['pb']=point.new_tensor([1.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts)
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
        mask=mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
res[point_y0:point_y1,point_x0:point_x1,0]=255
res[point_y0:point_y1,point_x0:point_x1,1]=0
res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
    Removes small disconnected regions and holes in a mask. Returns the
    cleaned mask and a flag indicating whether the mask was modified; a short
    usage sketch follows this function.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
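
# Minimal usage sketch: remove_small_regions is applied twice, first with
# mode="holes" to fill small holes and then with mode="islands" to drop small
# isolated blobs, mirroring the calls in the inference loops above. The toy mask
# and the 100-pixel area threshold below are illustrative values only.
if __name__ == "__main__":
    _toy = np.zeros((256, 256), dtype=bool)
    _toy[64:192, 64:192] = True      # one large foreground region
    _toy[100:104, 100:104] = False   # a small hole inside it
    _toy[4:8, 4:8] = True            # a small isolated island
    _toy, _ = remove_small_regions(_toy, 100, mode="holes")    # fills the small hole
    _toy, _ = remove_small_regions(_toy, 100, mode="islands")  # removes the small island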

View File

@@ -0,0 +1,139 @@
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
class SemanticSAMPredictor:
def __init__(self, model, thresh=0.5, text_size=640, hole_scale=100, island_scale=100):
"""
        thresh: IoU threshold used to filter out low-confidence masks
        text_size: the input image's short edge is resized to this size before inference
        hole_scale: holes smaller than this area (in pixels) are filled in, as in SAM
        island_scale: isolated regions smaller than this area (in pixels) are removed, as in SAM
        (a usage sketch follows at the end of this file)
"""
self.model = model
self.thresh = thresh
        self.text_size = text_size
self.hole_scale = hole_scale
self.island_scale = island_scale
self.point = None
def predict(self, image_ori, image, point=None):
"""
produce up to 6 prediction results for each click
"""
width = image_ori.shape[0]
height = image_ori.shape[1]
data = {"image": image, "height": height, "width": width}
if point is None:
point = torch.tensor([[0.5, 0.5, 0.006, 0.006]]).cuda()
else:
point = torch.tensor(point).cuda()
point_ = point
point = point_.clone()
point[0, 0] = point_[0, 0]
point[0, 1] = point_[0, 1]
# point = point[:, [1, 0]]
point = torch.cat([point, point.new_tensor([[0.005, 0.005]])], dim=-1)
self.point = point[:, :2].clone()*(torch.tensor([width, height]).to(point))
data['targets'] = [dict()]
data['targets'][0]['points'] = point
data['targets'][0]['pb'] = point.new_tensor([0.])
batch_inputs = [data]
masks, ious = self.model.model.evaluate_demo(batch_inputs)
return masks, ious
def process_multi_mask(self, masks, ious, image_ori):
pred_masks_poses = masks
reses = []
ious = ious[0, 0]
ids = torch.argsort(ious, descending=True)
text_res = ''
mask_ls = []
ious_res = []
areas = []
for i, (pred_masks_pos, iou) in enumerate(zip(pred_masks_poses[ids], ious[ids])):
iou = round(float(iou), 2)
texts = f'{iou}'
mask = (pred_masks_pos > 0.0).cpu().numpy()
area = mask.sum()
conti = False
if iou < self.thresh:
conti = True
for m in mask_ls:
if np.logical_and(mask, m).sum() / np.logical_or(mask, m).sum() > 0.95:
conti = True
break
if i == len(pred_masks_poses[ids]) - 1 and mask_ls == []:
conti = False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask, _ = self.remove_small_regions(mask, int(self.hole_scale), mode="holes")
mask, _ = self.remove_small_regions(mask, int(self.island_scale), mode="islands")
            mask = mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color = [0., 0., 1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0 = max(0, int(self.point[0, 0]) - 3)
point_x1 = min(image_ori.shape[1], int(self.point[0, 0]) + 3)
point_y0 = max(0, int(self.point[0, 1]) - 3)
point_y1 = min(image_ori.shape[0], int(self.point[0, 1]) + 3)
res[point_y0:point_y1, point_x0:point_x1, 0] = 255
res[point_y0:point_y1, point_x0:point_x1, 1] = 0
res[point_y0:point_y1, point_x0:point_x1, 2] = 0
reses.append(Image.fromarray(res))
text_res = text_res + ';' + out_txt
ids = list(torch.argsort(torch.tensor(areas), descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses, [reses[i] for i in ids]
def predict_masks(self, image_ori, image, point=None):
masks, ious = self.predict(image_ori, image, point)
return self.process_multi_mask(masks, ious, image_ori)
@staticmethod
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
        Removes small disconnected regions and holes in a mask. Returns the
        cleaned mask and a flag indicating whether the mask was modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
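
# Usage sketch, assuming a loader for the wrapped model: `build_semantic_sam` is a
# hypothetical name for whatever constructs the model passed to SemanticSAMPredictor;
# `image_ori` is the original HxWx3 uint8 array, `image` the matching CHW CUDA tensor,
# and the click prompt is a normalized (x, y, w, h) point as expected by predict().
#
#   model = build_semantic_sam(checkpoint="semantic_sam.pth")   # hypothetical helper
#   predictor = SemanticSAMPredictor(model, thresh=0.5, text_size=640)
#   image_ori = np.asarray(Image.open("example.jpg").convert("RGB"))
#   image = torch.from_numpy(image_ori.copy()).permute(2, 0, 1).cuda()
#   overlays, overlays_by_area = predictor.predict_masks(
#       image_ori, image, point=[[0.4, 0.6, 0.006, 0.006]]
#   )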

File diff suppressed because it is too large