Merge branch 'main' into zdy

David Chang
2024-01-02 21:40:54 +08:00
63 changed files with 7664 additions and 151 deletions

View File

@@ -78,7 +78,8 @@ class DesktopEnv(gym.Env):
self.evaluator = task_config["evaluator"]
self.metric: Metric = getattr(metrics, self.evaluator["func"])
self.result_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(
self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
self.metric_options: Dict[str, Any] = self.evaluator.get("options", {})
# Initialize emulator and controller
@@ -95,7 +96,7 @@ class DesktopEnv(gym.Env):
# episodic stuffs, like tmp dir and counters, will be updated or reset
# when calling self.reset()
self.tmp_dir: str = self.tmp_dir_base # just an init value, updated during reset
self.tmp_dir: str = self.tmp_dir_base # just an init value, updated during reset
self._traj_no: int = -1
self._step_no: int = 0
self.action_history: List[Dict[str, any]] = []
@@ -165,7 +166,8 @@ class DesktopEnv(gym.Env):
self.evaluator = task_config["evaluator"]
self.metric: Metric = getattr(metrics, self.evaluator["func"])
self.result_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["result"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(self.evaluator["expected"]["type"]))
self.expected_getter: Getter = getattr(getters, "get_{:}".format(
self.evaluator["expected"]["type"])) if "expected" in self.evaluator else None
self.metric_options = self.evaluator.get("options", {})
self.setup_controller.reset_cache_dir(self.cache_dir)
@@ -226,20 +228,12 @@ class DesktopEnv(gym.Env):
"""
Evaluate whether the task is successfully completed.
"""
result_state = self.result_getter(self, self.evaluator["result"])
expected_state = self.expected_getter(self, self.evaluator["expected"]) if "expected" in self.evaluator \
else None
# todo: make this more flexible by refactoring
# eval_func = eval_funcs[self.evaluator["func"]]
# eval_func_vars = {}
#
# for var_name, file_info in self.evaluator["paths"].items():
# path = copy_file_to_local(file_info)
# eval_func_vars[var_name] = path
#
# return eval_func(**eval_func_vars)
result = self.result_getter(self, self.evaluator["result"])
expected = self.expected_getter(self, self.evaluator["expected"])
metric: float = self.metric(result, expected, **self.metric_options)
metric: float = self.metric(result_state, expected_state, **self.metric_options) if expected_state is not None \
else self.metric(result_state, **self.metric_options)
return metric

View File

@@ -41,3 +41,4 @@ def get_vm_file(env, config: Dict[str, str]) -> str:
f.write(file)
return _path

View File

@@ -5,4 +5,4 @@ def get_rule(env, config: R) -> R:
"""
Returns the rule as-is.
"""
return config
return config["rules"]

View File

@@ -0,0 +1,138 @@
# Setup Instructions
## LibreOffice Writer
### Setting Up the python-docx Library
```shell
pip install python-docx
```
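As a quick sanity check after installing (a minimal sketch; `example.docx` is just a placeholder file name), you can open a document and print its paragraphs:
```python
from docx import Document

# Open an existing .docx file and print the text of each paragraph.
doc = Document("example.docx")
for paragraph in doc.paragraphs:
    print(paragraph.text)
```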
## Chrome
### Starting Chrome with Remote Debugging for Python
To enable remote debugging in Chrome, which allows tools like Playwright for Python to connect to and control an existing Chrome instance, follow these steps:
#### Manually Enabling Remote Debugging in Chrome
1. **Locate the Chrome Shortcut**:
- Find the Chrome shortcut that you usually use to open the browser. This could be on your desktop, start menu, or taskbar.
2. **Edit Shortcut Properties**:
- Right-click on the Chrome shortcut and select `Properties`.
3. **Modify the Target Field**:
- In the `Target` field, add `--remote-debugging-port=9222` at the end of the path. Ensure there is a space between the path and the flag you add.
- It should look something like this: `"C:\Path\To\Chrome.exe" --remote-debugging-port=9222`.
4. **Apply and Close**:
- Click `Apply` and then `OK` to close the dialog.
5. **Start Chrome**:
- Use this modified shortcut to start Chrome. Chrome will now start with remote debugging enabled on port 9222.
6. **Confirm Remote Debugging**:
- Open a browser and navigate to `http://localhost:9222`. If you see a webpage with information about active tabs, remote debugging is working; see the sketch below for a programmatic check.
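For that programmatic check (a minimal sketch, assuming the `requests` package is installed and Chrome is listening on port 9222), you can query the DevTools HTTP endpoint directly:
```python
import requests

# Chrome started with --remote-debugging-port=9222 serves DevTools metadata
# over HTTP; /json/version includes the browser build and the WebSocket URL.
response = requests.get("http://localhost:9222/json/version", timeout=5)
response.raise_for_status()
print(response.json())
```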
---
### Setting Up Playwright for Python
Playwright for Python is a browser automation library to control Chromium, Firefox, and WebKit with a single API.
#### Installing Playwright
- Ensure you have Python installed on your system. If not, download and install it from the [Python official website](https://www.python.org/).
- Install Playwright using pip (Python Package Installer). Open a command line or terminal and run:
```bash
pip install playwright
```
- After installing Playwright, you need to run the install command to download the necessary browser binaries:
```bash
playwright install
```
#### Writing a Playwright Script in Python
- Create a Python file for your automation script.
- Import the Playwright module at the beginning of your script:
```python
from playwright.sync_api import sync_playwright
```
- You can now use Playwright's API to control browsers.
#### Example Playwright Script
Here is a simple example to open a page using Playwright:
```python
from playwright.sync_api import sync_playwright

def run(playwright):
    browser = playwright.chromium.launch()
    page = browser.new_page()
    page.goto("http://example.com")
    # other actions...
    browser.close()

with sync_playwright() as playwright:
    run(playwright)
```
- This script launches Chromium, opens a new page, navigates to `example.com`, and then closes the browser.
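Instead of launching a new browser, Playwright can also attach to the Chrome instance started with remote debugging in the earlier section. A minimal sketch (assuming Chrome is already listening on port 9222):
```python
from playwright.sync_api import sync_playwright

# Attach to the running Chrome via the Chrome DevTools Protocol and list
# the title and URL of every open page.
with sync_playwright() as p:
    browser = p.chromium.connect_over_cdp("http://localhost:9222")
    for context in browser.contexts:
        for page in context.pages:
            print(page.title(), page.url)
    browser.close()
```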
#### Troubleshooting
- If you encounter issues with Playwright, ensure that your Python environment is correctly set up and that you have installed Playwright and its dependencies correctly.
- For detailed documentation, visit the [Playwright for Python Documentation](https://playwright.dev/python/docs/intro).
## VLC Media Player
### Setting Up VLC's HTTP Interface
To enable and use the HTTP interface in VLC Media Player for remote control and status checks, follow these steps:
#### 1. Open VLC Preferences
- Open VLC Media Player.
- Go to `Tools` > `Preferences` from the menu.
#### 2. Show All Settings
- In the Preferences window, at the bottom left corner, select `All` under `Show settings` to display advanced settings.
#### 3. Enable Main Interfaces
- In the advanced preferences, expand the `Interface` section.
- Click on `Main interfaces`.
- Check the box for `Web` to enable the HTTP interface.
#### 4. Configure Lua HTTP
- Expand the `Main interfaces` node and select `Lua`.
- In the `Lua HTTP` section, set a password. This password will be required to access the HTTP interface.
#### 5. Save and Restart VLC
- Click `Save` to apply the changes.
- Restart VLC Media Player for the changes to take effect.
#### 6. Accessing the HTTP Interface
- Open a web browser and go to `http://localhost:8080`.
- You will be prompted for a password. Enter the password you set in the Lua HTTP settings.
- Once logged in, you will have access to VLC's HTTP interface for remote control; a programmatic status check is sketched below.
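The following is a minimal sketch of that check (the `requests` package and the placeholder password `password` are assumptions), reading the player state from `status.xml`:
```python
import requests
from xml.etree import ElementTree

# VLC's Lua HTTP interface authenticates with an empty username and the
# Lua HTTP password; status.xml reports the current player state.
response = requests.get("http://localhost:8080/requests/status.xml",
                        auth=("", "password"))  # replace "password" with your Lua HTTP password
response.raise_for_status()
state = ElementTree.fromstring(response.content).find("state").text
print(f"VLC state: {state}")
```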
#### Troubleshooting
- If you cannot access the HTTP interface, check if your firewall or security software is blocking the connection.
- Ensure VLC is running and the correct port (default is 8080) is being used.
- If the port is in use by another application, you may change the port number in VLC's settings.

View File

@@ -1,5 +1,7 @@
from .table import compare_table
from .table import check_sheet_list, check_xlsx_freeze, check_xlsx_zoom
from .docs import find_default_font, contains_page_break, compare_docx_files
from .docs import find_default_font, contains_page_break, compare_docx_files, compare_docx_tables, compare_line_spacing, compare_insert_equation
from .docs import compare_font_names, compare_subscript_contains, has_page_numbers_in_footers
from .docs import is_first_line_centered, check_file_exists, compare_contains_image
from .pdf import check_pdf_pages
from .libreoffice import check_libre_locale

View File

@@ -3,6 +3,8 @@ import os
import platform
import sqlite3
from playwright.sync_api import sync_playwright
"""
WARNING:
1. Functions from this script assume that no account is registered on Chrome, otherwise the default file path needs to be changed.
@@ -12,6 +14,7 @@ WARNING:
# todo: move to getter module
# The following ones just need to load info from the files of software, no need to connect to the software
def get_default_search_engine():
if platform.system() == 'Windows':
preference_file_path = os.path.join(os.getenv('LOCALAPPDATA'),
@@ -19,8 +22,10 @@ def get_default_search_engine():
elif platform.system() == 'Darwin':
preference_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Preferences')
else:
elif platform.system() == 'Linux':
preference_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Preferences')
else:
raise Exception('Unsupported operating system')
try:
with open(preference_file_path, 'r', encoding='utf-8') as file:
@@ -41,8 +46,10 @@ def get_cookie_data():
elif platform.system() == 'Darwin':
chrome_cookie_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Cookies')
else:
elif platform.system() == 'Linux':
chrome_cookie_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Cookies')
else:
raise Exception('Unsupported operating system')
try:
conn = sqlite3.connect(chrome_cookie_file_path)
@@ -65,8 +72,10 @@ def get_bookmarks():
elif platform.system() == 'Darwin':
preference_file_path = os.path.join(os.getenv('HOME'),
'Library/Application Support/Google/Chrome/Default/Bookmarks')
else:
elif platform.system() == 'Linux':
preference_file_path = os.path.join(os.getenv('HOME'), '.config/google-chrome/Default/Bookmarks')
else:
raise Exception('Unsupported operating system')
try:
with open(preference_file_path, 'r', encoding='utf-8') as file:
@@ -78,3 +87,75 @@ def get_bookmarks():
except Exception as e:
print(f"Error: {e}")
return None
def get_extensions_installed_from_shop():
"""Find the Chrome extensions directory based on the operating system."""
os_name = platform.system()
if os_name == 'Windows':
chrome_extension_dir = os.path.expanduser(
'~') + '\\AppData\\Local\\Google\\Chrome\\User Data\\Default\\Extensions\\'
elif os_name == 'Darwin': # macOS
chrome_extension_dir = os.path.expanduser(
'~') + '/Library/Application Support/Google/Chrome/Default/Extensions/'
elif os_name == 'Linux':
chrome_extension_dir = os.path.expanduser('~') + '/.config/google-chrome/Default/Extensions/'
else:
raise Exception('Unsupported operating system')
manifests = []
for extension_id in os.listdir(chrome_extension_dir):
extension_path = os.path.join(chrome_extension_dir, extension_id)
if os.path.isdir(extension_path):
# Iterate through version-named subdirectories
for version_dir in os.listdir(extension_path):
version_path = os.path.join(extension_path, version_dir)
manifest_path = os.path.join(version_path, 'manifest.json')
if os.path.isfile(manifest_path):
with open(manifest_path, 'r') as file:
try:
manifest = json.load(file)
manifests.append(manifest)
except json.JSONDecodeError:
print(f"Error reading {manifest_path}")
return manifests
# The following ones require Playwright to be installed on the target machine, and Chrome needs to be pre-configured with a remote debugging port to allow remote debugging; see README.md for details
def get_open_tabs_info(remote_debugging_url):
with sync_playwright() as p:
# connect to remote Chrome instance
browser = p.chromium.connect_over_cdp(remote_debugging_url)
tabs_info = []
for context in browser.contexts:
for page in context.pages:
title = page.title()
url = page.url
tabs_info.append({'title': title, 'url': url})
browser.close()
return tabs_info
def get_active_tab_info(remote_debugging_url):
with sync_playwright() as p:
# connect to remote Chrome instance
browser = p.chromium.connect_over_cdp(remote_debugging_url)
active_tab_info = {}
for context in browser.contexts:
for page in context.pages:
if page.is_visible("body"): # check the visibility of the page body to determine the active status
active_tab_info = {
'title': page.title(),
'url': page.url,
'content': page.content() # get the HTML content of the page
}
break
if active_tab_info:
break
browser.close()
return active_tab_info

View File

@@ -1,15 +1,20 @@
import xml.etree.ElementTree as ET
import os
from typing import List, Dict, Any
from docx import Document
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT
def find_default_font(expected, config_file_path):
def find_default_font(config_file_path, rules):
"""Find the default font in LibreOffice Writer."""
default_font = None
expected_font = rules["font_name"]
try:
tree = ET.parse(config_file_path)
root = tree.getroot()
# Define the XML namespace used in the file
# Define the XML namespace used in the file
namespace = {'oor': 'http://openoffice.org/2001/registry'}
# Search for the node containing the default font setting for LibreOffice Writer
@@ -19,24 +24,26 @@ def find_default_font(expected, config_file_path):
default_font = value.text
except Exception as e:
print(f"Error: {e}")
return 1 if default_font == expected else 0
return 1 if default_font == expected_font else 0
def contains_page_break(docx_file):
doc = Document(docx_file)
namespaces = {'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'}
for paragraph in doc.paragraphs:
for run in paragraph.runs:
br_elems = run.element.findall('.//w:br', namespaces)
for br in br_elems:
if br is not None and '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type' in br.attrib and br.attrib['{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type'] == 'page':
if br is not None and '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type' in br.attrib and \
br.attrib['{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type'] == 'page':
return 1
return 0
def compare_docx_files(file1, file2):
def compare_docx_files(file1, file2):
doc1 = Document(file1)
doc2 = Document(file2)
@@ -53,6 +60,129 @@ def compare_docx_files(file1, file2):
return 1
def compare_docx_tables(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
# get list of tables in docx
tables1 = doc1.tables
tables2 = doc2.tables
if len(tables1) != len(tables2):
return 0
# Compare each table content
for table1, table2 in zip(tables1, tables2):
if len(table1.rows) != len(table2.rows) or len(table1.columns) != len(table2.columns):
return 0
# Compare each cell
for i in range(len(table1.rows)):
for j in range(len(table1.columns)):
if table1.cell(i, j).text != table2.cell(i, j).text:
return 0
return 1
def compare_line_spacing(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
if len(doc1.paragraphs) != len(doc2.paragraphs):
return 0
# Compare each paragraph line spacing
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
spacing1 = para1.paragraph_format.line_spacing
spacing2 = para2.paragraph_format.line_spacing
if spacing1 != spacing2:
return 0
return 1
def compare_insert_equation(docx_file1, docx_file2):
if not compare_docx_files(docx_file1, docx_file2):
return 0
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
# Compare each paragraph if it contains equation
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
if run1.element.xpath('.//w:object') and run2.element.xpath('.//w:object'):
return 1
return 0
def compare_font_names(docx_file, rules: Dict[str, Any]):
doc = Document(docx_file)
expected_font = rules["font_name"]
for paragraph in doc.paragraphs:
for run in paragraph.runs:
font_name = run.font.name
if font_name != expected_font:
return 0
return 1
def compare_subscript_contains(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
# check if two paras both contain subscript
if run1.font.subscript and run2.font.subscript:
return 1
return 0
def has_page_numbers_in_footers(docx_file):
doc = Document(docx_file)
for section in doc.sections:
footer = section.footer
if footer is None:
return 0
footer_text = footer.paragraphs[0].text if footer.paragraphs else ''
if not any(char.isdigit() for char in footer_text):
# if no digit in footer, then no page number
return 0
return 1
def is_first_line_centered(docx_file):
doc = Document(docx_file)
first_paragraph = doc.paragraphs[0]
# check if the first line is center justified
return 1 if first_paragraph.paragraph_format.alignment == WD_PARAGRAPH_ALIGNMENT.CENTER else 0
def check_file_exists(directory, filename):
file_path = os.path.join(directory, filename)
return 1 if os.path.isfile(file_path) else 0
def compare_contains_image(docx_file1, docx_file2):
doc1 = Document(docx_file1)
doc2 = Document(docx_file2)
for para1, para2 in zip(doc1.paragraphs, doc2.paragraphs):
for run1, run2 in zip(para1.runs, para2.runs):
if ('graphicData' in run1._element.xml and 'graphicData' not in run2._element.xml) or (
'graphicData' not in run1._element.xml and 'graphicData' in run2._element.xml):
return 0
return 1
# file1 = 'path/to/file1.docx'
# file2 = 'path/to/file2.docx'
@@ -60,6 +190,6 @@ def compare_docx_files(file1, file2):
# Replace 'your_document.docx' with the path to your document
# result = contains_page_break('your_document.docx')
# print(result)
#config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
#print(find_default_font("Ani", config_path))
# config_path = "/home/[username]/.config/libreoffice/4/user/registrymodifications.xcu"
# print(find_default_font("Ani", config_path))

View File

@@ -1,5 +1,7 @@
import os
import platform
import subprocess
import ctypes
import os
# todo: move to getter module
@@ -13,3 +15,43 @@ def get_desktop_path():
return os.path.join("/home", username, "Desktop")
else:
raise Exception("Unsupported operating system")
def get_wallpaper():
def get_wallpaper_windows():
SPI_GETDESKWALLPAPER = 0x73
MAX_PATH = 260
buffer = ctypes.create_unicode_buffer(MAX_PATH)
ctypes.windll.user32.SystemParametersInfoW(SPI_GETDESKWALLPAPER, MAX_PATH, buffer, 0)
return buffer.value
def get_wallpaper_macos():
script = """
tell application "System Events" to tell every desktop to get picture
"""
process = subprocess.Popen(['osascript', '-e', script], stdout=subprocess.PIPE)
output, error = process.communicate()
if error:
print("Error:", error)
else:
return output.strip().decode('utf-8')
def get_wallpaper_linux():
try:
output = subprocess.check_output(["gsettings", "get", "org.gnome.desktop.background", "picture-uri"])
return output.decode('utf-8').strip().replace('file://', '').replace("'", "")
except Exception as e:
print("Error:", e)
return None
os_name = platform.system()
if os_name == 'Windows':
return get_wallpaper_windows()
elif os_name == 'Darwin':
return get_wallpaper_macos()
elif os_name == 'Linux':
return get_wallpaper_linux()
else:
return "Unsupported OS"

View File

@@ -0,0 +1,87 @@
import os
import platform
import requests
from xml.etree import ElementTree
import pygetwindow as gw
import pyautogui
def read_vlc_config(setting_name):
"""
Reads the VLC configuration file to check for a specific setting.
# Example usage
setting_name = 'recordings_folder='
setting = read_vlc_config(setting_name)
"""
# Common paths for VLC config file on different operating systems
paths = {
'Windows': os.path.expanduser('~\\AppData\\Roaming\\vlc\\vlcrc'),
'Darwin': os.path.expanduser('~/Library/Preferences/org.videolan.vlc/vlcrc'),
'Linux': os.path.expanduser('~/.config/vlc/vlcrc')
}
os_type = platform.system()
config_path = paths.get(os_type)
if not config_path or not os.path.exists(config_path):
print("VLC config file not found for this operating system.")
return None
try:
with open(config_path, 'r', encoding="utf-8") as file:
for line in file:
if line.startswith(setting_name):
return line.strip()
except IOError as e:
print(f"Error reading config file: {e}")
return None
def get_vlc_playing_info(host='localhost', port=8080, password='password'):
"""
Gets the current playing information from VLC's HTTP interface.
"""
url = f'http://{host}:{port}/requests/status.xml'
try:
response = requests.get(url, auth=('', password))
if response.status_code == 200:
tree = ElementTree.fromstring(response.content)
status = tree.find('state').text
if status == 'playing':
file_info = tree.find('information/category[@name="meta"]/info[@name="filename"]').text
return status, file_info
return status, None
except Exception as e:
print(f"Error: {e}")
return None, None
def is_vlc_fullscreen():
"""
Checks if the VLC window is in full-screen mode.
When VLC is in full-screen mode, its window size matches the screen size with no borders.
"""
try:
# Get the VLC window; adjust the title as per your VLC window's title
vlc_window = gw.getWindowsWithTitle('VLC media player')[0] # Adjust title if needed
if not vlc_window:
return False
# Get screen size
screen_width, screen_height = pyautogui.size()
# Check if VLC window size matches the screen size
return (vlc_window.width == screen_width and vlc_window.height == screen_height)
except IndexError:
# VLC window not found
print("VLC window not found.")
return False
except Exception as e:
print(f"An error occurred: {e}")
return False

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Make the line spacing of first two paragraph into double line spacing",
"source": "https://www.youtube.com/watch?v=Q_AaL6ljudU",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1-svVsH-l2ofufEKuN-cYrIrvXNobtATE&export=download&authuser=0&confirm=t&uuid=be7f891a-f858-48f5-a72d-4e42bbfb8b65&at=APZUnTXzBnaeSJjmxeh4zG03pzA0:1704179807785",
"path": "Desktop/Double_Line_Spacing.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Double_Line_Spacing.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_line_spacing",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1-svVsH-l2ofufEKuN-cYrIrvXNobtATE&export=download&authuser=0&confirm=t&uuid=be7f891a-f858-48f5-a72d-4e42bbfb8b65&at=APZUnTXzBnaeSJjmxeh4zG03pzA0:1704179807785",
"dest": "Double_Line_Spacing_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Double_Line_Spacing.docx",
"dest": "Double_Line_Spacing.docx"
}
}
}

View File

@@ -1,12 +1,42 @@
{
"id": "0b17a146-2934-46c7-8727-73ff6b6483e8",
"snapshot": "libreoffice_writer",
"instruction": "Enter subscript",
"source": "https://ask.libreoffice.org/t/how-to-enter-superscript-and-subscript-in-libreoffice-base-forms-reports/23413",
"config": [],
"instruction": "Change the 2 in H2O to a subscript.",
"source": "https://askubuntu.com/questions/245695/how-do-you-insert-subscripts-and-superscripts-into-ordinary-non-formula-text-i",
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1Nx5AoKNM7tcDRE6y_qjNIDrPOKqhNyfm&export=download&authuser=0&confirm=t&uuid=bb4de348-3bbf-46a2-95b2-e2719c67547a&at=APZUnTUeA-BW7mkQsEw7NGm272zx:1704172916742",
"path": "Desktop/Enter_Subscript.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Enter_Subscript.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_files",
"result": {
"type": "vm_file",
"path": "Desktop/Enter_Subscript.docx",
"dest": "Enter_Subscript.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1AaKXeD9ZgfMykgijZ4G8MEzUjmMJElkq&export=download&authuser=0&confirm=t&uuid=5e347f0d-4efc-4478-878e-d89455d1593b&at=APZUnTWCYWfsD4eCeG52VJiK8-xB:1704172886196",
"dest": "Enter_Subscript_Gold.docx"
}
}
}

View File

@@ -3,10 +3,35 @@
"snapshot": "libreoffice_writer",
"instruction": "Add page number for every page at the bottom left",
"source": "https://ask.libreoffice.org/t/how-to-start-page-numbering-on-a-certain-page/39931/4",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1aDWe-vAmcfQSgtPjFfrncq8ZFnCy4uUK&export=download&authuser=0&confirm=t&uuid=788af72a-ddaf-4ba3-aedb-96f34cc4d815&at=APZUnTVSRSSfMGcjXqLzvMixnkp6:1704179663299",
"path": "Desktop/Add_Page_Number_Bottom_Left.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Add_Page_Number_Bottom_Left.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "has_page_numbers_in_footers",
"result": {
"type": "vm_file",
"path": "Desktop/Add_Page_Number_Bottom_Left.docx",
"dest": "Add_Page_Number_Bottom_Left.docx"
}
}
}

View File

@@ -3,10 +3,41 @@
"snapshot": "libreoffice_writer",
"instruction": "Change the font to \"Times New Roman\" throughout the text.",
"source": "https://ask.libreoffice.org/t/how-do-i-change-the-font-for-the-whole-document-in-writer/9220",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1fIHNzFm8JabWoLKOnxrFM722fQ1d_huK&export=download&authuser=0&confirm=t&uuid=d11a8dda-1e4e-4dc9-b05c-e6b47624dbf0&at=APZUnTVG0ViFnKJa00314wVr3uP9:1704185871014",
"path": "Desktop/Change_Font_Through_File.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Change_Font_Through_File.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_font_names",
"expected": {
"type": "rule",
"rules": {
"font_name": "Times New Roman"
}
},
"result": {
"type": "vm_file",
"path": "Desktop/Change_Font_Through_File.docx",
"dest": "Change_Font_Through_File.docx"
}
}
}

View File

@@ -3,10 +3,35 @@
"snapshot": "libreoffice_writer",
"instruction": "center-justify the first line",
"source": "https://askubuntu.com/questions/1066351/how-do-you-center-align-in-libreoffice#:~:text=Ctrl%20%2B%20e%20will%20Center%20align%20the%20cursor%20for%20you.",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1IQ4rKyHMOui71YlyL7huggpLYFYtj923&export=download&authuser=0&confirm=t&uuid=014c2335-c0c6-4712-9d5a-ca8d3217e07f&at=APZUnTVrM698NQgSh4hqYXR8cjDc:1704185072996",
"path": "Desktop/Centering_First_Line.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Centering_First_Line.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "is_first_line_centered",
"result": {
"type": "vm_file",
"path": "Desktop/Centering_First_Line.docx",
"dest": "Centering_First_Line.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Replace all newlines with paragraph marks in LibreOffice Write",
"source": "https://stackoverflow.com/questions/71685737/how-to-replace-all-newlines-with-paragraph-marks-in-libreoffice-write",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=18XFjPVUnLG_-KOM5sn-Sk74HP_JHivMy&export=download&authuser=0&confirm=t&uuid=d23041bc-2ddd-42c4-84ae-481b953f021c&at=APZUnTVYh0AK0245qsDOCol7SdMB:1704185512767",
"path": "Desktop/Replace_Newlines_with_Paragraph_Marks.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Replace_Newlines_with_Paragraph_Marks.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
"evaluator": {
"func": "compare_line_spacing",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1bP_noic02MuzrM8CdJIQN7F1gN4N8sel&export=download&authuser=0&confirm=t&uuid=657e0e4f-7b96-4d7e-83f4-99b79c68708f&at=APZUnTX7HsmefsMlzQaCGK2fg5Em:1704185514197",
"dest": "Replace_Newlines_with_Paragraph_Marks_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Double_Line_Spacing.docx",
"dest": "Replace_Newlines_with_Paragraph_Marks.docx"
}
}
}

View File

@@ -3,10 +3,32 @@
"snapshot": "libreoffice_writer",
"instruction": "Export the current document into PDF, keep the file name",
"source": "https://www.libreofficehelp.com/save-export-writer-documents-in-pdf-epub-format/",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1czb_13MshoiM-yCxvUGYD8OnIIrf-3VX&export=download&authuser=0&confirm=t&uuid=e7c30b67-7fac-4b64-a222-d04bc7c82842&at=APZUnTUA1te5vt7L__zJ7xuMs48e:1704177347643",
"path": "Desktop/Save_Writer_PDF.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Save_Writer_PDF.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "check_file_exists",
"file_name": "Save_Writer_PDF.pdf",
"directory": "/home/user/Downloads/"
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Insert the equation \"(a + b)^2 = a^2 + 2 a b + b^2\"",
"source": "https://askubuntu.com/questions/319593/how-to-type-science-equations-in-libre-office",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1FgMp7Ny63eXzeF23qHYhqQux31djlkah&export=download&authuser=0&confirm=t&uuid=d6b5208d-3b3a-4972-a641-ed738a419fdb&at=APZUnTX16Fz8Qg-B0NWpWgC-3Dyu:1704184410221",
"path": "Desktop/Insert_Equation.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Equation.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_insert_equation",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1hMFJnwHs7Iaexz3b9O2LJQUfsJ2wiwZ9&export=download&authuser=0&confirm=t&uuid=2abb49fb-d9c7-46cf-bc21-e69ecb9cefc6&at=APZUnTVzEZjChcUb4MIoxuq4cGea:1704184411805",
"dest": "Insert_Equation_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Equation.docx",
"dest": "Insert_Equation.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Insert a 7*5 empty table",
"source": "https://www.youtube.com/watch?v=l25Evu4ohKg",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1kXBP0jMxTeVahzFLYbYHJtjjgmuzrA8R&export=download&authuser=0&confirm=t&uuid=f8b9bad3-415d-4d39-a4fb-05a4cf881cf0&at=APZUnTXaohwzl8_2RDF_tgUsP9cH:1704181463579",
"path": "Desktop/Insert_Empty_Table.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Empty_Table.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=14JfHsW9GvbhORdtVAtvEbOi00MqEyHfb&export=download&authuser=0&confirm=t&uuid=3dba2459-ac37-4cad-a982-adecd406382a&at=APZUnTVQUqUPq_WacgY2xu4PvAKB:1704181465512",
"dest": "Insert_Empty_Table_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Empty_Table.docx",
"dest": "Insert_Empty_Table.docx"
}
}
}

View File

@@ -1,12 +1,53 @@
{
"id": "6ada715d-3aae-4a32-a6a7-429b2e43fb93",
"snapshot": "libreoffice_writer",
"instruction": "Insert the image which is in IMAGE_PATH where my cursor is",
"instruction": "Copy the screenshot 1.jpg from the desktop to where my cursor is locatedInsert the image which is in IMAGE_PATH where my cursor is",
"source": "https://www.quora.com/How-do-you-insert-images-into-a-LibreOffice-Writer-document",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1ebLG0gmqYv41ek4UmKWhFsxBnoUSGjKp&export=download&authuser=0&confirm=t&uuid=8f7d7bee-1fe4-4c4c-8b69-8aaf47199c57&at=APZUnTVYUvYTopUXCVs69QWWwPbq:1704173993139",
"path": "Desktop/Insert_Image_At_Cursor.docx"
}
]
}
},
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=1QfjQ4SKtjKDXWpqa2u6mC_KtB3ASEK5O&export=download&authuser=0&confirm=t&uuid=06af00b9-58f3-4691-a6a3-34309c80cbbb&at=APZUnTVZpE1lMxcvGG0cdt5zuxZ_:1704174003198",
"path": "Desktop/1.jpg"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Insert_Image_At_Cursor.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_contains_image",
"result": {
"type": "vm_file",
"path": "Desktop/Insert_Image_At_Cursor.docx",
"dest": "Insert_Image_At_Cursor.docx"
},
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1xbhlfqGrPutHHi2aHg66jwXD-yaZpe9j&export=download&authuser=0&confirm=t&uuid=427765e0-3f97-4a72-92db-a1fe7cdde73b&at=APZUnTUhNLh2PDu4OGkCVQW-LPCd:1704173991269",
"dest": "Insert_Image_At_Cursor_Gold.docx"
}
}
}

View File

@@ -3,10 +3,40 @@
"snapshot": "libreoffice_writer",
"instruction": "Convert the content seperated by commas to a table",
"source": "https://www.youtube.com/watch?v=l25Evu4ohKg",
"config": [],
"config": [
{
"type": "download",
"parameters": {
"files": [
{
"url": "https://drive.usercontent.google.com/download?id=139-NmslBR_9qlD7hUO08xj_VFffV95eM&export=download&authuser=0&confirm=t&uuid=64a6c35d-f3ce-4c25-9f83-4a952e24c5ad&at=APZUnTUL1GMR_QbpFQnC9fPwkdqa:1704183959196",
"path": "Desktop/Convert_Text_To_Table.docx"
}
]
}
},
{
"type": "open",
"parameters": {
"path": "Desktop/Convert_Text_To_Table.docx"
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"libreoffice_writer"
],
"evaluator": "evaluation_dir"
}
"evaluator": {
"func": "compare_docx_tables",
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1fs2msYaxnEZL9XASENQMZIag2MTxIBJs&export=download&authuser=0&confirm=t&uuid=6c71f008-082c-4f0c-9ffc-0a802f5cbfe6&at=APZUnTVDpucMDfk5P2T-0dZx_KVV:1704183960360",
"dest": "Convert_Text_To_Table_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Convert_Text_To_Table.docx",
"dest": "Convert_Text_To_Table.docx"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "adf5e2c3-64c7-4644-b7b6-d2f0167927e7",
"snapshot": "libreoffice_writer",
"instruction": "Helping me adding CITATION_TEXT to my reference list, and add a cross reference after the word \"WHERE_WE_ADD_REFERENCE\"",
"instruction": "Helping me adding \"C. Luo and M. J. Carey, \"LSM-based storage techniques: a survey,\" The VLDB Journal, vol. 29, no. 1, pp. 393418, 2020.\" to my reference list, and add a cross reference at the end of the first paragraph",
"source": "https://seekstar.github.io/2022/04/11/libreoffice%E5%BC%95%E7%94%A8%E6%96%87%E7%8C%AE/",
"config": [
{
@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://drive.google.com/uc?export=download&id=1boNZ3JuqUx2apMqExL3UX9A4Sopi77ke7yKnIO3cpbg",
"url": "https://drive.usercontent.google.com/download?id=1xOfwImgkPzdmjQomj-MCFd8nQS75OjaH&export=download&authuser=0&confirm=t&uuid=7eb91c26-dad5-4480-b1ec-35e506cde1e4&at=APZUnTW01MvBI_gkC8yoiyAVs7yi:1704188254979",
"path": "Desktop/Add_Citation_Cross_Reference.docx"
}
]
@@ -30,13 +30,13 @@
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://doc-14-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/erhjsu6vpod00o7ruf95hlnqkk/1703931360000/108888117743638485671/108888117743638485671/1tiN6A9-zQ2gfDPTWAYK5mv5JbUe2Y_PqPIyuvsu7PgA?format=docx&dat=AOBvIb1uRgycIK4pNup7ZrnJqwNjYOgTUlrhxAc8DnBWzUt9zDxLm3e4s0KQytzQ1qvFZaBr8-ymrVv7Mmb7ovpVk4k8sgS_2MRD1m-tMUDiUEGFtoxrECd4Xoaspuwb-BZttyU1cCdY3U12qcNWy5Cts_uys6ouKZok01Z7s1J233udfrMbXvDt_X-HeNo_7e6Bh64ZC4ohHOKZddsuayKYxPTKpgnho_8FPuWXqZDKyfYRDoTXxGWv-WrZSVqRSHP6GMtBdWc1-QBuWzH_iRTM64joeveSDppMjMeB5bjdJQ7EXf-EjA8MjSxtvQQGBmun7PoZ-W7fLmQ1E3fZKJ5BwQDOIJHDCBar83iHHoXOUJ1Q5UbkKcCS0nJ_pprCzRYXLSeVfN0_bdGuY2lSE8GhX-yGlyGIjAIZK-YulOFXwV0--4aD10rh43A5GLmSLeNZe6maUU33j1V-zUtp1qPgRk3SnPJENNOXf-sOYAvQqSgROSBvAwElqgHUMD_ROK692M7_7OtFe4sjs0eVnBzROEHy-ZznXqdSXJj6-2vloXHWfswPfE-Mq5kc7F1zX4CY6H1kQ-zgHzeLX-qQA6YmgZPJ0pLzFkAkBiMAjPigA_2dy7jk-niePSbZ9DcgYoX6iv6MkJ0y6JE_HQF7Gr6kDBiOjOyDp7gFoMj35F41Fac1wpSJmoiUEGLg0qGRBZ6BPc54m-AAFuy-2s4BUUtPgk-FlTD1jSpHDXLbJ-VQFglx1CYpfqFAnmIE8yseQPh3GqQYyCtCfD-zzO-CRTT9A-XOQVuH27npfk2gMDKtGwJr7XhNL8lL9b8540uTjt9nFnmNfDZCFK01VULdHZesSBedqM4iApgVVnjok8nmYw14e9WSgJOdjeiYAwI",
"path": "https://drive.usercontent.google.com/download?id=1wFQU7hkAT2wmSHTgM22F9Ep4WXmymEMW&export=download&authuser=0&confirm=t&uuid=a7ea1eec-678b-4407-b023-df13cc6f8c54&at=APZUnTW3WoqOfS9A1BW79XfV8jKh:1704188260410",
"dest": "Add_Citation_Cross_Reference_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Add_Citation_Cross_Reference.xlsx",
"dest": "Add_Citation_Cross_Reference.xlsx"
"path": "Desktop/Add_Citation_Cross_Reference.docx",
"dest": "Add_Citation_Cross_Reference.docx"
}
}
}

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-08-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/50aih0mm4k09m25qfa3qrhiuk4/1703929965000/108888117743638485671/108888117743638485671/1NcJF7vVondy_r7toxlB9hLPAZa_SE2J8Q8jwMXS8VUA?format=docx&dat=AOBvIb2I6t_SHXA1k8jP6BwfHmYlD9vYHbrSA2-mYotTRWVDp45o4YRrwJXoy21qBnq92d646pP7IQH6-gXTi7oDIcGqD0iYR6CLD1s2PyYX2F8wQ703_cw61GoVYGf8BoorXDY6Y44dfXY0j2RigWDdbimS1rSLUy3TGEmTl8jZq71zrNUKiS25zCsfuXONsexH0tGI1d8LfnVKrFLCQvIlrVF7lV9lgi4lJhuUwIIKF1JdziNoNBohbCuhv-h4iGPRyoFxC4hZAOVJEHy1wIvBA64rNsw1N_nplLxx42I7MC9-3F24Lkxi3xfJ81nEYSx8ma5D9V_AHLLRLmIrpPKYk1s47qPQxbSGrcO1362WJeMxb8lys71APnPwfWbodnxZLdJR2x2WfdYiQWpZGRzBf3-CeQmORrMQDSwWOHsEGMiCw8qTKevzhY1s4aZWBxpQO7ocCoL1gLOxxEvj4eSLvxp2S1u_dFjjr-dcMxt9-Xu210BGd-1Q6kUYzexRuI6I1vkWxDFn7GHgkVf-RhbMT52W_FFOo2Um4rXIfV62W5_nZrmJjz6KNOGdAbIJdkmTrS_lESb6GDmkOFwNarmTlZVOCDN-On7HGaYF1KvX0hobR0559-wKetJj2diqCDOlDXFemtkzvX-CDCRBnDmQxq1ZaQEsjAHhu7sE9jlZT0ywUHV3VpKBcepolqaCRAhX0gCf1cSht7LDHODeX9Hbn3tz810aYlETCJQo9QScbN87i4IV3qFbezwymMi0ZDLgWW0BtEa1gFMY89om6YGscnCHUHGCGy_GW2XscOiQq3rJngCmuu4Ivfta_7GB0e9NeflOIO3wlCpTlw6aQVh9sIB0MMTpDaZ6V1SXSnPInj15tLbCiAPXzNxfg-8",
"url": "https://drive.usercontent.google.com/download?id=1LZ1_U9CyR8oOkqfbY6HMLWr4wBVYhCLR&export=download&authuser=0&confirm=t&uuid=574d533a-8df9-4f33-bebd-689b623f27a9&at=APZUnTVruCDRxY661_PVT9BA3839:1704180420603",
"path": "Desktop/Capitalize_First_Letter.docx"
}
]
@@ -30,13 +30,13 @@
"func": "compare_docx_files",
"expected": {
"type": "cloud_file",
"path": "https://doc-10-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/tj4rm8e6bt50lht09qgev340nk/1703930035000/108888117743638485671/108888117743638485671/1vmb6LphTM8jKf2K7fkrmeWhy_wEgMtvthJx7KFtBwhE?format=docx&dat=AOBvIb1sqCYIZBDWtevTgWhVtsxSMJl1RuUy-MFBRUZszTYkE1Y_qhzC_RfgEvMx5kJz_GECf3mFQkPQq6Re7HFsKVaDU4_KXQE13ZM1cCIEtAEJ7UKZPcI55xO_dMh7ig_1wArgt_jviaEmA5RJcdLHu-omc7W23lcTfPZQpZTextkU7vtgQJGceYeC25JIdRPpsTYVXvqhD5Bjtq-ArRQO4f3huW-TfFKdiVh3MFjkuMx8fMg3l60l8JH_lUqw2BqCqzQDVeD_ajYrmzrFQMP5rFXj353S5HtCiAdSlClI1I6nRMLAELwtsgkqEIc5pwNxcOUKZU1lHkCl1wljzOkrLxRSPlQ1Hb2h0YbRVbPARBB6ywe5QooHn9HatQr_4hkzMTRug4Qv-fo39-F5Uy5bNeGPlK4tDtOUPUDUQs0Kbnn_gT9zzSUCXj4BW85tmNtCNc-Akt4_tPXGyEqlNELpFeBaK27EETF6S93N7C5OU9SfYbL8u29YgTLq1229JmJ3dcUr8yDv2oFLx9x_PNbAStSYABZaDCi1B5B2gPSUvxdQ7CtkoFodD0e7XwBWqDi3jC1N2LdBa8mUsIkFVJvI3PmixODcgzJb5MTkKBwWKHw0UqV-Zsl2whtWEEMeeu6HdgsIiuzSs56dUDsOIJXhu2PfIojjyoX91-NeffGEVQ5-w9l3_EfNpOUHLli3_Ju8w5YvjNoS9gU-g2HTdljnWydN0j0jiz1otjiE0oQxMzVqvWNMa3Qap2vPvQMVoOB_7SwBzcEVmi-SnitWvrXIXs3o585Qc6MBeDQ20D0VhJGsFJ8vVqxtDI8AOIC-t8NaYatFoKXuQLJckJ1wcqA7NmFxWa2hWU79l6dwPztsK9w0VJQyMSwJOMFPXWU",
"path": "https://drive.usercontent.google.com/download?id=1ykIGg48GjYUOSw8t44evShJE2HX7R5w3&export=download&authuser=0&confirm=t&uuid=ed81d6cd-6044-49f1-be86-35adeaeeea00&at=APZUnTUxW8WLyPr-_smA2Mnwpuuv:1704180422490https://drive.usercontent.google.com/download?id=1ykIGg48GjYUOSw8t44evShJE2HX7R5w3&export=download&authuser=0&confirm=t&uuid=ed81d6cd-6044-49f1-be86-35adeaeeea00&at=APZUnTUxW8WLyPr-_smA2Mnwpuuv:1704180422490",
"dest": "Capitalize_First_Letter_Gold.docx"
},
"result": {
"type": "vm_file",
"path": "Desktop/Capitalize_First_Letter.xlsx",
"dest": "Capitalize_First_Letter.xlsx"
"path": "Desktop/Capitalize_First_Letter.docx",
"dest": "Capitalize_First_Letter.docx"
}
}
}

View File

@@ -1,7 +1,7 @@
{
"id": "ecc2413d-8a48-416e-a3a2-d30106ca36cb",
"snapshot": "libreoffice_writer",
"instruction": "Insert a blank page here",
"instruction": "Insert a blank page",
"source": "https://www.quora.com/How-can-I-insert-a-blank-page-on-libreoffice",
"config": [
{
@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-0s-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/nqen29nemcpds1sk8t0ck303a8/1703926705000/108888117743638485671/108888117743638485671/10KfmCo_A5M04w4TA_qzPkd6vw04K3xzBzluIgeRCXuo?format=docx&dat=AOBvIb208vnoN2v6hNgA1FJ6jqegOR6oyVuKcoeroXXtmMiXWTao5vsteKCqxz2lnj5rioDVslkXyDQPCKYBnAJhtvKsIE8cJ7V2DtYuL9jFKYtJgClq2RsfpGE0hgvlLuoLZmeaRLaEJv10NTPGRTULPYwgN-RqLdnyG4EojXxrTKxk4TfVM7xAqob7uE24vFyGMH4uMctC2obdXNsDKnZ4dM1eao-3ABtlyPzSVpT891ziY_SMclP7l7y7cLqw4S11zbbnteVYpM5ytRlgpWKFrKvCmX8gRKpT0ENcNlL4ILJdi3KOAhU97X3vQNkS0LyqmAzKjBeuxz5tD3CuAj7LN1xu2t-DaLVvvrZPm_-XHFGvbHdy5wY66HtMqqPmaRb9_898Tl5ODZxfd5XP1CCRw-c8ohA2Jmdl86Scr8XDA7C_mAT8m_E1FLvJLJhJ_TyL74H0-TRiAPA2noaX2PUmOt4G1qFF_aIOn56iPPt3hB3eHvgthD0bVuW3TZUyr6cP4ZM_TF7g9awhXa1xWusltHItieNfNaJOPiI4Lacon_uICbbpSvEhuq5-apCsnwXpKIvK18UKP5u1Fw1Zb8AhAocJpHLxej87mInzYfFr7XAdf1kiPPxh1zRL2yW_Qe-J4YxWn0oBRrNrf_IgfQK_z9QRXgzzS3xaby2AsmA0qMNHIbMT73Uvha0TvO5UxozPc-aejeuaoi7ot27uSAwd0Cd4Yi-d4e7qfqVgNqvLl-psT9ZZ7cWq8vhU2lPiHrlmhVIwiWjf-s57gRNyXN99SY7MLG-b_JhOI43JgzZzfhjMc0EG2UrCxEEiOOGCp57BwH9FjqM1SQSenAlPmy28e8wCShBwZba_WUbwStumKQakIkwYqeoc0VoJN38",
"url": "https://drive.usercontent.google.com/download?id=1sDufDSC4foI379-Jikya9WK7FBUSqgrt&export=download&authuser=0&confirm=t&uuid=0abd82d6-2b2c-49bc-af5e-49bfe1c99278&at=APZUnTURIqTNJcIHBcMP2BxEaGXr:1704174850900",
"path": "Desktop/Insert_Blank_Page.docx"
}
]

View File

@@ -9,7 +9,7 @@
"parameters": {
"files": [
{
"url": "https://doc-10-20-docstext.googleusercontent.com/export/hglu5spf1pl3kjo47q5hl516lk/jq5b3etcrfdc25il9orjsk8jgo/1703926500000/108888117743638485671/108888117743638485671/1ufVZd-Uibt9pVClmK9BceqMR6iQNSekH5ECxysnPehY?format=docx&dat=AOBvIb3K-ByHFQ8OY7SbFlbA41gbWygryhR0tjcDhZuUWmdje6d2VxzZsK00RoorX_LOOjpnln1zFpw9-W1PLbjKMx1-cOGZfuVpqBiL3mOiYLdQPxqqPgrRKjzJzeD0SZOCK96nu8wIGoY-tDVwAoGzf98-lxjDOO1Z3slrW4YeTUPZQ17EusYw75S8FzBIMxW9UGzMPMtubUK_JVrHQOU-ghu8bz0atPRrkB44ysWeF0W063sg03ysAnb1557Ie0p3RgrcMc9aeGtKvQFCo0Tr7BkR93D2klp6M5pDMJekgtUGxurwiEmNeZ6nRhp-bYoev1uesAhGzZONVi_1DtaHvGzL6MGMIzfV5rWtMXbFI1CBwtP00AuF5qFOD6l2wkRVogas48MWOxBCX-bcUHOxezVDmxb0ohfCveIDMq0s8ebY5HggfrE9I8pMs-2GNPABUSr4S7MkRO-2yzy-j8pgTtzO3QRc146gd9Hci6aYoAnBIludK31AsLckcVba-OrEyB7Lx31sfzvdITS8nZ4Cg_JWMV9CugNgF_8w0SprvDMw9vsoEjYaJpY2Z_K445GGENY7dGRQbGmBhLeP9wJBXHsNhObWKV71BrPm2wSOJLrFU2iLa5jLY7mkz7xKhq3e9dDttus9c6A0KPj1f54YAsvZ_SEPbE1WBVzMYPD3MV-6yw2KbKgZxYQ9A0lf87KoffIbA24Y2S97FBuOWJ5ZVN2rz02PbpXyuMf1fcnUb8JpAm6ewwArKqtmIJg20hySiYOtZUgfQvjwBaDrMhQjKGKYiLXIEdGTWVQuuTGQhG8pqd4StbxUsCwdMiFOFVXV0mNNncz3QZEOPF5fgW564KuE9qFClhq620ve61mgg6_3S2kQ9RhHYaShvuI",
"url": "https://drive.usercontent.google.com/download?id=1X2XTU2ZFuMXOhm7T400e6AOe6eBYxWzD&export=download&authuser=0&confirm=t&uuid=1318923f-6d54-4148-aa80-a454b9963cec&at=APZUnTU-h1nmcjBO_ytWVxXuh8l9:1704187013730",
"path": "Desktop/Set_Default_Font.docx"
}
]
@@ -28,10 +28,15 @@
],
"evaluator": {
"func": "find_default_font",
"expected": "Times New Roman",
"expected": {
"type": "rule",
"rules": {
"font_name": "Times New Roman"
}
},
"result": {
"type": "vm_file",
"path": "/home/[your-username]/.config/libreoffice/4/user/registrymodifications.xcu",
"path": "/home/user/.config/libreoffice/4/user/registrymodifications.xcu",
"dest": "registrymodifications.xcu"
}
}

View File

@@ -3,10 +3,9 @@
import os
import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import PIL.Image
import json
import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
import torch
import argparse
@@ -15,7 +14,7 @@ import argparse
from seem.modeling.BaseModel import BaseModel as BaseModel_Seem
from seem.utils.distributed import init_distributed as init_distributed_seem
from seem.modeling import build_model as build_model_seem
from task_adapter.seem.tasks import interactive_seem_m2m_auto, inference_seem_pano, inference_seem_interactive
from task_adapter.seem.tasks import inference_seem_pano
# semantic sam
from semantic_sam.BaseModel import BaseModel
@@ -28,14 +27,42 @@ from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_sw
# sam
from segment_anything import sam_model_registry
from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto
from task_adapter.sam.tasks.inference_sam_m2m_interactive import inference_sam_m2m_interactive
from scipy.ndimage import label
from io import BytesIO
import numpy as np
SYS_PROMPT = '''
You will act as an agent which follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
Firstly you need to predict the class of your action, select from one below:
- **CLICK**: click on the screen with the specified integer label
- **TYPE**: type a string on the keyboard
- For CLICK, you need to predict the correct integer label shown on the screenshot
for example, format as:
```
{
"action_type": "CLICK",
"label": 7
}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
You can predict multiple actions at one step, but you should only return one action for each step.
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
'''
build args
'''
# build args
semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml"
seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml"
@@ -47,9 +74,7 @@ opt_semsam = load_opt_from_config_file(semsam_cfg)
opt_seem = load_opt_from_config_file(seem_cfg)
opt_seem = init_distributed_seem(opt_seem)
'''
build model
'''
# build model
model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda()
model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda()
model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda()
@@ -65,65 +90,54 @@ def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs
elif slider > 2.5:
model_name = 'sam'
else:
if mode == 'Automatic':
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
elif slider < 1.5 + 0.42:
level = [3]
elif slider < 1.5 + 0.56:
level = [4]
elif slider < 1.5 + 0.70:
level = [5]
elif slider < 1.5 + 0.84:
level = [6]
else:
level = [6, 1, 2, 3, 4, 5]
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
elif slider < 1.5 + 0.42:
level = [3]
elif slider < 1.5 + 0.56:
level = [4]
elif slider < 1.5 + 0.70:
level = [5]
elif slider < 1.5 + 0.84:
level = [6]
else:
model_name = 'sam'
level = [6, 1, 2, 3, 4, 5]
if label_mode == 'Alphabet':
label_mode = 'a'
else:
label_mode = '1'
text_size, hole_scale, island_scale = 640, 100, 100
text_size, hole_scale, island_scale = 1280, 100, 100
text, text_part, text_thresh = '', '', '0.0'
with torch.autocast(device_type='cuda', dtype=torch.float16):
semantic = False
if mode == "Interactive":
labeled_array, num_features = label(np.asarray(image['mask'].convert('L')))
spatial_masks = torch.stack([torch.from_numpy(labeled_array == i+1) for i in range(num_features)])
if model_name == 'semantic-sam':
model = model_semsam
output, mask = inference_semsam_m2m_auto(model, image['image'], level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
output, mask = inference_semsam_m2m_auto(model, image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
elif model_name == 'sam':
model = model_sam
if mode == "Automatic":
output, mask = inference_sam_m2m_auto(model, image['image'], text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_sam_m2m_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_sam_m2m_auto(model, image, text_size, label_mode, alpha, anno_mode)
elif model_name == 'seem':
model = model_seem
if mode == "Automatic":
output, mask = inference_seem_pano(model, image['image'], text_size, label_mode, alpha, anno_mode)
elif mode == "Interactive":
output, mask = inference_seem_interactive(model, image['image'], spatial_masks, text_size, label_mode, alpha, anno_mode)
output, mask = inference_seem_pano(model, image, text_size, label_mode, alpha, anno_mode)
return output
return output, mask
# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def encode_image(image):
pil_img = PIL.Image.fromarray(image)
buff = BytesIO()
pil_img.save(buff, format="JPEG")
new_image_string = base64.b64encode(buff.getvalue()).decode("utf-8")
return new_image_string
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
@@ -156,7 +170,6 @@ def parse_actions_from_string(input_string):
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -181,7 +194,8 @@ class GPT4v_Agent:
]
def predict(self, obs):
obs = inference(obs, slider=2.0, mode="Automatic", alpha=0.1, label_mode="Alphabet", anno_mode=["Mask", "Mark"])
obs, mask = inference(obs, slider=3.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"])
PIL.Image.fromarray(obs).save("desktop.jpeg")
base64_image = encode_image(obs)
self.trajectory.append({
"role": "user",
@@ -212,14 +226,14 @@ class GPT4v_Agent:
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
actions = self.parse_actions(response.json()['choices'][0]['message']['content'], mask)
except:
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None
return actions
def parse_actions(self, response: str):
def parse_actions(self, response: str, mask):
# response example
"""
```json
@@ -232,6 +246,7 @@ class GPT4v_Agent:
# parse from the response
actions = parse_actions_from_string(response)
print(actions)
# add action into the trajectory
self.trajectory.append({
@@ -247,24 +262,14 @@ class GPT4v_Agent:
# parse action
parsed_actions = []
for action in actions:
parsed_action = {}
action_type = Action[action['action_type']].value
parsed_action["action_type"] = action_type
action_type = action['action_type']
if action_type == "CLICK":
label = int(action['label'])
x, y, w, h = mask[label-1]['bbox']
parsed_actions.append({"action_type": action_type, "x": int(x + w//2) , "y": int(y + h//2)})
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
parsed_action["click_type"] = MouseClick[action['click_type']].value
if action_type == Action.MOUSE_MOVE.value:
parsed_action["x"] = action["x"]
parsed_action["y"] = action["y"]
if action_type == Action.KEY.value:
parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys
if action_type == Action.TYPE.value:
parsed_action["text"] = action["text"]
parsed_actions.append(parsed_action)
if action_type == "TYPE":
parsed_actions.append({"action_type": action_type, "text": action["text"]})
return parsed_actions
@@ -273,5 +278,6 @@ if __name__ == '__main__':
# OpenAI API Key
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
print(agent.predict(obs="stackoverflow.png"))
agent = GPT4v_Agent(api_key=api_key, instruction="Open Firefox")
obs = PIL.Image.open('desktop.png')
print(agent.predict(obs=obs))

Binary file not shown.


View File

@@ -0,0 +1,401 @@
# --------------------------------------------------------
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
# Copyright (c) 2022 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Xueyan Zou (xueyan@cs.wisc.edu)
# --------------------------------------------------------
# Define Test/Trainer/Saving
PIPELINE: XDecoderPipeline
TRAINER: xdecoder
SAVE_DIR: '../../data/output/test'
base_path: "./"
# Resume Logistic
RESUME: false
WEIGHT: false
RESUME_FROM: ''
EVAL_AT_START: False
# Logging and Debug
WANDB: False
LOG_EVERY: 100
FIND_UNUSED_PARAMETERS: false
# Speed up training
FP16: false
PORT: '36873'
# misc
LOADER:
JOINT: False
KEY_DATASET: 'coco'
##################
# Task settings
##################
VERBOSE: true
MODEL:
NAME: seem_model_v1
HEAD: xdecoder_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
TEXT:
ARCH: vlpencoder
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 77 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: focal
PRETRAINED: ''
LOAD_PRETRAINED: false
FOCAL:
PRETRAIN_IMG_SIZE: 224
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [2, 2, 18, 2]
FOCAL_LEVELS: [4, 4, 4, 4]
FOCAL_WINDOWS: [3, 3, 3, 3]
DROP_PATH_RATE: 0.3
MLP_RATIO: 4.0
DROP_RATE: 0.0
PATCH_NORM: True
USE_CONV_EMBED: True
SCALING_MODULATOR: True
USE_CHECKPOINT: False
USE_POSTLN: true
USE_POSTLN_IN_MODULATION: false
USE_LAYERSCALE: True
OUT_FEATURES: ["res2", "res3", "res4", "res5"]
OUT_INDICES: [0, 1, 2, 3]
ENCODER:
NAME: transformer_encoder_fpn
IGNORE_VALUE: 255
NUM_CLASSES: 133
LOSS_WEIGHT: 1.0
CONVS_DIM: 512
MASK_DIM: 512
NORM: "GN"
IN_FEATURES: ["res2", "res3", "res4", "res5"]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
DECODER:
NAME: seem_v1
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK:
ENABLED: True
DETECTION: False
SPATIAL:
ENABLED: True
MAX_ITER: 1
GROUNDING:
ENABLED: True
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
RETRIEVAL:
ENABLED: False
LVIS:
ENABLED: True
THRES: 0.7
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.5
SIM_THRES: 0.95
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
GCLASS_WEIGHT: 0.4
GMASK_WEIGHT: 1.0
GDICE_WEIGHT: 1.0
SCLASS_WEIGHT: 0.4
SMASK_WEIGHT: 1.0
SDICE_WEIGHT: 1.0
OCLASS_WEIGHT: 0.4
OMASK_WEIGHT: 1.0
ODICE_WEIGHT: 1.0
CLASS_WEIGHT: 2.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BBOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
CAPTION_WEIGHT: 2.0
COST_SPATIAL:
CLASS_WEIGHT: 5.0
MASK_WEIGHT: 2.0
DICE_WEIGHT: 2.0
HIDDEN_DIM: 512
NUM_OBJECT_QUERIES: 101
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
MAX_SPATIAL_LEN: [512, 512, 512, 512]
# ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
TOP_GROUNDING_LAYERS: 10
TOP_CAPTION_LAYERS: 10
TOP_SPATIAL_LAYERS: 10
TOP_OPENIMAGE_LAYERS: 10
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.8
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
# Spatial sampler
STROKE_SAMPLER:
MAX_CANDIDATE: 1
CANDIDATE_PROBS: [0.25, 0.25, 0.25, 0.25] # for training only
CANDIDATE_NAMES: ["Point", "Polygon", "Scribble", "Circle"]
DILATION: 3
CIRCLE:
NUM_STROKES: 5
STROKE_PRESET: ['object_like', 'object_like_middle', 'object_like_small']
STROKE_PROB: [0.33, 0.33, 0.33]
SCRIBBLE:
NUM_STROKES: 5
STROKE_PRESET: ['rand_curve', 'rand_curve_small']
STROKE_PROB: [0.5, 0.5]
POINT:
NUM_POINTS: 20
POLYGON:
MAX_POINTS: 9
EVAL:
MODE: 'best' # best/random/best_random
NEGATIVE: False
MAX_ITER: 20
IOU_ITER: 1
GROUNDING: False
# Multi-modal Architecture, order matters
ATTENTION_ARCH:
VARIABLE:
queries: ['object', 'grounding', 'spatial']
tokens: ['grounding', 'spatial']
memories: ['spatial']
SELF_ATTENTION:
queries:
object: ['queries_object']
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['queries_spatial', 'tokens_spatial', 'memories_spatial']
tokens:
grounding: ['queries_grounding', 'tokens_grounding']
spatial: ['tokens_spatial']
memories:
spatial: ['memories_spatial']
CROSS_ATTENTION:
queries:
object: True
grounding: True
spatial: True
memories:
spatial: True
tokens:
grounding: False
spatial: False
MASKING: ['tokens_spatial', 'tokens_grounding']
DUPLICATION:
queries:
grounding: 'queries_object'
spatial: 'queries_object'
SPATIAL_MEMORIES: 32
QUERY_NUMBER: 3
DATASETS:
TRAIN: ["coco_2017_train_panoptic_filtrefgumdval_with_sem_seg_caption_grounding_lvis",]
# TRAIN: ["coco_2017_train_panoptic_with_sem_seg_caption_grounding",]
TEST: ["coco_2017_val_panoptic_with_sem_seg", "pascalvoc_val_Point", "refcocog_val_umd"] # to evaluate instance and semantic performance as well
# TEST: ["pascalvoc_val_Point"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["cocomini_val_Point", "cocomini_val_Circle", "cocomini_val_Scribble", "cocomini_val_Polygon", "cocomini_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["ade600_val_Point", "ade600_val_Circle", "ade600_val_Scribble", "ade600_val_Polygon", "ade600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
# TEST: ["openimage600_val_Point", "openimage600_val_Circle", "openimage600_val_Scribble", "openimage600_val_Polygon", "openimage600_val_Box"] # [pascalvoc, openimage600, ade600, davis, cocomini], [Point, Scribble, Polygon, Circle, Box]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 4
BATCH_SIZE_PER_GPU: 4
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TRAIN: 800
MAX_SIZE_TRAIN: 1333
MIN_SIZE_TRAIN_SAMPLING: 'choice'
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
# Validation dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: 640
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: (640, 640)
SINGLE_CATEGORY_MAX_AREA: 1.0
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
SBD:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
VOC:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
DAVIS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
VOS:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 1
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
SPATIAL: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR: 0.0001
STEPS: [0.88889, 0.96296]
MAX_ITER: 1
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
FIX_PARAM:
backbone: True
lang_encoder: True
pixel_decoder: True
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 5.0 # 0.01
NORM_TYPE: 2.0
MAX_NUM_EPOCHS: 50
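
The file above is plain YAML, so it can be inspected outside the trainer. Below is a minimal sketch, assuming PyYAML and a hypothetical filename `seem_focall_v1.yaml`; the training pipeline presumably uses its own config loader rather than this.

```python
import yaml

# Load the config above and read back a few of its fields; the filename is an assumption.
with open("seem_focall_v1.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["SOLVER"]["BASE_LR"])                       # 0.0001
print(cfg["MODEL"]["DECODER"]["NUM_OBJECT_QUERIES"])  # 101
print(cfg["DATASETS"]["TRAIN"])                       # training dataset names
```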

View File

@@ -0,0 +1,524 @@
# ------------------------------------------------------------------------
# Semantic SAM
# Copyright (c) MicroSoft, Inc. and its affiliates.
# Modified from OpenSeed https://github.com/IDEA-Research/OpenSeed by Feng Li.
# ------------------------------------------------------------------------
##################
# Task settings
##################
WEIGHT: ''
PORT: 53711
VERBOSE: true
OUTPUT_DIR: '../../data/output/test'
# misc
LOADER:
JOINT: True
KEY_DATASET: 'coco'
# model
MODEL:
NAME: interactive_mask_dino
HEAD: general_head
MASK_ON: false
KEYPOINT_ON: false
LOAD_PROPOSALS: false
DIM_PROJ: 512
BACKBONE_DIM: 768
BACKGROUND: False
WEIGHTS: ''
TEXT:
ARCH: noencoder # no language encoder when training only on SA-1B data
NAME: transformer
TOKENIZER: clip
CONTEXT_LENGTH: 18 # 77
WIDTH: 512
HEADS: 8
LAYERS: 12 # 6
AUTOGRESSIVE: True
BACKBONE:
NAME: swin
PRETRAINED: 'https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_large_patch4_window12_384_22k.pth'
LOAD_PRETRAINED: true
SWIN:
PRETRAIN_IMG_SIZE: 384
PATCH_SIZE: 4
EMBED_DIM: 192
DEPTHS: [ 2, 2, 18, 2 ]
NUM_HEADS: [ 6, 12, 24, 48 ]
WINDOW_SIZE: 12
MLP_RATIO: 4.0
QKV_BIAS: true
QK_SCALE: ~
DROP_RATE: 0.0
ATTN_DROP_RATE: 0.0
DROP_PATH_RATE: 0.3
APE: false
PATCH_NORM: true
USE_CHECKPOINT: false
OUT_FEATURES: [ 'res2', 'res3', 'res4', 'res5' ]
ENCODER:
NAME: encoder_deform
IGNORE_VALUE: 255
NUM_CLASSES: 1
LOSS_WEIGHT: 1.0
CONVS_DIM: 256
MASK_DIM: 256
NORM: "GN"
IN_FEATURES: [ "res2", "res3", "res4", "res5" ]
DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: [ "res3", "res4", "res5" ]
COMMON_STRIDE: 4
TRANSFORMER_ENC_LAYERS: 6
TOTAL_NUM_FEATURE_LEVELS: 4
NUM_FEATURE_LEVELS: 3
FEATURE_ORDER: "low2high"
DECODER:
NAME: interactive_mask_dino
TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
MASK: True
BOX: True
PART: True
GROUNDING:
ENABLED: False
MAX_LEN: 5
TEXT_WEIGHT: 2.0
CLASS_WEIGHT: 0.5
CAPTION:
ENABLED: False
PHRASE_PROB: 0.0
SIM_THRES: 0.95
CAPTIONING:
ENABLED: False
STEP: 50
RETRIEVAL:
ENABLED: False
DIM_IMG: 768
ENSEMBLE: True
OPENIMAGE:
ENABLED: False
NEGATIVE_SAMPLES: 5
GROUNDING:
ENABLED: False
MAX_LEN: 5
DEEP_SUPERVISION: True
NO_OBJECT_WEIGHT: 0.1
CLASS_WEIGHT: 4.0
MASK_WEIGHT: 5.0
DICE_WEIGHT: 5.0
BOX_WEIGHT: 5.0
GIOU_WEIGHT: 2.0
IOU_WEIGHT: 1.0
COST_CLASS_WEIGHT: 4.0
COST_DICE_WEIGHT: 5.0
COST_MASK_WEIGHT: 5.0
COST_BOX_WEIGHT: 5.0
COST_GIOU_WEIGHT: 2.0
HIDDEN_DIM: 256
NUM_OBJECT_QUERIES: 0
NHEADS: 8
DROPOUT: 0.0
DIM_FEEDFORWARD: 2048
ENC_LAYERS: 0
PRE_NORM: False
ENFORCE_INPUT_PROJ: False
SIZE_DIVISIBILITY: 32
DEC_LAYERS: 9 # 9 decoder layers, add one for the loss on learnable query
TRAIN_NUM_POINTS: 12544
OVERSAMPLE_RATIO: 3.0
IMPORTANCE_SAMPLE_RATIO: 0.75
TWO_STAGE: False
INITIALIZE_BOX_TYPE: 'no'
DN: seg
DN_NOISE_SCALE: 0.4
DN_NUM: 100
INITIAL_PRED: False
LEARN_TGT: False
TOTAL_NUM_FEATURE_LEVELS: 4
SEMANTIC_CE_LOSS: False
PANO_BOX_LOSS: False
COCO: False
O365: False
SAM: True
PASCAL: False
RE_POINT: True
NUM_INTERACTIVE_TOKENS: 6
MAX_NUM_INSTANCE: 60
TEST:
SEMANTIC_ON: True
INSTANCE_ON: True
PANOPTIC_ON: True
BOX_INTERACTIVE: False
CLASSIFICATION_ON: False
OVERLAP_THRESHOLD: 0.8
OBJECT_MASK_THRESHOLD: 0.25
SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
TEST_FOUCUS_ON_BOX: False
PANO_TRANSFORM_EVAL: True
PANO_TEMPERATURE: 0.06
TEST:
EVAL_PERIOD: 500000
PRECISE_BN:
NUM_ITER: 1
ENABLED: False
AUG:
ENABLED: False
SAM:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.99
MAX_SCALE: 1.01
DATASET_MAPPER_NAME: "sam"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'sam'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 4
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
COCO:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "coco_interactive_panoptic_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
DATASET:
DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 1
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
VLP:
INPUT:
IMAGE_SIZE: 224
DATASET_MAPPER_NAME: "vlpretrain"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
TRAIN:
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
TEST:
BATCH_SIZE_TOTAL: 256
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
DATASETS:
TRAIN: ["sam_train"]
# interactive segmentation evaluation.
TEST: ["coco_2017_val_panoptic_with_sem_seg_interactive_jointboxpoint"]
# TEST: ["sam_minival"]
CLASS_CONCAT: false
SIZE_DIVISIBILITY: 32
PROPOSAL_FILES_TRAIN: []
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 16
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
# Detectron2 training config for optimizer and lr scheduler
SOLVER:
BASE_LR_END: 0.0
MOMENTUM: 0.9
NESTEROV: False
CHECKPOINT_PERIOD: 5000
IMS_PER_BATCH: 1
REFERENCE_WORLD_SIZE: 0
BIAS_LR_FACTOR: 1.0
WEIGHT_DECAY_BIAS: None
# original
BASE_LR: 0.0001
STEPS: [327778, 355092]
MAX_ITER: 368750
GAMMA: 0.1
WARMUP_FACTOR: 1.0
WARMUP_ITERS: 10
WARMUP_METHOD: "linear"
WEIGHT_DECAY: 0.05
OPTIMIZER: "ADAMW"
LR_SCHEDULER_NAME: "WarmupMultiStepLR"
LR_MULTIPLIER:
backbone: 0.1
lang_encoder: 0.1
WEIGHT_DECAY_NORM: 0.0
WEIGHT_DECAY_EMBED: 0.0
CLIP_GRADIENTS:
ENABLED: True
CLIP_TYPE: "full_model"
CLIP_VALUE: 0.01
NORM_TYPE: 2.0
AMP:
ENABLED: True
# Evaluation Dataset
ADE20K:
INPUT:
MIN_SIZE_TRAIN: [320, 384, 448, 512, 576, 640, 704, 768, 832, 896, 960, 1024, 1088, 1152, 1216, 1280]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 640
MAX_SIZE_TRAIN: 2560
MAX_SIZE_TEST: 2560
MASK_FORMAT: "polygon"
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [640, 640]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: 640 # used in dataset mapper
DATASET_MAPPER_NAME: "mask_former_panoptic"
FORMAT: "RGB"
DATASET:
DATASET: 'ade'
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 16
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 8
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
#ADE20K:
# INPUT:
# MIN_SIZE_TRAIN: 640
# MIN_SIZE_TRAIN_SAMPLING: "choice"
# MIN_SIZE_TEST: 640
# MAX_SIZE_TRAIN: 2560
# MAX_SIZE_TEST: 2560
# MASK_FORMAT: "polygon"
# CROP:
# ENABLED: True
# TYPE: "absolute"
# SIZE: (640, 640)
# SINGLE_CATEGORY_MAX_AREA: 1.0
# COLOR_AUG_SSD: True
# SIZE_DIVISIBILITY: 640 # used in dataset mapper
# DATASET_MAPPER_NAME: "mask_former_panoptic"
# FORMAT: "RGB"
# DATASET:
# DATASET: 'ade'
# TEST:
# BATCH_SIZE_TOTAL: 8
REF:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
FORMAT: "RGB"
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SUN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
SCAN:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 512
MAX_SIZE_TEST: 1024
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
BDD:
INPUT:
PIXEL_MEAN: [123.675, 116.280, 103.530]
PIXEL_STD: [58.395, 57.120, 57.375]
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 0
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: False
TEST:
BATCH_SIZE_TOTAL: 8
CITY:
INPUT:
MIN_SIZE_TRAIN: [ 512, 614, 716, 819, 921, 1024, 1126, 1228, 1331, 1433, 1536, 1638, 1740, 1843, 1945, 2048 ]
MIN_SIZE_TRAIN_SAMPLING: "choice"
MIN_SIZE_TEST: 1024
MAX_SIZE_TRAIN: 4096
MAX_SIZE_TEST: 2048
CROP:
ENABLED: True
TYPE: "absolute"
SIZE: [ 512, 1024 ]
SINGLE_CATEGORY_MAX_AREA: 1.0
IGNORE_VALUE: 255
COLOR_AUG_SSD: True
SIZE_DIVISIBILITY: -1
FORMAT: "RGB"
DATASET_MAPPER_NAME: "mask_former_panoptic"
MASK_FORMAT: "polygon"
TEST:
EVAL_PERIOD: 5000
BATCH_SIZE_TOTAL: 1
AUG:
ENABLED: False
MIN_SIZES: [ 512, 768, 1024, 1280, 1536, 1792 ]
MAX_SIZE: 4096
FLIP: True
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: True
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True
TRAIN:
ASPECT_RATIO_GROUPING: true
BATCH_SIZE_TOTAL: 2
BATCH_SIZE_PER_GPU: 2
SHUFFLE: true
PSACAL_PART:
INPUT:
MIN_SIZE_TEST: 800
MAX_SIZE_TEST: 1333
IMAGE_SIZE: 1024
MIN_SCALE: 0.1
MAX_SCALE: 2.0
DATASET_MAPPER_NAME: "pascal_part_lsj"
IGNORE_VALUE: 255
COLOR_AUG_SSD: False
SIZE_DIVISIBILITY: 32
RANDOM_FLIP: "horizontal"
MASK_FORMAT: "polygon"
FORMAT: "RGB"
CROP:
ENABLED: True
MODEL:
MASK_ON: True
KEYPOINT_ON: False
LOAD_PROPOSALS: False
# DATASET:
# DATASET: 'coco'
TEST:
DETECTIONS_PER_IMAGE: 100
NAME: coco_eval
IOU_TYPE: ['bbox', 'segm']
USE_MULTISCALE: false
BATCH_SIZE_TOTAL: 8
MODEL_FILE: ''
AUG:
ENABLED: False
TRAIN:
BATCH_SIZE_TOTAL: 1
BATCH_SIZE_PER_GPU: 1
SHUFFLE: true
DATALOADER:
FILTER_EMPTY_ANNOTATIONS: False
NUM_WORKERS: 2
LOAD_PROPOSALS: False
SAMPLER_TRAIN: "TrainingSampler"
ASPECT_RATIO_GROUPING: True

BIN
mm_agents/desktop.png Normal file

Binary file not shown.

After

Width: | Height: | Size: 1.5 MiB

View File

@@ -0,0 +1,3 @@
wget https://github.com/UX-Decoder/Semantic-SAM/releases/download/checkpoint/swinl_only_sam_many2many.pth
wget https://huggingface.co/xdecoder/SEEM/resolve/main/seem_focall_v1.pt
wget https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth
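
A quick sanity check, offered only as a sketch, that a downloaded checkpoint deserializes on CPU; nothing about each file's internal layout is assumed here.

```python
import torch

# Peek at one of the downloaded checkpoints without moving it to a GPU.
ckpt = torch.load("sam_vit_h_4b8939.pth", map_location="cpu")
print(type(ckpt))
if isinstance(ckpt, dict):
    print(list(ckpt.keys())[:10])  # first few top-level keys
```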

19
mm_agents/gemini_test.py Normal file
View File

@@ -0,0 +1,19 @@
import os
import PIL.Image
import google.generativeai as genai
# Read the API key from the environment instead of committing it to the repository;
# the variable name GOOGLE_API_KEY is a convention chosen here.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# for m in genai.list_models():
#     if 'generateContent' in m.supported_generation_methods:
#         print(m.name)
model = genai.GenerativeModel('gemini-pro-vision')
img = PIL.Image.open('image.jpg')
messages = [
{'role': 'user',
'parts': ["Explain this image.", img]}
]
response = model.generate_content(messages)
print(response.text)

View File

@@ -0,0 +1,13 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn_func import MSDeformAttnFunction

View File

@@ -0,0 +1,72 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import torch
import torch.nn.functional as F
from torch.autograd import Function
from torch.autograd.function import once_differentiable
try:
import MultiScaleDeformableAttention as MSDA
except ModuleNotFoundError as e:
info_string = (
"\n\nPlease compile MultiScaleDeformableAttention CUDA op with the following commands:\n"
"\t`cd mask2former/modeling/pixel_decoder/ops`\n"
"\t`sh make.sh`\n"
)
raise ModuleNotFoundError(info_string)
class MSDeformAttnFunction(Function):
@staticmethod
def forward(ctx, value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, im2col_step):
ctx.im2col_step = im2col_step
output = MSDA.ms_deform_attn_forward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, ctx.im2col_step)
ctx.save_for_backward(value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights)
return output
@staticmethod
@once_differentiable
def backward(ctx, grad_output):
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights = ctx.saved_tensors
grad_value, grad_sampling_loc, grad_attn_weight = \
MSDA.ms_deform_attn_backward(
value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights, grad_output, ctx.im2col_step)
return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
def ms_deform_attn_core_pytorch(value, value_spatial_shapes, sampling_locations, attention_weights):
# Pure-PyTorch reference implementation: intended for debugging and testing only;
# prefer the CUDA version for real workloads.
N_, S_, M_, D_ = value.shape
_, Lq_, M_, L_, P_, _ = sampling_locations.shape
value_list = value.split([H_ * W_ for H_, W_ in value_spatial_shapes], dim=1)
sampling_grids = 2 * sampling_locations - 1
sampling_value_list = []
for lid_, (H_, W_) in enumerate(value_spatial_shapes):
# N_, H_*W_, M_, D_ -> N_, H_*W_, M_*D_ -> N_, M_*D_, H_*W_ -> N_*M_, D_, H_, W_
value_l_ = value_list[lid_].flatten(2).transpose(1, 2).reshape(N_*M_, D_, H_, W_)
# N_, Lq_, M_, P_, 2 -> N_, M_, Lq_, P_, 2 -> N_*M_, Lq_, P_, 2
sampling_grid_l_ = sampling_grids[:, :, :, lid_].transpose(1, 2).flatten(0, 1)
# N_*M_, D_, Lq_, P_
sampling_value_l_ = F.grid_sample(value_l_, sampling_grid_l_,
mode='bilinear', padding_mode='zeros', align_corners=False)
sampling_value_list.append(sampling_value_l_)
# (N_, Lq_, M_, L_, P_) -> (N_, M_, Lq_, L_, P_) -> (N_, M_, 1, Lq_, L_*P_)
attention_weights = attention_weights.transpose(1, 2).reshape(N_*M_, 1, Lq_, L_*P_)
output = (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights).sum(-1).view(N_, M_*D_, Lq_)
return output.transpose(1, 2).contiguous()
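
The fallback `ms_deform_attn_core_pytorch` operates on plain CPU tensors, although importing this module still requires the compiled `MultiScaleDeformableAttention` op because of the module-level import above. A sketch mirroring the tensor shapes used in `mm_agents/ops/test.py` (the concrete sizes are arbitrary):

```python
import torch
from functions.ms_deform_attn_func import ms_deform_attn_core_pytorch  # import path as in test.py

N, M, D = 1, 2, 4      # batch, attention heads, channels per head
Lq, L, P = 3, 2, 2     # queries, feature levels, sampling points per level
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long)
S = int(shapes.prod(1).sum())  # total number of spatial positions across levels

value = torch.rand(N, S, M, D)
sampling_locations = torch.rand(N, Lq, M, L, P, 2)        # normalized to [0, 1]
attention_weights = torch.rand(N, Lq, M, L, P) + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)

out = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights)
print(out.shape)  # (N, Lq, M*D) -> torch.Size([1, 3, 8])
```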

13
mm_agents/ops/make.sh Executable file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
python setup.py build install

View File

@@ -0,0 +1,12 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from .ms_deform_attn import MSDeformAttn

View File

@@ -0,0 +1,125 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import warnings
import math
import torch
from torch import nn
import torch.nn.functional as F
from torch.nn.init import xavier_uniform_, constant_
from ..functions import MSDeformAttnFunction
from ..functions.ms_deform_attn_func import ms_deform_attn_core_pytorch
def _is_power_of_2(n):
if (not isinstance(n, int)) or (n < 0):
raise ValueError("invalid input for _is_power_of_2: {} (type: {})".format(n, type(n)))
return (n & (n-1) == 0) and n != 0
class MSDeformAttn(nn.Module):
def __init__(self, d_model=256, n_levels=4, n_heads=8, n_points=4):
"""
Multi-Scale Deformable Attention Module
:param d_model hidden dimension
:param n_levels number of feature levels
:param n_heads number of attention heads
:param n_points number of sampling points per attention head per feature level
"""
super().__init__()
if d_model % n_heads != 0:
raise ValueError('d_model must be divisible by n_heads, but got {} and {}'.format(d_model, n_heads))
_d_per_head = d_model // n_heads
# _d_per_head should ideally be a power of 2, which is more efficient in the CUDA implementation
if not _is_power_of_2(_d_per_head):
warnings.warn("Set d_model in MSDeformAttn so that the dimension of each attention head is a power of 2, "
"which is more efficient in the CUDA implementation.")
self.im2col_step = 128
self.d_model = d_model
self.n_levels = n_levels
self.n_heads = n_heads
self.n_points = n_points
self.sampling_offsets = nn.Linear(d_model, n_heads * n_levels * n_points * 2)
self.attention_weights = nn.Linear(d_model, n_heads * n_levels * n_points)
self.value_proj = nn.Linear(d_model, d_model)
self.output_proj = nn.Linear(d_model, d_model)
self._reset_parameters()
def _reset_parameters(self):
constant_(self.sampling_offsets.weight.data, 0.)
thetas = torch.arange(self.n_heads, dtype=torch.float32) * (2.0 * math.pi / self.n_heads)
grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
grid_init = (grid_init / grid_init.abs().max(-1, keepdim=True)[0]).view(self.n_heads, 1, 1, 2).repeat(1, self.n_levels, self.n_points, 1)
for i in range(self.n_points):
grid_init[:, :, i, :] *= i + 1
with torch.no_grad():
self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
constant_(self.attention_weights.weight.data, 0.)
constant_(self.attention_weights.bias.data, 0.)
xavier_uniform_(self.value_proj.weight.data)
constant_(self.value_proj.bias.data, 0.)
xavier_uniform_(self.output_proj.weight.data)
constant_(self.output_proj.bias.data, 0.)
def forward(self, query, reference_points, input_flatten, input_spatial_shapes, input_level_start_index, input_padding_mask=None):
"""
:param query (N, Length_{query}, C)
:param reference_points (N, Length_{query}, n_levels, 2), range in [0, 1], top-left (0,0), bottom-right (1, 1), including padding area
or (N, Length_{query}, n_levels, 4), add additional (w, h) to form reference boxes
:param input_flatten (N, \sum_{l=0}^{L-1} H_l \cdot W_l, C)
:param input_spatial_shapes (n_levels, 2), [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})]
:param input_level_start_index (n_levels, ), [0, H_0*W_0, H_0*W_0+H_1*W_1, H_0*W_0+H_1*W_1+H_2*W_2, ..., H_0*W_0+H_1*W_1+...+H_{L-1}*W_{L-1}]
:param input_padding_mask (N, \sum_{l=0}^{L-1} H_l \cdot W_l), True for padding elements, False for non-padding elements
:return output (N, Length_{query}, C)
"""
N, Len_q, _ = query.shape
N, Len_in, _ = input_flatten.shape
assert (input_spatial_shapes[:, 0] * input_spatial_shapes[:, 1]).sum() == Len_in
value = self.value_proj(input_flatten)
if input_padding_mask is not None:
value = value.masked_fill(input_padding_mask[..., None], float(0))
value = value.view(N, Len_in, self.n_heads, self.d_model // self.n_heads)
sampling_offsets = self.sampling_offsets(query).view(N, Len_q, self.n_heads, self.n_levels, self.n_points, 2)
attention_weights = self.attention_weights(query).view(N, Len_q, self.n_heads, self.n_levels * self.n_points)
attention_weights = F.softmax(attention_weights, -1).view(N, Len_q, self.n_heads, self.n_levels, self.n_points)
# N, Len_q, n_heads, n_levels, n_points, 2
if reference_points.shape[-1] == 2:
offset_normalizer = torch.stack([input_spatial_shapes[..., 1], input_spatial_shapes[..., 0]], -1)
sampling_locations = reference_points[:, :, None, :, None, :] \
+ sampling_offsets / offset_normalizer[None, None, None, :, None, :]
elif reference_points.shape[-1] == 4:
sampling_locations = reference_points[:, :, None, :, None, :2] \
+ sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
else:
raise ValueError(
'Last dim of reference_points must be 2 or 4, but get {} instead.'.format(reference_points.shape[-1]))
try:
output = MSDeformAttnFunction.apply(
value, input_spatial_shapes, input_level_start_index, sampling_locations, attention_weights, self.im2col_step)
except Exception:
# CUDA op unavailable: fall back to the pure-PyTorch reference implementation (CPU path).
output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
# # For FLOPs calculation only
# output = ms_deform_attn_core_pytorch(value, input_spatial_shapes, sampling_locations, attention_weights)
output = self.output_proj(output)
return output
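
A usage sketch for `MSDeformAttn`, assuming the CUDA op has been built with `mm_agents/ops/make.sh` and a GPU is available; the import path is relative to the `ops` directory.

```python
import torch
from modules import MSDeformAttn  # exported by modules/__init__.py

attn = MSDeformAttn(d_model=256, n_levels=2, n_heads=8, n_points=4).cuda()

N, Lq = 1, 5
spatial_shapes = torch.as_tensor([(16, 16), (8, 8)], dtype=torch.long).cuda()
level_start_index = torch.cat((spatial_shapes.new_zeros((1,)),
                               spatial_shapes.prod(1).cumsum(0)[:-1]))
Len_in = int(spatial_shapes.prod(1).sum())

query = torch.rand(N, Lq, 256).cuda()
input_flatten = torch.rand(N, Len_in, 256).cuda()
reference_points = torch.rand(N, Lq, 2, 2).cuda()  # (N, Length_query, n_levels, 2) in [0, 1]

out = attn(query, reference_points, input_flatten, spatial_shapes, level_start_index)
print(out.shape)  # torch.Size([1, 5, 256])
```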

78
mm_agents/ops/setup.py Normal file
View File

@@ -0,0 +1,78 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
import os
import glob
import torch
from torch.utils.cpp_extension import CUDA_HOME
from torch.utils.cpp_extension import CppExtension
from torch.utils.cpp_extension import CUDAExtension
from setuptools import find_packages
from setuptools import setup
requirements = ["torch", "torchvision"]
def get_extensions():
this_dir = os.path.dirname(os.path.abspath(__file__))
extensions_dir = os.path.join(this_dir, "src")
main_file = glob.glob(os.path.join(extensions_dir, "*.cpp"))
source_cpu = glob.glob(os.path.join(extensions_dir, "cpu", "*.cpp"))
source_cuda = glob.glob(os.path.join(extensions_dir, "cuda", "*.cu"))
sources = main_file + source_cpu
extension = CppExtension
extra_compile_args = {"cxx": []}
define_macros = []
# FORCE_CUDA builds the CUDA extension even when torch.cuda.is_available() is False (e.g. on build machines without a visible GPU).
if (os.environ.get('FORCE_CUDA') or torch.cuda.is_available()) and CUDA_HOME is not None:
extension = CUDAExtension
sources += source_cuda
define_macros += [("WITH_CUDA", None)]
extra_compile_args["nvcc"] = [
"-DCUDA_HAS_FP16=1",
"-D__CUDA_NO_HALF_OPERATORS__",
"-D__CUDA_NO_HALF_CONVERSIONS__",
"-D__CUDA_NO_HALF2_OPERATORS__",
]
else:
if CUDA_HOME is None:
raise NotImplementedError('CUDA_HOME is None. Please set environment variable CUDA_HOME.')
else:
raise NotImplementedError('No CUDA runtime is found. Please set FORCE_CUDA=1 or test it by running torch.cuda.is_available().')
sources = [os.path.join(extensions_dir, s) for s in sources]
include_dirs = [extensions_dir]
ext_modules = [
extension(
"MultiScaleDeformableAttention",
sources,
include_dirs=include_dirs,
define_macros=define_macros,
extra_compile_args=extra_compile_args,
)
]
return ext_modules
setup(
name="MultiScaleDeformableAttention",
version="1.0",
author="Weijie Su",
url="https://github.com/fundamentalvision/Deformable-DETR",
description="PyTorch Wrapper for CUDA Functions of Multi-Scale Deformable Attention",
packages=find_packages(exclude=("configs", "tests",)),
ext_modules=get_extensions(),
cmdclass={"build_ext": torch.utils.cpp_extension.BuildExtension},
)

View File

@@ -0,0 +1,46 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include <vector>
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ERROR("Not implement on cpu");
}

View File

@@ -0,0 +1,38 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include <torch/extension.h>
at::Tensor
ms_deform_attn_cpu_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor>
ms_deform_attn_cpu_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);

View File

@@ -0,0 +1,158 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include <vector>
#include "cuda/ms_deform_im2col_cuda.cuh"
#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <cuda.h>
#include <cuda_runtime.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must be divisible by im2col_step(%d)", batch, im2col_step_);
auto output = at::zeros({batch, num_query, num_heads, channels}, value.options());
const int batch_n = im2col_step_;
auto output_n = output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto columns = output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_forward_cuda", ([&] {
ms_deformable_im2col_cuda(at::cuda::getCurrentCUDAStream(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
columns.data<scalar_t>());
}));
}
output = output.view({batch, num_query, num_heads*channels});
return output;
}
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
AT_ASSERTM(value.is_contiguous(), "value tensor has to be contiguous");
AT_ASSERTM(spatial_shapes.is_contiguous(), "spatial_shapes tensor has to be contiguous");
AT_ASSERTM(level_start_index.is_contiguous(), "level_start_index tensor has to be contiguous");
AT_ASSERTM(sampling_loc.is_contiguous(), "sampling_loc tensor has to be contiguous");
AT_ASSERTM(attn_weight.is_contiguous(), "attn_weight tensor has to be contiguous");
AT_ASSERTM(grad_output.is_contiguous(), "grad_output tensor has to be contiguous");
AT_ASSERTM(value.type().is_cuda(), "value must be a CUDA tensor");
AT_ASSERTM(spatial_shapes.type().is_cuda(), "spatial_shapes must be a CUDA tensor");
AT_ASSERTM(level_start_index.type().is_cuda(), "level_start_index must be a CUDA tensor");
AT_ASSERTM(sampling_loc.type().is_cuda(), "sampling_loc must be a CUDA tensor");
AT_ASSERTM(attn_weight.type().is_cuda(), "attn_weight must be a CUDA tensor");
AT_ASSERTM(grad_output.type().is_cuda(), "grad_output must be a CUDA tensor");
const int batch = value.size(0);
const int spatial_size = value.size(1);
const int num_heads = value.size(2);
const int channels = value.size(3);
const int num_levels = spatial_shapes.size(0);
const int num_query = sampling_loc.size(1);
const int num_point = sampling_loc.size(4);
const int im2col_step_ = std::min(batch, im2col_step);
AT_ASSERTM(batch % im2col_step_ == 0, "batch(%d) must be divisible by im2col_step(%d)", batch, im2col_step_);
auto grad_value = at::zeros_like(value);
auto grad_sampling_loc = at::zeros_like(sampling_loc);
auto grad_attn_weight = at::zeros_like(attn_weight);
const int batch_n = im2col_step_;
auto per_value_size = spatial_size * num_heads * channels;
auto per_sample_loc_size = num_query * num_heads * num_levels * num_point * 2;
auto per_attn_weight_size = num_query * num_heads * num_levels * num_point;
auto grad_output_n = grad_output.view({batch/im2col_step_, batch_n, num_query, num_heads, channels});
for (int n = 0; n < batch/im2col_step_; ++n)
{
auto grad_output_g = grad_output_n.select(0, n);
AT_DISPATCH_FLOATING_TYPES(value.type(), "ms_deform_attn_backward_cuda", ([&] {
ms_deformable_col2im_cuda(at::cuda::getCurrentCUDAStream(),
grad_output_g.data<scalar_t>(),
value.data<scalar_t>() + n * im2col_step_ * per_value_size,
spatial_shapes.data<int64_t>(),
level_start_index.data<int64_t>(),
sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size,
batch_n, spatial_size, num_heads, channels, num_levels, num_query, num_point,
grad_value.data<scalar_t>() + n * im2col_step_ * per_value_size,
grad_sampling_loc.data<scalar_t>() + n * im2col_step_ * per_sample_loc_size,
grad_attn_weight.data<scalar_t>() + n * im2col_step_ * per_attn_weight_size);
}));
}
return {
grad_value, grad_sampling_loc, grad_attn_weight
};
}

View File

@@ -0,0 +1,35 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include <torch/extension.h>
at::Tensor ms_deform_attn_cuda_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step);
std::vector<at::Tensor> ms_deform_attn_cuda_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step);

File diff suppressed because it is too large.

View File

@@ -0,0 +1,67 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#pragma once
#include "cpu/ms_deform_attn_cpu.h"
#ifdef WITH_CUDA
#include "cuda/ms_deform_attn_cuda.h"
#endif
at::Tensor
ms_deform_attn_forward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_forward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}
std::vector<at::Tensor>
ms_deform_attn_backward(
const at::Tensor &value,
const at::Tensor &spatial_shapes,
const at::Tensor &level_start_index,
const at::Tensor &sampling_loc,
const at::Tensor &attn_weight,
const at::Tensor &grad_output,
const int im2col_step)
{
if (value.type().is_cuda())
{
#ifdef WITH_CUDA
return ms_deform_attn_cuda_backward(
value, spatial_shapes, level_start_index, sampling_loc, attn_weight, grad_output, im2col_step);
#else
AT_ERROR("Not compiled with GPU support");
#endif
}
AT_ERROR("Not implemented on the CPU");
}

View File

@@ -0,0 +1,21 @@
/*!
**************************************************************************************************
* Deformable DETR
* Copyright (c) 2020 SenseTime. All Rights Reserved.
* Licensed under the Apache License, Version 2.0 [see LICENSE for details]
**************************************************************************************************
* Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
**************************************************************************************************
*/
/*!
* Copyright (c) Facebook, Inc. and its affiliates.
* Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
*/
#include "ms_deform_attn.h"
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
m.def("ms_deform_attn_forward", &ms_deform_attn_forward, "ms_deform_attn_forward");
m.def("ms_deform_attn_backward", &ms_deform_attn_backward, "ms_deform_attn_backward");
}

92
mm_agents/ops/test.py Normal file
View File

@@ -0,0 +1,92 @@
# ------------------------------------------------------------------------------------------------
# Deformable DETR
# Copyright (c) 2020 SenseTime. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 [see LICENSE for details]
# ------------------------------------------------------------------------------------------------
# Modified from https://github.com/chengdazhi/Deformable-Convolution-V2-PyTorch/tree/pytorch_1.0.0
# ------------------------------------------------------------------------------------------------
# Copyright (c) Facebook, Inc. and its affiliates.
# Modified by Bowen Cheng from https://github.com/fundamentalvision/Deformable-DETR
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import time
import torch
import torch.nn as nn
from torch.autograd import gradcheck
from functions.ms_deform_attn_func import MSDeformAttnFunction, ms_deform_attn_core_pytorch
N, M, D = 1, 2, 2
Lq, L, P = 2, 2, 2
shapes = torch.as_tensor([(6, 4), (3, 2)], dtype=torch.long).cuda()
level_start_index = torch.cat((shapes.new_zeros((1, )), shapes.prod(1).cumsum(0)[:-1]))
S = sum([(H*W).item() for H, W in shapes])
torch.manual_seed(3)
@torch.no_grad()
def check_forward_equal_with_pytorch_double():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value.double(), shapes, sampling_locations.double(), attention_weights.double()).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_double: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
@torch.no_grad()
def check_forward_equal_with_pytorch_float():
value = torch.rand(N, S, M, D).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
output_pytorch = ms_deform_attn_core_pytorch(value, shapes, sampling_locations, attention_weights).detach().cpu()
output_cuda = MSDeformAttnFunction.apply(value, shapes, level_start_index, sampling_locations, attention_weights, im2col_step).detach().cpu()
fwdok = torch.allclose(output_cuda, output_pytorch, rtol=1e-2, atol=1e-3)
max_abs_err = (output_cuda - output_pytorch).abs().max()
max_rel_err = ((output_cuda - output_pytorch).abs() / output_pytorch.abs()).max()
print(f'* {fwdok} check_forward_equal_with_pytorch_float: max_abs_err {max_abs_err:.2e} max_rel_err {max_rel_err:.2e}')
def check_gradient_numerical(channels=4, grad_value=True, grad_sampling_loc=True, grad_attn_weight=True):
value = torch.rand(N, S, M, channels).cuda() * 0.01
sampling_locations = torch.rand(N, Lq, M, L, P, 2).cuda()
attention_weights = torch.rand(N, Lq, M, L, P).cuda() + 1e-5
attention_weights /= attention_weights.sum(-1, keepdim=True).sum(-2, keepdim=True)
im2col_step = 2
func = MSDeformAttnFunction.apply
value.requires_grad = grad_value
sampling_locations.requires_grad = grad_sampling_loc
attention_weights.requires_grad = grad_attn_weight
gradok = gradcheck(func, (value.double(), shapes, level_start_index, sampling_locations.double(), attention_weights.double(), im2col_step))
print(f'* {gradok} check_gradient_numerical(D={channels})')
if __name__ == '__main__':
check_forward_equal_with_pytorch_double()
check_forward_equal_with_pytorch_float()
for channels in [30, 32, 64, 71, 1025, 2048, 3096]:
check_gradient_numerical(channels, True, True, True)

Binary file not shown.

Before

Width: | Height: | Size: 1.0 MiB

View File

View File

@@ -0,0 +1,2 @@
from .inference_sam_m2m_auto import *
from .inference_sam_m2m_interactive import *

View File

@@ -0,0 +1,103 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from segment_anything import SamAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def inference_sam_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
mask_generator = SamAutomaticMaskGenerator(model)
outputs = mask_generator.generate(image_ori)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
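
A sketch of driving `inference_sam_m2m_auto` with the SAM checkpoint from the download script earlier in this change; the image path and output filename are placeholders.

```python
from PIL import Image
from segment_anything import sam_model_registry

# "vit_h" matches the sam_vit_h_4b8939.pth checkpoint downloaded above.
sam = sam_model_registry["vit_h"](checkpoint="sam_vit_h_4b8939.pth").cuda()
image = Image.open("desktop.png").convert("RGB")  # placeholder input image

annotated, anns = inference_sam_m2m_auto(sam, image, text_size=640)
Image.fromarray(annotated).save("desktop_annotated.png")
print(len(anns), "masks")
```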

View File

@@ -0,0 +1,221 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import torch.nn.functional as F
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
from kornia.contrib import distance_transform
import matplotlib.pyplot as plt
import cv2
import io
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything import SamAutomaticMaskGenerator
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def sam_interactive_mask(mask_generator, points, in_points, in_labels, mask_input):
masks, iou_preds, _ = mask_generator.predictor.predict_torch(
in_points,
in_labels,
mask_input=mask_input,
multimask_output=True,
return_logits=True,
)
nm,_,h,w = masks.shape
# Serialize predictions and store in MaskData
data = MaskData(
masks=masks.flatten(0, 1),
iou_preds=iou_preds.flatten(0, 1),
points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)),
)
del masks
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], mask_generator.predictor.model.mask_threshold, mask_generator.stability_score_offset
)
masks = data["masks"].reshape(nm, -1, h, w)
scores = (data['iou_preds'] + data['stability_score']).reshape(nm, -1)
index = torch.stack([torch.arange(nm).cuda(), scores.argmax(dim=1)]).tolist()
return masks[index]
def inference_sam_m2m_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
spatial_masks = spatial_masks[:, None].float().cuda()
spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
# generate single center point
# n,_,h,w = spatial_masks.shape
# mask_dt = (distance_transform((~F.pad(spatial_masks, pad=(1, 1, 1, 1), mode='constant', value=0)).float())[:,:,1:-1,1:-1]).reshape(n,-1)
# max_xy_idx = torch.stack([torch.arange(n), mask_dt.max(dim=-1)[1].cpu()]).tolist()
# next_mask = torch.zeros(spatial_masks.shape, device=torch.cuda.current_device()).bool()
# next_mask = next_mask.view(n,-1)
# next_mask[max_xy_idx] = True
# next_mask = next_mask.reshape((n,1,h,w))
# points = next_mask.nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
# stack sampled points
acc_points = []
for i in range(len(spatial_masks)):
points = spatial_masks[i:i+1].nonzero()[:,2:].flip(dims=[1]).cpu().numpy()
rand_ids = np.random.choice(points.shape[0], size=40, replace=True)
points = points[rand_ids]
acc_points.append(points)
_np = len(acc_points)
points = np.concatenate(acc_points)
mask_generator = SamAutomaticMaskGenerator(model)
mask_generator.predictor.set_image(image_ori)
im_size = image_ori.shape[:-1]
transformed_points = mask_generator.predictor.transform.apply_coords(points, im_size)
in_points = torch.as_tensor(transformed_points, device=mask_generator.predictor.device).reshape(_np,-1,2).transpose(0,1)
in_labels = torch.ones((in_points.shape[0], _np), dtype=torch.int, device=mask_generator.predictor.device)
masks = sam_interactive_mask(mask_generator, points, in_points.transpose(0,1), in_labels.transpose(0,1), None)
masks = masks > 0.0
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
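
The interactive path above turns each user-supplied spatial mask into point prompts by sampling pixel coordinates inside the mask. A minimal, self-contained sketch of that sampling step on synthetic masks (the 40-point count mirrors the loop above; the SAM predictor call is omitted since it depends on the loaded model):

```python
import numpy as np
import torch

# Two made-up boolean "spatial masks" standing in for user strokes on a 64x64 image.
spatial_masks = torch.zeros(2, 64, 64, dtype=torch.bool)
spatial_masks[0, 10:20, 10:20] = True
spatial_masks[1, 40:60, 5:25] = True

num_points = 40  # same count the function above samples per mask
acc_points = []
for i in range(len(spatial_masks)):
    # nonzero() yields (mask_idx, y, x); keep (y, x) and flip to (x, y) pixel coords
    points = spatial_masks[i:i + 1].nonzero()[:, 1:].flip(dims=[1]).numpy()
    rand_ids = np.random.choice(points.shape[0], size=num_points, replace=True)
    acc_points.append(points[rand_ids])

points = np.concatenate(acc_points)  # shape (80, 2), ready to use as point prompts
print(points.shape)
```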

View File

View File

@@ -0,0 +1,3 @@
from .interactive_seem_m2m_auto import *
from .inference_seem_pano import *
from .inference_seem_interactive import *

View File

@@ -0,0 +1,382 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
import torch.nn as nn
from torchvision.ops.boxes import batched_nms, box_area # type: ignore
from typing import Any, Dict, List, Optional, Tuple
from segment_anything.modeling import Sam
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
class SeemAutomaticMaskGenerator:
def __init__(
self,
model: Sam,
points_per_side: Optional[int] = 32,
points_per_batch: int = 64,
pred_iou_thresh: float = 0.9,
stability_score_thresh: float = 0.5,
stability_score_offset: float = 1.0,
box_nms_thresh: float = 0.7,
crop_n_layers: int = 0,
crop_nms_thresh: float = 0.7,
crop_overlap_ratio: float = 512 / 1500,
crop_n_points_downscale_factor: int = 1,
point_grids: Optional[List[np.ndarray]] = None,
min_mask_region_area: int = 0,
output_mode: str = "binary_mask",
) -> None:
"""
Using a SAM model, generates masks for the entire image.
Generates a grid of point prompts over the image, then filters
low quality and duplicate masks. The default settings are chosen
for SAM with a ViT-H backbone.
Arguments:
model (Sam): The SAM model to use for mask prediction.
points_per_side (int or None): The number of points to be sampled
along one side of the image. The total number of points is
points_per_side**2. If None, 'point_grids' must provide explicit
point sampling.
points_per_batch (int): Sets the number of points run simultaneously
by the model. Higher numbers may be faster but use more GPU memory.
pred_iou_thresh (float): A filtering threshold in [0,1], using the
model's predicted mask quality.
stability_score_thresh (float): A filtering threshold in [0,1], using
the stability of the mask under changes to the cutoff used to binarize
the model's mask predictions.
stability_score_offset (float): The amount to shift the cutoff when
calculating the stability score.
box_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks.
crop_n_layers (int): If >0, mask prediction will be run again on
crops of the image. Sets the number of layers to run, where each
layer has 2**i_layer number of image crops.
crop_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks between different crops.
crop_overlap_ratio (float): Sets the degree to which crops overlap.
In the first crop layer, crops will overlap by this fraction of
the image length. Later layers with more crops scale down this overlap.
crop_n_points_downscale_factor (int): The number of points-per-side
sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
point_grids (list(np.ndarray) or None): A list over explicit grids
of points used for sampling, normalized to [0,1]. The nth grid in the
list is used in the nth crop layer. Exclusive with points_per_side.
min_mask_region_area (int): If >0, postprocessing will be applied
to remove disconnected regions and holes in masks with area smaller
than min_mask_region_area. Requires opencv.
output_mode (str): The form masks are returned in. Can be 'binary_mask',
'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
For large resolutions, 'binary_mask' may consume large amounts of
memory.
"""
assert (points_per_side is None) != (
point_grids is None
), "Exactly one of points_per_side or point_grid must be provided."
if points_per_side is not None:
self.point_grids = build_all_layer_point_grids(
points_per_side,
crop_n_layers,
crop_n_points_downscale_factor,
)
elif point_grids is not None:
self.point_grids = point_grids
else:
raise ValueError("Can't have both points_per_side and point_grid be None.")
assert output_mode in [
"binary_mask",
"uncompressed_rle",
"coco_rle",
], f"Unknown output_mode {output_mode}."
if output_mode == "coco_rle":
from pycocotools import mask as mask_utils # type: ignore # noqa: F401
if min_mask_region_area > 0:
import cv2 # type: ignore # noqa: F401
self.predictor = model
self.points_per_batch = points_per_batch
self.pred_iou_thresh = pred_iou_thresh
self.stability_score_thresh = stability_score_thresh
self.stability_score_offset = stability_score_offset
self.box_nms_thresh = box_nms_thresh
self.crop_n_layers = crop_n_layers
self.crop_nms_thresh = crop_nms_thresh
self.crop_overlap_ratio = crop_overlap_ratio
self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
self.min_mask_region_area = min_mask_region_area
self.output_mode = output_mode
# dilate conv
self.dilation = nn.Conv2d(in_channels=1, out_channels=1, kernel_size=7, stride=1, padding=3, bias=False)
self.dilation.weight.data.fill_(1.0)
self.dilation.cuda()
@torch.no_grad()
def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
"""
Generates masks for the given image.
Arguments:
image (np.ndarray): The image to generate masks for, in HWC uint8 format.
Returns:
list(dict(str, any)): A list over records for masks. Each record is
a dict containing the following keys:
segmentation (dict(str, any) or np.ndarray): The mask. If
output_mode='binary_mask', is an array of shape HW. Otherwise,
is a dictionary containing the RLE.
bbox (list(float)): The box around the mask, in XYWH format.
area (int): The area in pixels of the mask.
predicted_iou (float): The model's own prediction of the mask's
quality. This is filtered by the pred_iou_thresh parameter.
point_coords (list(list(float))): The point coordinates input
to the model to generate this mask.
stability_score (float): A measure of the mask's quality. This
is filtered on using the stability_score_thresh parameter.
crop_box (list(float)): The crop of the image used to generate
the mask, given in XYWH format.
"""
# Generate masks
mask_data = self._generate_masks(image)
# Filter small disconnected regions and holes in masks
if self.min_mask_region_area > 0:
mask_data = self.postprocess_small_regions(
mask_data,
self.min_mask_region_area,
max(self.box_nms_thresh, self.crop_nms_thresh),
)
# Encode masks
if self.output_mode == "coco_rle":
mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
elif self.output_mode == "binary_mask":
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
else:
mask_data["segmentations"] = mask_data["rles"]
# Write mask records
curr_anns = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
curr_anns.append(ann)
return curr_anns
def _generate_masks(self, image: np.ndarray) -> MaskData:
orig_size = image.shape[-2:]
crop_boxes, layer_idxs = generate_crop_boxes(
orig_size, self.crop_n_layers, self.crop_overlap_ratio
)
# Iterate over image crops
data = MaskData()
for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
data.cat(crop_data)
# Remove duplicate masks between crops
if len(crop_boxes) > 1:
# Prefer masks from smaller crops
scores = 1 / box_area(data["crop_boxes"])
scores = scores.to(data["boxes"].device)
keep_by_nms = batched_nms(
data["boxes"].float(),
scores,
torch.zeros_like(data["boxes"][:, 0]), # categories
iou_threshold=self.crop_nms_thresh,
)
data.filter(keep_by_nms)
data.to_numpy()
return data
def _process_crop(
self,
image: np.ndarray,
crop_box: List[int],
crop_layer_idx: int,
orig_size: Tuple[int, ...],
) -> MaskData:
# Crop the image and calculate embeddings
x0, y0, x1, y1 = crop_box
cropped_im = image#[y0:y1, x0:x1, :]
cropped_im_size = cropped_im.shape[-2:]
# self.predictor.set_image(cropped_im)
# Get points for this crop
points_scale = np.array(cropped_im_size)[None, ::-1]
points_for_image = self.point_grids[crop_layer_idx] #* points_scale
# Generate masks for this crop in batches
data = MaskData()
self.enc_features=None
for (points,) in batch_iterator(self.points_per_batch, points_for_image):
batch_data = self._process_batch(cropped_im, points, cropped_im_size, crop_box, orig_size)
data.cat(batch_data)
del batch_data
# Remove duplicates within this crop.
keep_by_nms = batched_nms(
data["boxes"].float(),
data["iou_preds"],
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.box_nms_thresh,
)
data.filter(keep_by_nms)
# Return to the original image frame
data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
return data
def _process_batch(
self,
images,
points: np.ndarray,
im_size: Tuple[int, ...],
crop_box: List[int],
orig_size: Tuple[int, ...],
) -> MaskData:
orig_h, orig_w = orig_size
data = {"image": images, "height": orig_h, "width": orig_w}
points = torch.tensor(points,dtype=torch.float).to(images.device)
# prepare interactive mask for seem
abs_points = (points * torch.tensor(orig_size)[None,:].to(points.device)).long()
abs_masks = torch.zeros((len(points), orig_h, orig_w), dtype=torch.bool).to(device=points.device)
abs_masks[torch.arange(0, abs_points.size(0))[:,None], abs_points[:,0:1], abs_points[:,1:2]] = True
abs_masks = self.dilation(abs_masks[:,None].float())[:,0] > 0
data['spatial_query'] = {'rand_shape': abs_masks[:,None]}
batch_inputs = [data]
if self.enc_features is None:
masks, iou_preds, mask_features, transformer_encoder_features, multi_scale_features = self.predictor.model.evaluate_demo(batch_inputs, None, None, return_features=True)
self.enc_features = (mask_features, transformer_encoder_features, multi_scale_features)
else:
masks, iou_preds = self.predictor.model.evaluate_demo(batch_inputs, self.enc_features[0], self.enc_features[1], self.enc_features[2])
data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
del masks
# Filter by predicted IoU
if self.pred_iou_thresh > 0.0:
keep_mask = data["iou_preds"] > self.pred_iou_thresh
data.filter(keep_mask)
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], 0.0, self.stability_score_offset
)
if self.stability_score_thresh > 0.0:
keep_mask = data["stability_score"] >= self.stability_score_thresh
data.filter(keep_mask)
# Threshold masks and calculate boxes
data["masks"] = data["masks"] > 0.0
data["boxes"] = batched_mask_to_box(data["masks"])
# Filter boxes that touch crop boundaries
keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
if not torch.all(keep_mask):
data.filter(keep_mask)
# Compress to RLE
data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
data["rles"] = mask_to_rle_pytorch(data["masks"])
del data["masks"]
return data
@staticmethod
def postprocess_small_regions(
mask_data: MaskData, min_area: int, nms_thresh: float
) -> MaskData:
"""
Removes small disconnected regions and holes in masks, then reruns
box NMS to remove any new duplicates.
Edits mask_data in place.
Requires open-cv as a dependency.
"""
if len(mask_data["rles"]) == 0:
return mask_data
# Filter small disconnected regions and holes
new_masks = []
scores = []
for rle in mask_data["rles"]:
mask = rle_to_mask(rle)
mask, changed = remove_small_regions(mask, min_area, mode="holes")
unchanged = not changed
mask, changed = remove_small_regions(mask, min_area, mode="islands")
unchanged = unchanged and not changed
new_masks.append(torch.as_tensor(mask).unsqueeze(0))
# Give score=0 to changed masks and score=1 to unchanged masks
# so NMS will prefer ones that didn't need postprocessing
scores.append(float(unchanged))
# Recalculate boxes and remove any new duplicates
masks = torch.cat(new_masks, dim=0)
boxes = batched_mask_to_box(masks)
keep_by_nms = batched_nms(
boxes.float(),
torch.as_tensor(scores),
torch.zeros_like(boxes[:, 0]), # categories
iou_threshold=nms_thresh,
)
# Only recalculate RLEs for masks that have changed
for i_mask in keep_by_nms:
if scores[i_mask] == 0.0:
mask_torch = masks[i_mask].unsqueeze(0)
mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
mask_data.filter(keep_by_nms)
return mask_data
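
The generator above never inspects image content when choosing prompts: generate() walks a normalized point grid and feeds it to the model in fixed-size batches. A minimal sketch of that grid and batching machinery, assuming only that the segment_anything package is installed (no SEEM weights are needed for this part):

```python
from segment_anything.utils.amg import batch_iterator, build_all_layer_point_grids

# 32 points per side on the full image, no extra crop layers -> one grid of 1024 points
point_grids = build_all_layer_point_grids(32, 0, 1)
points_for_image = point_grids[0]          # (1024, 2), normalized to [0, 1]
print(points_for_image.shape, points_for_image.min(), points_for_image.max())

# The generator feeds these prompts to the model in chunks of points_per_batch
for (points,) in batch_iterator(64, points_for_image):
    print(points.shape)                    # (64, 2) per batch, 16 batches in total
    break
```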

View File

@@ -0,0 +1,169 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import torch.nn.functional as F
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def inference_seem_interactive(model, image, spatial_masks, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
data = {"image": images, "height": orig_h, "width": orig_w}
spatial_masks = spatial_masks[:, None].float().cuda()
spatial_masks = F.interpolate(spatial_masks, size=(orig_h, orig_w), mode='bicubic', align_corners=False) > 0
data['spatial_query'] = {'rand_shape': spatial_masks}
model.model.metadata = metadata
masks, _ = model.model.evaluate_demo([data])
masks = masks > 0.0
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
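
After the model call, the function above converts raw boolean masks into the record format handed back to the caller: boxes, areas, and uncompressed RLEs. A minimal, self-contained sketch of that conversion on synthetic masks, assuming the segment_anything package is installed (field names mirror the records built above):

```python
import torch
from segment_anything.utils.amg import (
    area_from_rle, batched_mask_to_box, box_xyxy_to_xywh,
    mask_to_rle_pytorch, rle_to_mask,
)

# Three synthetic boolean masks standing in for model output on a 96x128 image.
masks = torch.zeros(3, 96, 128, dtype=torch.bool)
masks[0, 10:30, 20:60] = True
masks[1, 50:90, 5:40] = True
masks[2, 40:60, 80:120] = True

boxes = batched_mask_to_box(masks)              # XYXY boxes, one per mask
rles = mask_to_rle_pytorch(masks)               # uncompressed RLE dicts
records = [
    {
        "segmentation": rle_to_mask(rle),       # back to an HxW binary array
        "area": area_from_rle(rle),
        "bbox": box_xyxy_to_xywh(box).tolist(), # XYWH, as in the records above
    }
    for rle, box in zip(rles, boxes)
]
print([r["area"] for r in records], records[0]["bbox"])
```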

View File

@@ -0,0 +1,164 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
from segment_anything.utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def inference_seem_pano(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
orig_size = images.shape[-2:]
orig_h, orig_w = orig_size
crop_box = [0,0,orig_w,orig_h]
data = {"image": images, "height": orig_h, "width": orig_w}
batch_inputs = [data]
model.model.metadata = metadata
outputs = model.model.evaluate(batch_inputs)
pano_mask = outputs[0]['panoptic_seg'][0]
pano_info = outputs[0]['panoptic_seg'][1]
masks = []
for seg_info in pano_info:
masks += [pano_mask == seg_info['id']]
masks = torch.stack(masks, dim=0)
iou_preds = torch.ones(masks.shape[0], dtype=torch.float32)
points = torch.zeros((masks.shape[0], 2), dtype=torch.float32)
mask_data = MaskData(
masks=masks,
iou_preds=iou_preds,
points=points,
)
mask_data["stability_score"] = torch.ones(masks.shape[0], dtype=torch.float32)
del masks
mask_data["boxes"] = batched_mask_to_box(mask_data["masks"])
mask_data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(mask_data["boxes"]))])
# Compress to RLE
mask_data["masks"] = uncrop_masks(mask_data["masks"], crop_box, orig_h, orig_w)
mask_data["rles"] = mask_to_rle_pytorch(mask_data["masks"])
del mask_data["masks"]
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
# Write mask records
outputs = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
outputs.append(ann)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
# create a full zero image as the image_orig
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
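
The panoptic path above splits the single id map returned by evaluate() into one boolean mask per segment before reusing the same record-building code. A minimal sketch of that split on a toy id map (the 'id' field mirrors the panoptic metadata entries above; no model is involved):

```python
import torch

# A toy panoptic id map: 0 = background, 1 and 2 are two segments.
pano_mask = torch.zeros(64, 64, dtype=torch.long)
pano_mask[5:25, 5:25] = 1
pano_mask[30:60, 30:50] = 2
pano_info = [{"id": 1}, {"id": 2}]   # stand-in for the per-segment metadata above

masks = torch.stack([pano_mask == seg_info["id"] for seg_info in pano_info], dim=0)
print(masks.shape, masks.sum(dim=(1, 2)))   # (2, 64, 64) and the per-segment areas
```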

View File

@@ -0,0 +1,93 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SeemAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_seem_m2m_auto(model, image, text_size, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
mask_generator = SeemAutomaticMaskGenerator(model)
outputs = mask_generator.generate(images)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
for ann in sorted_anns:
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))

View File

@@ -0,0 +1,6 @@
from .interactive_idino_m2m import interactive_infer_image as interactive_infer_image_idino_m2m
from .interactive_idino_m2m import interactive_infer_image_semantic, interactive_infer_image_3l
from .inference_semsam_m2m_auto import inference_semsam_m2m_auto
from .interactive_idino_1o1_box import interactive_infer_image_box as interactive_infer_image_idino_m2m_box
from .automatic_mask_generator import prompt_switch
from .interactive_predictor import SemanticSAMPredictor

View File

@@ -0,0 +1,393 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
import numpy as np
import torch
from torchvision.ops.boxes import batched_nms, box_area # type: ignore
from typing import Any, Dict, List, Optional, Tuple
# from
# from .modeling import Sam
# from .predictor import SamPredictor
from semantic_sam.utils.sam_utils.amg import (
MaskData,
area_from_rle,
batch_iterator,
batched_mask_to_box,
box_xyxy_to_xywh,
build_all_layer_point_grids,
calculate_stability_score,
coco_encode_rle,
generate_crop_boxes,
is_box_near_crop_edge,
mask_to_rle_pytorch,
remove_small_regions,
rle_to_mask,
uncrop_boxes_xyxy,
uncrop_masks,
uncrop_points,
)
def prompt_switch(p):
p = int(p)
if p == 1:
return 3
if p == 2:
return 2
if p == 3:
return 0
if p == 4:
return 4
if p == 5:
return 1
if p == 6:
return 5
else:
raise NotImplementedError
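# The mapping above translates the user-facing granularity levels 1..6 into the internal
# level indices passed to model.evaluate_demo, i.e. [1, 2, 3, 4, 5, 6] -> [3, 2, 0, 4, 1, 5].
# SemanticSamAutomaticMaskGenerator applies it once per requested level
# (self.level = [prompt_switch(l) for l in level]); any other value raises NotImplementedError.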
class SemanticSamAutomaticMaskGenerator:
def __init__(
self,
model,
points_per_side: Optional[int] = 32,
points_per_batch: int = 200,
pred_iou_thresh: float = 0.88,
stability_score_thresh: float = 0.92,
stability_score_offset: float = 1.0,
box_nms_thresh: float = 0.7,
crop_n_layers: int = 0,
crop_nms_thresh: float = 0.7,
crop_overlap_ratio: float = 512 / 1500,
crop_n_points_downscale_factor: int = 1,
point_grids: Optional[List[np.ndarray]] = None,
min_mask_region_area: int = 10,
output_mode: str = "binary_mask",
level: list = [1, 2, 3, 4, 5, 6],
) -> None:
"""
Using a SAM model, generates masks for the entire image.
Generates a grid of point prompts over the image, then filters
low quality and duplicate masks. The default settings are chosen
for SAM with a ViT-H backbone.
Arguments:
model (Sam): The SAM model to use for mask prediction.
points_per_side (int or None): The number of points to be sampled
along one side of the image. The total number of points is
points_per_side**2. If None, 'point_grids' must provide explicit
point sampling.
points_per_batch (int): Sets the number of points run simultaneously
by the model. Higher numbers may be faster but use more GPU memory.
pred_iou_thresh (float): A filtering threshold in [0,1], using the
model's predicted mask quality.
stability_score_thresh (float): A filtering threshold in [0,1], using
the stability of the mask under changes to the cutoff used to binarize
the model's mask predictions.
stability_score_offset (float): The amount to shift the cutoff when
calculating the stability score.
box_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks.
crop_n_layers (int): If >0, mask prediction will be run again on
crops of the image. Sets the number of layers to run, where each
layer has 2**i_layer number of image crops.
crop_nms_thresh (float): The box IoU cutoff used by non-maximal
suppression to filter duplicate masks between different crops.
crop_overlap_ratio (float): Sets the degree to which crops overlap.
In the first crop layer, crops will overlap by this fraction of
the image length. Later layers with more crops scale down this overlap.
crop_n_points_downscale_factor (int): The number of points-per-side
sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
point_grids (list(np.ndarray) or None): A list over explicit grids
of points used for sampling, normalized to [0,1]. The nth grid in the
list is used in the nth crop layer. Exclusive with points_per_side.
min_mask_region_area (int): If >0, postprocessing will be applied
to remove disconnected regions and holes in masks with area smaller
than min_mask_region_area. Requires opencv.
output_mode (str): The form masks are returned in. Can be 'binary_mask',
'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
For large resolutions, 'binary_mask' may consume large amounts of
memory.
"""
self.level = [prompt_switch(l) for l in level]
assert (points_per_side is None) != (
point_grids is None
), "Exactly one of points_per_side or point_grid must be provided."
if points_per_side is not None:
self.point_grids = build_all_layer_point_grids(
points_per_side,
crop_n_layers,
crop_n_points_downscale_factor,
)
elif point_grids is not None:
self.point_grids = point_grids
else:
raise ValueError("Can't have both points_per_side and point_grid be None.")
assert output_mode in [
"binary_mask",
"uncompressed_rle",
"coco_rle",
], f"Unknown output_mode {output_mode}."
if output_mode == "coco_rle":
from pycocotools import mask as mask_utils # type: ignore # noqa: F401
if min_mask_region_area > 0:
import cv2 # type: ignore # noqa: F401
self.predictor = model
self.points_per_batch = points_per_batch
self.pred_iou_thresh = pred_iou_thresh
self.stability_score_thresh = stability_score_thresh
self.stability_score_offset = stability_score_offset
self.box_nms_thresh = box_nms_thresh
self.crop_n_layers = crop_n_layers
self.crop_nms_thresh = crop_nms_thresh
self.crop_overlap_ratio = crop_overlap_ratio
self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
self.min_mask_region_area = min_mask_region_area
self.output_mode = output_mode
@torch.no_grad()
def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
"""
Generates masks for the given image.
Arguments:
image (np.ndarray): The image to generate masks for, in HWC uint8 format.
Returns:
list(dict(str, any)): A list over records for masks. Each record is
a dict containing the following keys:
segmentation (dict(str, any) or np.ndarray): The mask. If
output_mode='binary_mask', is an array of shape HW. Otherwise,
is a dictionary containing the RLE.
bbox (list(float)): The box around the mask, in XYWH format.
area (int): The area in pixels of the mask.
predicted_iou (float): The model's own prediction of the mask's
quality. This is filtered by the pred_iou_thresh parameter.
point_coords (list(list(float))): The point coordinates input
to the model to generate this mask.
stability_score (float): A measure of the mask's quality. This
is filtered on using the stability_score_thresh parameter.
crop_box (list(float)): The crop of the image used to generate
the mask, given in XYWH format.
"""
# Generate masks
mask_data = self._generate_masks(image)
# Filter small disconnected regions and holes in masks
if self.min_mask_region_area > 0:
mask_data = self.postprocess_small_regions(
mask_data,
self.min_mask_region_area,
max(self.box_nms_thresh, self.crop_nms_thresh),
)
# Encode masks
if self.output_mode == "coco_rle":
mask_data["segmentations"] = [coco_encode_rle(rle) for rle in mask_data["rles"]]
elif self.output_mode == "binary_mask":
mask_data["segmentations"] = [rle_to_mask(rle) for rle in mask_data["rles"]]
else:
mask_data["segmentations"] = mask_data["rles"]
# Write mask records
curr_anns = []
for idx in range(len(mask_data["segmentations"])):
ann = {
"segmentation": mask_data["segmentations"][idx],
"area": area_from_rle(mask_data["rles"][idx]),
"bbox": box_xyxy_to_xywh(mask_data["boxes"][idx]).tolist(),
"predicted_iou": mask_data["iou_preds"][idx].item(),
"point_coords": [mask_data["points"][idx].tolist()],
"stability_score": mask_data["stability_score"][idx].item(),
"crop_box": box_xyxy_to_xywh(mask_data["crop_boxes"][idx]).tolist(),
}
curr_anns.append(ann)
return curr_anns
def _generate_masks(self, image: np.ndarray) -> MaskData:
orig_size = image.shape[-2:]
crop_boxes, layer_idxs = generate_crop_boxes(
orig_size, self.crop_n_layers, self.crop_overlap_ratio
)
# Iterate over image crops
assert len(crop_boxes)==1
data = MaskData()
# import ipdb; ipdb.set_trace()
for crop_box, layer_idx in zip(crop_boxes, layer_idxs):
crop_data = self._process_crop(image, crop_box, layer_idx, orig_size)
data.cat(crop_data)
# import ipdb; ipdb.set_trace()
# Remove duplicate masks between crops
if len(crop_boxes) > 1:
# Prefer masks from smaller crops
scores = 1 / box_area(data["crop_boxes"])
scores = scores.to(data["boxes"].device)
keep_by_nms = batched_nms(
data["boxes"].float(),
scores,
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.crop_nms_thresh,
)
data.filter(keep_by_nms)
data.to_numpy()
return data
def _process_crop(
self,
image: np.ndarray,
crop_box: List[int],
crop_layer_idx: int,
orig_size: Tuple[int, ...],
) -> MaskData:
# Crop the image and calculate embeddings
x0, y0, x1, y1 = crop_box
cropped_im = image#[y0:y1, x0:x1, :]
cropped_im_size = cropped_im.shape[-2:]
# self.predictor.set_image(cropped_im)
# Get points for this crop
points_scale = np.array(cropped_im_size)[None, ::-1]
points_for_image = self.point_grids[crop_layer_idx] #* points_scale
# Generate masks for this crop in batches
data = MaskData()
self.enc_features=None
# import ipdb; ipdb.set_trace()
for (points,) in batch_iterator(self.points_per_batch, points_for_image):
batch_data = self._process_batch(cropped_im,points, cropped_im_size, crop_box, orig_size)
data.cat(batch_data)
del batch_data
keep_by_nms = batched_nms(
data["boxes"].float(),
data["iou_preds"],
torch.zeros(len(data["boxes"])), # categories
iou_threshold=self.box_nms_thresh,
)
# import ipdb; ipdb.set_trace()
data.filter(keep_by_nms)
# import ipdb; ipdb.set_trace()
# Return to the original image frame
data["boxes"] = uncrop_boxes_xyxy(data["boxes"], crop_box)
data["crop_boxes"] = torch.tensor([crop_box for _ in range(len(data["rles"]))])
return data
def _process_batch(
self,
images,
points: np.ndarray,
im_size: Tuple[int, ...],
crop_box: List[int],
orig_size: Tuple[int, ...],
) -> MaskData:
orig_h, orig_w = orig_size
data = {"image": images, "height": orig_h, "width": orig_w}
points=torch.tensor(points,dtype=torch.float).to(images.device)
points = torch.cat([points, points.new_tensor([[0.005, 0.005]]).repeat(len(points), 1)], dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=points
data['targets'][0]['pb']=points.new_tensor([0.]*len(points))
batch_inputs = [data]
if self.enc_features is None:
masks, iou_preds,mask_features,multi_scale_features= self.predictor.model.evaluate_demo(batch_inputs,None,None,return_features=True, level=self.level)
self.enc_features=(mask_features,multi_scale_features)
else:
masks, iou_preds= self.predictor.model.evaluate_demo(batch_inputs,None,None,self.enc_features[0],self.enc_features[1], level=self.level)
data = MaskData(
masks=masks,
iou_preds=iou_preds.flatten(),
points=torch.as_tensor(points[:,None].repeat(1,len(self.level), 1).view(-1,4)),
)
del masks
# Filter by predicted IoU
keep_mask = data["iou_preds"] > self.pred_iou_thresh
data.filter(keep_mask)
# Calculate stability score
data["stability_score"] = calculate_stability_score(
data["masks"], 0.0, self.stability_score_offset
)
# if self.stability_score_thresh > 0.0:
keep_mask = data["stability_score"] >= self.stability_score_thresh
data.filter(keep_mask)
# Threshold masks and calculate boxes
data["masks"] = data["masks"] > 0.0
data["boxes"] = batched_mask_to_box(data["masks"])
# Filter boxes that touch crop boundaries
keep_mask = ~is_box_near_crop_edge(data["boxes"], crop_box, [0, 0, orig_w, orig_h])
if not torch.all(keep_mask):
data.filter(keep_mask)
# Compress to RLE
data["masks"] = uncrop_masks(data["masks"], crop_box, orig_h, orig_w)
data["rles"] = mask_to_rle_pytorch(data["masks"])
del data["masks"]
return data
@staticmethod
def postprocess_small_regions(
mask_data: MaskData, min_area: int, nms_thresh: float
) -> MaskData:
"""
Removes small disconnected regions and holes in masks, then reruns
box NMS to remove any new duplicates.
Edits mask_data in place.
Requires open-cv as a dependency.
"""
if len(mask_data["rles"]) == 0:
return mask_data
# Filter small disconnected regions and holes
new_masks = []
scores = []
for rle in mask_data["rles"]:
mask = rle_to_mask(rle)
mask, changed = remove_small_regions(mask, min_area, mode="holes")
unchanged = not changed
mask, changed = remove_small_regions(mask, min_area, mode="islands")
unchanged = unchanged and not changed
new_masks.append(torch.as_tensor(mask).unsqueeze(0))
# Give score=0 to changed masks and score=1 to unchanged masks
# so NMS will prefer ones that didn't need postprocessing
scores.append(float(unchanged))
# Recalculate boxes and remove any new duplicates
masks = torch.cat(new_masks, dim=0)
boxes = batched_mask_to_box(masks)
keep_by_nms = batched_nms(
boxes.float(),
torch.as_tensor(scores),
torch.zeros(len(boxes)), # categories
iou_threshold=nms_thresh,
)
# Only recalculate RLEs for masks that have changed
for i_mask in keep_by_nms:
if scores[i_mask] == 0.0:
mask_torch = masks[i_mask].unsqueeze(0)
mask_data["rles"][i_mask] = mask_to_rle_pytorch(mask_torch)[0]
mask_data["boxes"][i_mask] = boxes[i_mask] # update res directly
mask_data.filter(keep_by_nms)
return mask_data
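
Within each crop, duplicate proposals are removed with class-agnostic box NMS: every box is given category 0, so the IoU threshold is the only criterion. A minimal sketch of that deduplication with made-up boxes, assuming torch and torchvision are installed:

```python
import torch
from torchvision.ops.boxes import batched_nms

# Three candidate boxes; the first two overlap heavily and should be deduplicated.
boxes = torch.tensor([[10., 10., 50., 50.],
                      [12., 11., 52., 49.],
                      [60., 60., 90., 90.]])
iou_preds = torch.tensor([0.95, 0.80, 0.90])

keep = batched_nms(
    boxes.float(),
    iou_preds,
    torch.zeros(len(boxes)),   # a single category, exactly as in _process_crop
    iou_threshold=0.7,
)
print(keep)   # indices of the surviving boxes, e.g. tensor([0, 2])
```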

View File

@@ -0,0 +1,108 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
import matplotlib.pyplot as plt
import cv2
import io
from .automatic_mask_generator import SemanticSamAutomaticMaskGenerator
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def inference_semsam_m2m_auto(model, image, level, all_classes, all_parts, thresh, text_size, hole_scale, island_scale, semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image)
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
mask_generator = SemanticSamAutomaticMaskGenerator(model,points_per_side=32,
pred_iou_thresh=0.88,
stability_score_thresh=0.92,
min_mask_region_area=10,
level=level,
)
outputs = mask_generator.generate(images)
from task_adapter.utils.visualizer import Visualizer
visual = Visualizer(image_ori, metadata=metadata)
sorted_anns = sorted(outputs, key=(lambda x: x['area']), reverse=True)
label = 1
# for ann in sorted_anns:
# mask = ann['segmentation']
# color_mask = np.random.random((1, 3)).tolist()[0]
# # color_mask = [int(c*255) for c in color_mask]
# demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# label += 1
# im = demo.get_image()
mask_map = np.zeros(image_ori.shape, dtype=np.uint8)
for i, ann in enumerate(sorted_anns):
mask = ann['segmentation']
color_mask = np.random.random((1, 3)).tolist()[0]
# color_mask = [int(c*255) for c in color_mask]
demo = visual.draw_binary_mask_with_number(mask, text=str(label), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)
# assign the mask to the mask_map
mask_map[mask == 1] = label
label += 1
im = demo.get_image()
# fig=plt.figure(figsize=(10, 10))
# plt.imshow(image_ori)
# show_anns(outputs)
# fig.canvas.draw()
# im=Image.frombytes('RGB', fig.canvas.get_width_height(), fig.canvas.tostring_rgb())
return im, sorted_anns
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
def show_anns(anns):
if len(anns) == 0:
return
sorted_anns = sorted(anns, key=(lambda x: x['area']), reverse=True)
ax = plt.gca()
ax.set_autoscale_on(False)
polygons = []
color = []
for ann in sorted_anns:
m = ann['segmentation']
img = np.ones((m.shape[0], m.shape[1], 3))
color_mask = np.random.random((1, 3)).tolist()[0]
for i in range(3):
img[:,:,i] = color_mask[i]
ax.imshow(np.dstack((img, m*0.35)))
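
The visualization loop above sorts annotations by area (largest first) and burns sequential labels into a label map, so smaller masks overwrite larger ones and stay visible. A minimal sketch of that bookkeeping on synthetic annotations, simplified to a single-channel label map (the original writes into an image-shaped uint8 array):

```python
import numpy as np

# Two synthetic annotations standing in for mask_generator.generate() output.
h, w = 64, 64
seg_a = np.zeros((h, w), dtype=bool); seg_a[5:60, 5:60] = True     # large mask
seg_b = np.zeros((h, w), dtype=bool); seg_b[10:20, 10:20] = True   # small mask
outputs = [{"segmentation": seg_b, "area": int(seg_b.sum())},
           {"segmentation": seg_a, "area": int(seg_a.sum())}]

# Largest mask first, so later (smaller) masks overwrite it in the label map.
sorted_anns = sorted(outputs, key=(lambda x: x["area"]), reverse=True)
mask_map = np.zeros((h, w), dtype=np.uint8)
for label, ann in enumerate(sorted_anns, start=1):
    mask_map[ann["segmentation"]] = label
print(np.unique(mask_map))   # [0 1 2]: background plus the two numbered regions
```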

View File

@@ -0,0 +1,144 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
from detectron2.structures import BitMasks
from semantic_sam.utils import box_ops
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_infer_image_box(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
flaten_mask = mask_ori.unsqueeze(0)
# import ipdb; ipdb.set_trace()
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.5,0.5]])
else:
mean_point=points.mean(0)[None]
box_xyxy = BitMasks(flaten_mask > 0).get_bounding_boxes().tensor
h = mask_ori.shape[0]
w = mask_ori.shape[1]
box_xywh = (box_ops.box_xyxy_to_cxcywh(box_xyxy) / torch.as_tensor([w, h, w, h])).cuda()
# point_=points.mean(0)[None]
# point=point_.clone()
# point[0, 0] = point_[0, 0] / mask_ori.shape[0]
# point[0, 1] = point_[0, 1] / mask_ori.shape[1]
# point = point[:, [1, 0]]
point=box_xywh
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([1.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, task='demo_box')
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
mask = mask.astype(float)
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
demo = visual.draw_box(box_xyxy[0])
res = demo.get_image()
# point_x0=max(0,int(point_[0, 1])-3)
# point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
# point_y0 = max(0, int(point_[0, 0]) - 3)
# point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
# res[point_y0:point_y1,point_x0:point_x1,0]=255
# res[point_y0:point_y1,point_x0:point_x1,1]=0
# res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
Removes small disconnected regions and holes in a mask. Returns the
mask and an indicator of whether the mask has been modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
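
The box-prompt path above derives one bounding box from the user's brush mask (via detectron2's BitMasks) and normalizes it to center-x/center-y/width/height before passing it to the model. A dependency-light sketch of the same geometry in plain torch; BitMasks and semantic_sam's box_ops are replaced by explicit arithmetic purely for illustration, so edge conventions may differ by a pixel:

```python
import torch

h, w = 120, 160
mask = torch.zeros(h, w, dtype=torch.bool)
mask[30:70, 40:100] = True                  # a synthetic brush stroke

ys, xs = mask.nonzero(as_tuple=True)
box_xyxy = torch.tensor([xs.min(), ys.min(), xs.max(), ys.max()], dtype=torch.float)

# xyxy -> normalized cxcywh, mirroring box_ops.box_xyxy_to_cxcywh(...) / [w, h, w, h]
cx = (box_xyxy[0] + box_xyxy[2]) / 2 / w
cy = (box_xyxy[1] + box_xyxy[3]) / 2 / h
bw = (box_xyxy[2] - box_xyxy[0]) / w
bh = (box_xyxy[3] - box_xyxy[1]) / h
box_cxcywh = torch.stack([cx, cy, bw, bh])
print(box_cxcywh)                           # roughly [0.43, 0.41, 0.37, 0.33]
```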

View File

@@ -0,0 +1,322 @@
# --------------------------------------------------------
# Semantic-SAM: Segment and Recognize Anything at Any Granularity
# Copyright (c) 2023 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Hao Zhang (hzhangcx@connect.ust.hk)
# --------------------------------------------------------
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
def interactive_infer_image(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None, label_mode='1', alpha=0.1, anno_mode=['Mask']):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([0.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts)
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
mask = mask.astype(float)
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
# demo = visual.draw_binary_mask(mask, color=color, text=texts)
demo = visual.draw_binary_mask_with_number(mask, text=str(len(mask_ls)), label_mode=label_mode, alpha=alpha, anno_mode=anno_mode)  # number each kept mask sequentially
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
# res[point_y0:point_y1,point_x0:point_x1,0]=255
# res[point_y0:point_y1,point_x0:point_x1,1]=0
# res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def interactive_infer_image_3l(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
data['targets'][0]['pb']=point.new_tensor([0.])
batch_inputs = [data]
masks, ious, pred_class, pred_class_score = model.model.evaluate_demo(batch_inputs,all_classes,all_parts, level=[0,1,2])
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
new_pred_class = []
new_pred_class_score = []
for i in ids:
new_pred_class_score.append(pred_class_score[i])
new_pred_class.append(pred_class[i])
for i,(pred_masks_pos,iou, cls_name, cls_score) in enumerate(zip(pred_masks_poses[ids],ious[ids], new_pred_class, new_pred_class_score)):
iou=round(float(iou),2)
texts=f'{iou}_{cls_name}_{cls_score}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
        mask=mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
res[point_y0:point_y1,point_x0:point_x1,0]=255
res[point_y0:point_y1,point_x0:point_x1,1]=0
res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def interactive_infer_image_semantic(model, image,all_classes,all_parts, thresh,text_size,hole_scale,island_scale,semantic, refimg=None, reftxt=None, audio_pth=None, video_pth=None):
t = []
t.append(transforms.Resize(int(text_size), interpolation=Image.BICUBIC))
transform1 = transforms.Compose(t)
image_ori = transform1(image['image'])
mask_ori = transform1(image['mask'])
width = image_ori.size[0]
height = image_ori.size[1]
image_ori = np.asarray(image_ori)
images = torch.from_numpy(image_ori.copy()).permute(2,0,1).cuda()
all_classes, all_parts=all_classes.strip().strip("\"[]").split(':'),all_parts.strip().strip("\"[]").split(':')
data = {"image": images, "height": height, "width": width}
mask_ori = np.asarray(mask_ori)[:,:,0:1].copy()
mask_ori = torch.from_numpy(mask_ori).permute(2,0,1)[0]
points=mask_ori.nonzero().float().to(images.device)
if len(points)==0:
point_=point=points.new_tensor([[0.5,0.5,0.006,0.006]])
else:
point_=points.mean(0)[None]
point=point_.clone()
point[0, 0] = point_[0, 0] / mask_ori.shape[0]
point[0, 1] = point_[0, 1] / mask_ori.shape[1]
point = point[:, [1, 0]]
point=torch.cat([point,points.new_tensor([[0.005,0.005]])],dim=-1)
data['targets'] = [dict()]
data['targets'][0]['points']=point
    data['targets'][0]['pb']=point.new_tensor([1.])
batch_inputs = [data]
masks,ious = model.model.evaluate_demo(batch_inputs,all_classes,all_parts)
pred_masks_poses = masks
reses=[]
ious=ious[0,0]
ids=torch.argsort(ious,descending=True)
text_res=''
try:
thresh=float(thresh)
except Exception:
thresh=0.0
mask_ls=[]
ious_res=[]
areas=[]
for i,(pred_masks_pos,iou) in enumerate(zip(pred_masks_poses[ids],ious[ids])):
iou=round(float(iou),2)
texts=f'{iou}'
mask=(pred_masks_pos>0.0).cpu().numpy()
area=mask.sum()
conti=False
if iou<thresh:
conti=True
for m in mask_ls:
if np.logical_and(mask,m).sum()/np.logical_or(mask,m).sum()>0.95:
conti=True
break
if i == len(pred_masks_poses[ids])-1 and mask_ls==[]:
conti=False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask,_=remove_small_regions(mask,int(hole_scale),mode="holes")
mask,_=remove_small_regions(mask,int(island_scale),mode="islands")
        mask=mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color=[0.,0.,1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0=max(0,int(point_[0, 1])-3)
point_x1=min(mask_ori.shape[1],int(point_[0, 1])+3)
point_y0 = max(0, int(point_[0, 0]) - 3)
point_y1 = min(mask_ori.shape[0], int(point_[0, 0]) + 3)
res[point_y0:point_y1,point_x0:point_x1,0]=255
res[point_y0:point_y1,point_x0:point_x1,1]=0
res[point_y0:point_y1,point_x0:point_x1,2]=0
reses.append(Image.fromarray(res))
text_res=text_res+';'+out_txt
ids=list(torch.argsort(torch.tensor(areas),descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses,[reses[i] for i in ids]
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
    Removes small disconnected regions and holes in a mask. Returns the
    cleaned mask and a flag indicating whether the mask was modified; a short
    usage sketch follows this function.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
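
# Minimal usage sketch: remove_small_regions is applied twice, first with
# mode="holes" to fill small holes and then with mode="islands" to drop small
# isolated blobs, mirroring the calls in the inference loops above. The toy mask
# and the 100-pixel area threshold below are illustrative values only.
if __name__ == "__main__":
    _toy = np.zeros((256, 256), dtype=bool)
    _toy[64:192, 64:192] = True      # one large foreground region
    _toy[100:104, 100:104] = False   # a small hole inside it
    _toy[4:8, 4:8] = True            # a small isolated island
    _toy, _ = remove_small_regions(_toy, 100, mode="holes")    # fills the small hole
    _toy, _ = remove_small_regions(_toy, 100, mode="islands")  # removes the small island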

View File

@@ -0,0 +1,139 @@
import torch
import numpy as np
from torchvision import transforms
from task_adapter.utils.visualizer import Visualizer
from typing import Tuple
from PIL import Image
from detectron2.data import MetadataCatalog
metadata = MetadataCatalog.get('coco_2017_train_panoptic')
class SemanticSAMPredictor:
def __init__(self, model, thresh=0.5, text_size=640, hole_scale=100, island_scale=100):
"""
        thresh: IoU threshold used to filter out low-confidence masks
        text_size: the input image's short edge is resized to this size before inference
        hole_scale: holes smaller than this area (in pixels) are filled in, as in SAM
        island_scale: isolated regions smaller than this area (in pixels) are removed, as in SAM
        (a usage sketch follows at the end of this file)
"""
self.model = model
self.thresh = thresh
        self.text_size = text_size
self.hole_scale = hole_scale
self.island_scale = island_scale
self.point = None
def predict(self, image_ori, image, point=None):
"""
produce up to 6 prediction results for each click
"""
width = image_ori.shape[0]
height = image_ori.shape[1]
data = {"image": image, "height": height, "width": width}
if point is None:
point = torch.tensor([[0.5, 0.5, 0.006, 0.006]]).cuda()
else:
point = torch.tensor(point).cuda()
point_ = point
point = point_.clone()
point[0, 0] = point_[0, 0]
point[0, 1] = point_[0, 1]
# point = point[:, [1, 0]]
point = torch.cat([point, point.new_tensor([[0.005, 0.005]])], dim=-1)
self.point = point[:, :2].clone()*(torch.tensor([width, height]).to(point))
data['targets'] = [dict()]
data['targets'][0]['points'] = point
data['targets'][0]['pb'] = point.new_tensor([0.])
batch_inputs = [data]
masks, ious = self.model.model.evaluate_demo(batch_inputs)
return masks, ious
def process_multi_mask(self, masks, ious, image_ori):
pred_masks_poses = masks
reses = []
ious = ious[0, 0]
ids = torch.argsort(ious, descending=True)
text_res = ''
mask_ls = []
ious_res = []
areas = []
for i, (pred_masks_pos, iou) in enumerate(zip(pred_masks_poses[ids], ious[ids])):
iou = round(float(iou), 2)
texts = f'{iou}'
mask = (pred_masks_pos > 0.0).cpu().numpy()
area = mask.sum()
conti = False
if iou < self.thresh:
conti = True
for m in mask_ls:
if np.logical_and(mask, m).sum() / np.logical_or(mask, m).sum() > 0.95:
conti = True
break
if i == len(pred_masks_poses[ids]) - 1 and mask_ls == []:
conti = False
if conti:
continue
ious_res.append(iou)
mask_ls.append(mask)
areas.append(area)
mask, _ = self.remove_small_regions(mask, int(self.hole_scale), mode="holes")
mask, _ = self.remove_small_regions(mask, int(self.island_scale), mode="islands")
            mask = mask.astype(np.float64)  # np.float was removed in NumPy 1.24+
out_txt = texts
visual = Visualizer(image_ori, metadata=metadata)
color = [0., 0., 1.0]
demo = visual.draw_binary_mask(mask, color=color, text=texts)
res = demo.get_image()
point_x0 = max(0, int(self.point[0, 0]) - 3)
point_x1 = min(image_ori.shape[1], int(self.point[0, 0]) + 3)
point_y0 = max(0, int(self.point[0, 1]) - 3)
point_y1 = min(image_ori.shape[0], int(self.point[0, 1]) + 3)
res[point_y0:point_y1, point_x0:point_x1, 0] = 255
res[point_y0:point_y1, point_x0:point_x1, 1] = 0
res[point_y0:point_y1, point_x0:point_x1, 2] = 0
reses.append(Image.fromarray(res))
text_res = text_res + ';' + out_txt
ids = list(torch.argsort(torch.tensor(areas), descending=False))
ids = [int(i) for i in ids]
torch.cuda.empty_cache()
return reses, [reses[i] for i in ids]
def predict_masks(self, image_ori, image, point=None):
masks, ious = self.predict(image_ori, image, point)
return self.process_multi_mask(masks, ious, image_ori)
@staticmethod
def remove_small_regions(
mask: np.ndarray, area_thresh: float, mode: str
) -> Tuple[np.ndarray, bool]:
"""
        Removes small disconnected regions and holes in a mask. Returns the
        cleaned mask and a flag indicating whether the mask was modified.
"""
import cv2 # type: ignore
assert mode in ["holes", "islands"]
correct_holes = mode == "holes"
working_mask = (correct_holes ^ mask).astype(np.uint8)
n_labels, regions, stats, _ = cv2.connectedComponentsWithStats(working_mask, 8)
sizes = stats[:, -1][1:] # Row 0 is background label
small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh]
if len(small_regions) == 0:
return mask, False
fill_labels = [0] + small_regions
if not correct_holes:
fill_labels = [i for i in range(n_labels) if i not in fill_labels]
# If every region is below threshold, keep largest
if len(fill_labels) == 0:
fill_labels = [int(np.argmax(sizes)) + 1]
mask = np.isin(regions, fill_labels)
return mask, True
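
# Usage sketch, assuming a loader for the wrapped model: `build_semantic_sam` is a
# hypothetical name for whatever constructs the model passed to SemanticSAMPredictor;
# `image_ori` is the original HxWx3 uint8 array, `image` the matching CHW CUDA tensor,
# and the click prompt is a normalized (x, y, w, h) point as expected by predict().
#
#   model = build_semantic_sam(checkpoint="semantic_sam.pth")   # hypothetical helper
#   predictor = SemanticSAMPredictor(model, thresh=0.5, text_size=640)
#   image_ori = np.asarray(Image.open("example.jpg").convert("RGB"))
#   image = torch.from_numpy(image_ori.copy()).permute(2, 0, 1).cuda()
#   overlays, overlays_by_area = predictor.predict_masks(
#       image_ori, image, point=[[0.4, 0.6, 0.006, 0.006]]
#   )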

File diff suppressed because it is too large