Merge branch 'main' of github.com:xlang-ai/OSWorld
9  .gitignore  (vendored)
@@ -197,4 +197,11 @@ vmware_vm_data
 .vscode
 dataimpulse_proxy_config.json

+## reference and draft and debug
+reference/
+draft/
+manual_examine.py
+run_human_examine.sh
+quick_start.py
@@ -813,7 +813,7 @@ class SetupController:

    def _update_browse_history_setup(self, **config):
        cache_path = os.path.join(self.cache_dir, "history_new.sqlite")
-       db_url = "https://drive.usercontent.google.com/u/0/uc?id=1Lv74QkJYDWVX0RIgg0Co-DUcoYpVL0oX&export=download"  # google drive
+       db_url = "https://huggingface.co/datasets/xlangai/ubuntu_osworld_file_cache/resolve/main/chrome/44ee5668-ecd5-4366-a6ce-c1c9b8d4e938/history_empty.sqlite?download=true"
        if not os.path.exists(cache_path):
            max_retries = 3
            downloaded = False

@@ -839,80 +839,82 @@ class SetupController:
        else:
            logger.info("File already exists in cache directory")

        # copy a new history file in the tmp folder
-       db_path = cache_path
+       with tempfile.TemporaryDirectory() as tmp_dir:
+           db_path = os.path.join(tmp_dir, "history_empty.sqlite")
+           shutil.copy(cache_path, db_path)

            history = config['history']

            for history_item in history:
                url = history_item['url']
                title = history_item['title']
                visit_time = datetime.now() - timedelta(seconds=history_item['visit_time_from_now_in_seconds'])

                # Chrome uses microseconds since 1601-01-01 as its timestamp
                epoch_start = datetime(1601, 1, 1)
                chrome_timestamp = int((visit_time - epoch_start).total_seconds() * 1000000)

                conn = sqlite3.connect(db_path)
                cursor = conn.cursor()

                cursor.execute('''
                    INSERT INTO urls (url, title, visit_count, typed_count, last_visit_time, hidden)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (url, title, 1, 0, chrome_timestamp, 0))

                url_id = cursor.lastrowid

                cursor.execute('''
                    INSERT INTO visits (url, visit_time, from_visit, transition, segment_id, visit_duration)
                    VALUES (?, ?, ?, ?, ?, ?)
                ''', (url_id, chrome_timestamp, 0, 805306368, 0, 0))

                conn.commit()
                conn.close()

            logger.info('Fake browsing history added successfully.')

            controller = PythonController(self.vm_ip, self.server_port)

            # get the path of the history file according to the platform
            os_type = controller.get_vm_platform()

            if os_type == 'Windows':
                chrome_history_path = controller.execute_python_command(
                    """import os; print(os.path.join(os.getenv('USERPROFILE'), "AppData", "Local", "Google", "Chrome", "User Data", "Default", "History"))""")[
                    'output'].strip()
            elif os_type == 'Darwin':
                chrome_history_path = controller.execute_python_command(
                    """import os; print(os.path.join(os.getenv('HOME'), "Library", "Application Support", "Google", "Chrome", "Default", "History"))""")[
                    'output'].strip()
            elif os_type == 'Linux':
                if "arm" in platform.machine():
                    chrome_history_path = controller.execute_python_command(
                        "import os; print(os.path.join(os.getenv('HOME'), 'snap', 'chromium', 'common', 'chromium', 'Default', 'History'))")[
                        'output'].strip()
                else:
                    chrome_history_path = controller.execute_python_command(
                        "import os; print(os.path.join(os.getenv('HOME'), '.config', 'google-chrome', 'Default', 'History'))")[
                        'output'].strip()
            else:
                raise Exception('Unsupported operating system')

            form = MultipartEncoder({
                "file_path": chrome_history_path,
                "file_data": (os.path.basename(chrome_history_path), open(db_path, "rb"))
            })
            headers = {"Content-Type": form.content_type}
            logger.debug(form.content_type)

            # send request to server to upload file
            try:
                logger.debug("REQUEST ADDRESS: %s", self.http_server + "/setup" + "/upload")
                response = requests.post(self.http_server + "/setup" + "/upload", headers=headers, data=form)
                if response.status_code == 200:
                    logger.info("Command executed successfully: %s", response.text)
                else:
                    logger.error("Failed to upload file. Status code: %s", response.text)
            except requests.exceptions.RequestException as e:
                logger.error("An error occurred while trying to send the request: %s", e)

        self._execute_setup(["sudo chown -R user:user /home/user/.config/google-chrome/Default/History"], shell=True)
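Note on the timestamps written above: Chrome's History database stores times as microseconds elapsed since 1601-01-01 UTC, not as Unix time. A minimal sketch of the conversion used in _update_browse_history_setup, plus the inverse for reading a History file back (the inverse is not part of this commit):

from datetime import datetime, timedelta

CHROME_EPOCH = datetime(1601, 1, 1)

def to_chrome_timestamp(dt: datetime) -> int:
    # microseconds since 1601-01-01, as stored in urls.last_visit_time and visits.visit_time
    return int((dt - CHROME_EPOCH).total_seconds() * 1000000)

def from_chrome_timestamp(ts: int) -> datetime:
    # inverse conversion, useful when inspecting an existing History database
    return CHROME_EPOCH + timedelta(microseconds=ts)

# a visit from one hour ago
print(to_chrome_timestamp(datetime.now() - timedelta(hours=1)))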
@@ -29,13 +29,11 @@ UBUNTU_X86_URL = "https://huggingface.co/datasets/xlangai/ubuntu_osworld/resolve
WINDOWS_X86_URL = "https://huggingface.co/datasets/xlangai/windows_osworld/resolve/main/Windows-x86.zip"

# Determine the platform and CPU architecture to decide the correct VM image to download
-if platform.system() == 'Darwin':  # macOS
-    # if os.uname().machine == 'arm64':  # Apple Silicon
-    URL = UBUNTU_ARM_URL
-    # else:
-    #     url = UBUNTU_X86_URL
-elif platform.machine().lower() in ['amd64', 'x86_64']:
+# sometimes the system is 'Darwin' but the machine is x86-based.
+if platform.machine().lower() in ['amd64', 'x86_64']:
    URL = UBUNTU_X86_URL
+elif platform.system() == 'Darwin':  # macOS
+    URL = UBUNTU_ARM_URL
else:
    raise Exception("Unsupported platform or architecture")

@@ -125,12 +123,12 @@ def _install_vm(vm_name, vms_dir, downloaded_file_name, os_type, original_vm_nam
    # Download the virtual machine image
    logger.info("Downloading the virtual machine image...")
    downloaded_size = 0

+   # sometimes the system is 'Darwin' but the machine is x86-based.
    if os_type == "Ubuntu":
-       if platform.system() == 'Darwin':
-           URL = UBUNTU_ARM_URL
-       elif platform.machine().lower() in ['amd64', 'x86_64']:
+       if platform.machine().lower() in ['amd64', 'x86_64']:
            URL = UBUNTU_X86_URL
+       elif platform.system() == 'Darwin':
+           URL = UBUNTU_ARM_URL
    elif os_type == "Windows":
        if platform.machine().lower() in ['amd64', 'x86_64']:
            URL = WINDOWS_X86_URL
@@ -253,14 +253,136 @@ def run_single_example_autoglm(agent, env, example, max_steps, instruction, args
                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
                }))
                f.write("\n")

            if done:
                logger.info("The episode is done.")
                break
            # Invalid Action
            if not actions:
                obs = env._get_obs()  # update observation

        step_idx += 1

    if not done:  # not completed the task yet
        env.action_history.append('FAIL')

    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))


def run_single_example_mano(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    runtime_logger = setup_logger(example, example_result_dir)
    agent.reset(runtime_logger)
    env.reset(task_config=example)
    time.sleep(60)  # Wait for the environment to be ready
    obs = env._get_obs()  # Get the initial observation
    done = False
    step_idx = 0
    env.controller.start_recording()

    with open(os.path.join(example_result_dir, f"step_0.png"),
              "wb") as _f:
        _f.write(obs['screenshot'])
    while not done and step_idx < max_steps:
        response, actions = agent.predict(
            instruction,
            obs
        )
        if len(actions) > 1:
            if (("pyautogui.hotkey('shift')" in actions[0] or "pyautogui.hotkey('ctrl')" in actions[0])
                    and "pyautogui.click" in actions[1]):
                hotkey_type = 'shift' if "shift" in actions[0] else 'ctrl'
                action = f"pyautogui.keyDown('{hotkey_type}')\n{actions[1]}\npyautogui.keyUp('{hotkey_type}')"
                actions = [action]

        for action in actions:
            # Capture the timestamp before executing the action
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)
            obs, reward, done, info = env.step(action, args.sleep_after_execution)

            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            # Save screenshot and trajectory information
            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
                      "wb") as _f:
                _f.write(obs['screenshot'])
            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                f.write(json.dumps({
                    "step_num": step_idx + 1,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png",
                    "response": response
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break
        step_idx += 1
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
    with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
        f.write(f"{result}\n")
    env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))


def run_single_example_uipath(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
    runtime_logger = setup_logger(example, example_result_dir)
    try:
        agent.reset(runtime_logger)
    except Exception as e:
        agent.reset()

    env.reset(task_config=example)

    time.sleep(60)  # Wait for the environment to be ready
    obs = env._get_obs()  # Get the initial observation
    done = False
    step_idx = 0
    env.controller.start_recording()
    while not done and step_idx < max_steps:
        response, actions = agent.predict(
            instruction,
            obs,
            args,
            step_idx
        )
        for action in actions:
            # Capture the timestamp before executing the action
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)
            obs, reward, done, info = env.step(action, args.sleep_after_execution)

            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
            # Save screenshot and trajectory information
            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
                      "wb") as _f:
                _f.write(obs['screenshot'])
            with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                f.write(json.dumps({
                    "step_num": step_idx + 1,
                    "action_timestamp": action_timestamp,
                    "action": action,
                    "response": response,
                    "reward": reward,
                    "done": done,
                    "info": info,
                    "screenshot_file": f"step_{step_idx + 1}_{action_timestamp}.png"
                }))
                f.write("\n")
            if done:
                logger.info("The episode is done.")
                break
        step_idx += 1
    result = env.evaluate()
    logger.info("Result: %.2f", result)
    scores.append(result)
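The extra branch in run_single_example_mano exists because pyautogui.hotkey('shift') taps and releases the modifier before the next action runs, so a following click would not be shift-clicked; the two actions are therefore merged into one keyDown/click/keyUp string. A standalone sketch of that rewrite (the helper name is illustrative, not from the commit):

def merge_modifier_click(actions):
    # Collapse ["pyautogui.hotkey('shift')", "pyautogui.click(...)"] into a single
    # action that holds the modifier down while the click happens.
    if len(actions) > 1 and (
        "pyautogui.hotkey('shift')" in actions[0] or "pyautogui.hotkey('ctrl')" in actions[0]
    ) and "pyautogui.click" in actions[1]:
        modifier = 'shift' if "shift" in actions[0] else 'ctrl'
        return [f"pyautogui.keyDown('{modifier}')\n{actions[1]}\npyautogui.keyUp('{modifier}')"]
    return actions

print(merge_modifier_click(["pyautogui.hotkey('shift')", "pyautogui.click(100, 200)"]))
# ["pyautogui.keyDown('shift')\npyautogui.click(100, 200)\npyautogui.keyUp('shift')"]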
7  mm_agents/autoglm_v/__init__.py  (Normal file)
@@ -0,0 +1,7 @@
"""
AutoGLM agent implementation
"""

from .main import AutoGLMAgent

__all__ = ["AutoGLMAgent"]
265  mm_agents/autoglm_v/main.py  (Normal file)
@@ -0,0 +1,265 @@
import logging
import re
from base64 import b64encode
from PIL import Image
from io import BytesIO
from typing import Dict, List

from .prompt.accessibility_tree_handle import linearize_accessibility_tree, trim_accessibility_tree
from .prompt.grounding_agent import GroundingAgent as Agent
from .tools.package.google_chrome import BrowserTools
from .prompt.procedural_memory import Prompt

logger = logging.getLogger("desktopenv.agent")

pure_text_settings = ["a11y_tree"]


def resize_image(image, w, h):
    img = Image.open(BytesIO(image))
    # resize to max_pixel_num max_pixels
    img = img.resize((w, h))
    buf = BytesIO()
    img.save(buf, format='PNG')  # specify the save format, e.g. PNG or JPEG
    img_bytes = buf.getvalue()  # get the raw bytes
    return img_bytes


def parse_code_from_string(input_string):
    # input_string = "\n".join([line.strip() for line in input_string.split(';') if line.strip()])
    if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
        return [input_string.strip()]

    # This regular expression will match both ```code``` and ```python code```
    # and capture the `code` part. It uses a non-greedy match for the content inside.
    pattern = r"```(?:\w+\s+)?(.*?)```"
    # Find all non-overlapping matches in the string
    matches = re.findall(pattern, input_string, re.DOTALL)

    # The regex above captures the content inside the triple backticks.
    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
    # so the code inside backticks can span multiple lines.

    # matches now contains all the captured code snippets

    codes = []

    for match in matches:
        match = match.strip()
        commands = ["WAIT", "DONE", "FAIL"]  # fixme: update this part when we have more commands

        if match in commands:
            codes.append(match.strip())
        elif match.split("\n")[-1] in commands:
            if len(match.split("\n")) > 1:
                codes.append("\n".join(match.split("\n")[:-1]))
            codes.append(match.split("\n")[-1])
        else:
            codes.append(match)

    return codes


class AutoGLMAgent:
    def __init__(
        self,
        action_space="autoglm_computer_use",
        observation_type="a11y_tree",
        max_trajectory_length=3,
        a11y_tree_max_items=300,
        with_image: bool = True,
        screen_size=(1920, 1080),
        image_size=(1920, 1080),
        with_atree: bool = False,
        glm41v_format: bool = True,
        relative_coordinate: bool = True,
        client_password="password",
        gen_func=None,
        tool_in_sys_msg: bool = True,
    ):
        self.action_space = action_space
        self.observation_type = observation_type
        assert action_space in ["autoglm_computer_use"], "Invalid action space"
        assert observation_type in ["a11y_tree"], "Invalid observation type"
        self.max_trajectory_length = max_trajectory_length
        self.a11y_tree_max_items = a11y_tree_max_items
        self.with_image = with_image
        self.screen_size = screen_size
        self.image_size = image_size
        self.with_atree = with_atree
        self.glm41v_format = glm41v_format
        self.relative_coordinate = relative_coordinate
        self.client_password = client_password
        self.gen_func = gen_func
        self.tool_in_sys_msg = tool_in_sys_msg

        self.tool_list = {
            "libreoffice_calc": "CalcTools",
            "libreoffice_impress": "ImpressTools",
            "libreoffice_writer": "WriterTools",
            "code": "CodeTools",
            "vlc": "VLCTools",
            "google_chrome": "BrowserTools",
        }

        Agent.relative_coordinate = relative_coordinate

        self.contents = []

    @property
    def turn_number(self):
        return len(self.contents)

    def prepare(self, instruction: str, obs: Dict, history: List, last_result: str = "") -> List:
        """
        Predict the next action(s) based on the current observation.
        """
        if "exe_result" in obs and not last_result:
            last_result = obs["exe_result"]
        if self.contents:
            self.contents[-1]["exe_result"] = last_result

        cur_app = obs["cur_app"]
        logger.info(f"current app is {cur_app}")

        if cur_app:
            tool_name = cur_app.strip().lower().replace("-", "_")
            tool_name = tool_name if tool_name in self.tool_list.keys() else None
        else:
            tool_name = None

        setup_prompt, func_def_prompt, note_prompt = Prompt.construct_procedural_memory(
            Agent, app_name=tool_name, client_password=self.client_password, with_image=self.with_image, with_atree=self.with_atree, relative_coordinate=self.relative_coordinate, glm41v_format=self.glm41v_format
        )
        if self.tool_in_sys_msg:
            system_message = setup_prompt + "\n\n" + func_def_prompt + "\n\n" + note_prompt
        else:
            system_message = setup_prompt + "\n\n" + note_prompt
        system_message += "\n\n**IMPORTANT** You are asked to complete the following task: {}".format(instruction)

        messages = [
            {
                "role": "system",
                "content": system_message,
            }
        ]
        messages.extend(history)

        if obs["apps"]:
            app_str = "Window ID App Name Title\n"
            for window_id, app in obs["apps"].items():
                app_str += f"{window_id} {app['app_name']} {app['title']}\n"
        else:
            app_str = "None"

        last_result = last_result.strip() if last_result else "None"
        last_result = last_result[:2000] + "..." if len(last_result) > 2000 else last_result

        tree = linearize_accessibility_tree(obs["accessibility_tree"], "Ubuntu")
        tree = trim_accessibility_tree(tree, 300)

        app_info = obs["app_info"].strip() if obs["app_info"] else "None"
        app_info = app_info[:5000] + "..." if len(app_info) > 5000 else app_info

        prompt = "* Apps: {}\n\n* Current App: {}{}\n\n* App Info: {}\n\n* Previous Action Result: {}".format(
            app_str.strip(),
            obs["cur_window_id"].strip() if obs["cur_window_id"] in app_str else "None",
            '\n\n* A11y Tree: {}'.format(tree.strip()) if self.with_atree else "",
            app_info,
            last_result if last_result else "None",
        ) + (
            "\n\n" + func_def_prompt if not self.tool_in_sys_msg else ""
        )

        content = [{"type": "text", "text": prompt}]
        if self.with_image and obs.get('screenshot'):
            screenshot = resize_image(obs['screenshot'], self.image_size[0], self.image_size[1])
            content = [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{b64encode(screenshot).decode('utf-8')}",
                        "detail": "high",
                    },
                }
            ] + content

        messages.append({"role": "user", "content": content})

        return messages

    def execute(self, response, obs):
        try:
            actions = parse_code_from_string(response)
            action = actions[0]
            logger.info(f"The pseudo action is {action}")

            if "Agent." in action:
                actions = [
                    eval(action),
                ]
            elif "BrowserTools." in action:  # TODO: special check for BrowserTools
                actions = [
                    eval(action),
                ]
            else:
                actions = Agent.tool_commands(action, obs["cur_app"].strip().replace("-", "_").lower())
            logger.info(f"The grounded action is {actions[0]}")
        except Exception as e:
            print("Failed to parse action from response", e)
            actions = []

        return actions

    def format_history(self, max_turns=30):
        history = []
        for ix in range(self.turn_number):
            if ix == 0:
                env_input = "**Environment State (Omitted)**"
            else:
                env_input = (
                    f"**Environment State (Omitted)**\nPrevious Action Result: {self.contents[ix - 1]['exe_result']}"
                )

            env_input = env_input[:2000] + "..." if len(env_input) > 2000 else env_input
            response = (
                self.contents[ix]["response"][:1500] + "..."
                if len(self.contents[ix]["response"]) > 1500
                else self.contents[ix]["response"]
            )
            history.append({"role": "user", "content": [{"type": "text", "text": env_input}]})
            history.append({"role": "assistant", "content": [{"type": "text", "text": response}]})

        return history[-max_turns * 2:]

    def predict(self, instruction: str, obs: Dict) -> List:
        history = self.format_history()
        messages = self.prepare(instruction, obs, history)

        assert self.gen_func is not None, "gen_func is not set"
        try:
            response = self.gen_func(messages)
        except Exception as e:
            logger.error("Failed to call gen_func, Error: " + str(e))
            response = ""

        logger.info("RESPONSE: %s", response)

        actions = self.execute(response, obs)

        # update the contents
        self.contents.append(
            {
                "instruction": instruction,
                "index": len(self.contents),
                "response": response,
                "action": "Parse error" if not actions else actions[0],
                "exe_result": "Invalid action" if not actions else "",
                **obs,
            }
        )
        return response, actions

    def reset(self, _logger=None):
        global logger
        logger = _logger if _logger is not None else logging.getLogger("desktopenv.aguvis_agent")

        self.contents = []
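For reference, parse_code_from_string is the glue between the raw model response and execute(): bare WAIT/DONE/FAIL tokens pass through unchanged, and code is pulled out of triple-backtick blocks, with a trailing control token on the last line split into its own action. A minimal usage sketch with a made-up response string:

response = (
    "I will open Chrome first.\n"
    "```python\n"
    "Agent.open_app(app_name='chrome')\n"
    "```"
)
print(parse_code_from_string(response))
# ["Agent.open_app(app_name='chrome')"]

print(parse_code_from_string("DONE"))
# ['DONE']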
329  mm_agents/autoglm_v/prompt/accessibility_tree_handle.py  (Normal file)
@@ -0,0 +1,329 @@
import io
import re
import xml.etree.ElementTree as ET
from typing import List, Tuple

from PIL import Image, ImageDraw, ImageFont

from .deduplicate_node import filter_similar_nodes

attributes_ns_ubuntu = "https://accessibility.windows.example.org/ns/attributes"
attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
state_ns_windows = "https://accessibility.windows.example.org/ns/state"
component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
component_ns_windows = "https://accessibility.windows.example.org/ns/component"
value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
value_ns_windows = "https://accessibility.windows.example.org/ns/value"
class_ns_windows = "https://accessibility.windows.example.org/ns/class"


def find_leaf_nodes(xlm_file_str):
    if not xlm_file_str:
        return []

    root = ET.fromstring(xlm_file_str)

    # Recursive function to traverse the XML tree and collect leaf nodes
    def collect_leaf_nodes(node, leaf_nodes):
        # If the node has no children, it is a leaf node, add it to the list
        if not list(node):
            leaf_nodes.append(node)
        # If the node has children, recurse on each child
        for child in node:
            collect_leaf_nodes(child, leaf_nodes)

    # List to hold all leaf nodes
    leaf_nodes = []
    collect_leaf_nodes(root, leaf_nodes)
    return leaf_nodes


def judge_node(node: ET, platform="Ubuntu", check_image=False) -> bool:
    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    keeps: bool = (
        node.tag.startswith("document")
        or node.tag.endswith("item")
        or node.tag.endswith("button")
        or node.tag.endswith("heading")
        or node.tag.endswith("label")
        or node.tag.endswith("scrollbar")
        or node.tag.endswith("searchbox")
        or node.tag.endswith("textbox")
        or node.tag.endswith("link")
        or node.tag.endswith("tabelement")
        or node.tag.endswith("textfield")
        or node.tag.endswith("textarea")
        or node.tag.endswith("menu")
        or node.tag
        in {
            "alert",
            "canvas",
            "check-box",
            "combo-box",
            "entry",
            "icon",
            "image",
            "paragraph",
            "scroll-bar",
            "section",
            "slider",
            "static",
            "table-cell",
            "terminal",
            "text",
            "netuiribbontab",
            "start",
            "trayclockwclass",
            "traydummysearchcontrol",
            "uiimage",
            "uiproperty",
            "uiribboncommandbar",
        }
    )
    keeps = (
        keeps
        and (
            platform == "Ubuntu"
            and node.get("{{{:}}}showing".format(_state_ns), "false") == "true"
            and node.get("{{{:}}}visible".format(_state_ns), "false") == "true"
            or platform == "Windows"
            and node.get("{{{:}}}visible".format(_state_ns), "false") == "true"
        )
        and (
            node.get("name", "") != ""
            or node.text is not None
            and len(node.text) > 0
            or check_image
            and node.get("image", "false") == "true"
        )
    )
    # and (
    #     node.get("{{{:}}}enabled".format(_state_ns), "false") == "true"
    #     or node.get("{{{:}}}editable".format(_state_ns), "false") == "true"
    #     or node.get("{{{:}}}expandable".format(_state_ns), "false") == "true"
    #     or node.get("{{{:}}}checkable".format(_state_ns), "false") == "true"
    # ) \

    coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(_component_ns), "(-1, -1)"))
    sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(_component_ns), "(-1, -1)"))
    keeps = keeps and coordinates[0] >= 0 and coordinates[1] >= 0 and sizes[0] > 0 and sizes[1] > 0
    return keeps


def filter_nodes(root: ET, platform="Ubuntu", check_image=False):
    filtered_nodes = []

    for node in root.iter():
        if judge_node(node, platform, check_image):
            filtered_nodes.append(node)

    return filtered_nodes


def draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0, platform="Ubuntu"):
    if platform == "Ubuntu":
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
        _value_ns = value_ns_ubuntu
    elif platform == "Windows":
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
        _value_ns = value_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    # Load the screenshot image
    image_stream = io.BytesIO(image_file_content)
    image = Image.open(image_stream)
    if float(down_sampling_ratio) != 1.0:
        image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio)))
    draw = ImageDraw.Draw(image)
    marks = []
    drew_nodes = []
    text_informations: List[str] = ["index\ttag\tname\ttext"]

    try:
        # Adjust the path to the font file you have or use a default one
        font = ImageFont.truetype("arial.ttf", 15)
    except IOError:
        # Fallback to a basic font if the specified font can't be loaded
        font = ImageFont.load_default()

    index = 1

    # Loop over all the visible nodes and draw their bounding boxes
    for _node in nodes:
        coords_str = _node.attrib.get("{{{:}}}screencoord".format(_component_ns))
        size_str = _node.attrib.get("{{{:}}}size".format(_component_ns))

        if coords_str and size_str:
            try:
                # Parse the coordinates and size from the strings
                coords = tuple(map(int, coords_str.strip("()").split(", ")))
                size = tuple(map(int, size_str.strip("()").split(", ")))

                import copy

                original_coords = copy.deepcopy(coords)
                original_size = copy.deepcopy(size)

                if float(down_sampling_ratio) != 1.0:
                    # Downsample the coordinates and size
                    coords = tuple(int(coord * down_sampling_ratio) for coord in coords)
                    size = tuple(int(s * down_sampling_ratio) for s in size)

                # Check for negative sizes
                if size[0] <= 0 or size[1] <= 0:
                    raise ValueError(f"Size must be positive, got: {size}")

                # Calculate the bottom-right corner of the bounding box
                bottom_right = (coords[0] + size[0], coords[1] + size[1])

                # Check that bottom_right > coords (x1 >= x0, y1 >= y0)
                if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]:
                    raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}")

                # Check if the area only contains one color
                cropped_image = image.crop((*coords, *bottom_right))
                if len(set(list(cropped_image.getdata()))) == 1:
                    continue

                # Draw rectangle on image
                draw.rectangle([coords, bottom_right], outline="red", width=1)

                # Draw index number at the bottom left of the bounding box with black background
                text_position = (coords[0], bottom_right[1])  # Adjust Y to be above the bottom right
                text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
                # offset: int = bottom_right[1]-text_bbox[3]
                # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)

                # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                draw.rectangle(text_bbox, fill="black")
                draw.text(text_position, str(index), font=font, anchor="lb", fill="white")

                # each mark is an x, y, w, h tuple
                marks.append([original_coords[0], original_coords[1], original_size[0], original_size[1]])
                drew_nodes.append(_node)

                if _node.text:
                    node_text = _node.text if '"' not in _node.text else '"{:}"'.format(_node.text.replace('"', '""'))
                elif _node.get("{{{:}}}class".format(class_ns_windows), "").endswith("EditWrapper") and _node.get(
                    "{{{:}}}value".format(_value_ns)
                ):
                    node_text = _node.get("{{{:}}}value".format(_value_ns), "")
                    node_text = node_text if '"' not in node_text else '"{:}"'.format(node_text.replace('"', '""'))
                else:
                    node_text = '""'
                text_information: str = "{:d}\t{:}\t{:}\t{:}".format(index, _node.tag, _node.get("name", ""), node_text)
                text_informations.append(text_information)

                index += 1

            except ValueError:
                pass

    output_image_stream = io.BytesIO()
    image.save(output_image_stream, format="PNG")
    image_content = output_image_stream.getvalue()

    return marks, drew_nodes, "\n".join(text_informations), image_content


def print_nodes_with_indent(nodes, indent=0):
    for node in nodes:
        print(" " * indent, node.tag, node.attrib)
        print_nodes_with_indent(node, indent + 2)


def find_active_applications(tree, state_ns):
    apps_with_active_tag = []
    for application in list(tree.getroot()):
        app_name = application.attrib.get("name")
        for frame in application:
            is_active = frame.attrib.get("{{{:}}}active".format(state_ns), "false")
            if is_active == "true":
                apps_with_active_tag.append(app_name)
    if apps_with_active_tag:
        to_keep = apps_with_active_tag + ["gnome-shell"]
    else:
        to_keep = ["gjs", "gnome-shell"]
    return to_keep


def linearize_accessibility_tree(accessibility_tree, platform="Ubuntu"):
    if platform == "Ubuntu":
        _attributes_ns = attributes_ns_ubuntu
        _state_ns = state_ns_ubuntu
        _component_ns = component_ns_ubuntu
        _value_ns = value_ns_ubuntu
    elif platform == "Windows":
        _attributes_ns = attributes_ns_windows
        _state_ns = state_ns_windows
        _component_ns = component_ns_windows
        _value_ns = value_ns_windows
    else:
        raise ValueError("Invalid platform, must be 'Ubuntu' or 'Windows'")

    try:
        tree = ET.ElementTree(ET.fromstring(accessibility_tree))
        keep_apps = find_active_applications(tree, _state_ns)

        # Remove inactive applications
        for application in list(tree.getroot()):
            if application.get("name") not in keep_apps:
                tree.getroot().remove(application)

        filtered_nodes = filter_nodes(tree.getroot(), platform, check_image=True)
        linearized_accessibility_tree = ["tag\ttext\tposition (center x & y)\tsize (w & h)"]

        # Linearize the accessibility tree nodes into a table format
        for node in filtered_nodes:
            try:
                text = node.text if node.text is not None else ""
                text = text.strip()
                name = node.get("name", "").strip()
                if text == "":
                    text = name
                elif name != "" and text != name:
                    text = f"{name} ({text})"

                text = text.replace("\n", "\\n")
                pos = node.get("{{{:}}}screencoord".format(_component_ns), "")
                size = node.get("{{{:}}}size".format(_component_ns), "")

                x, y = re.match(f"\((\d+), (\d+)\)", pos).groups()
                w, h = re.match(f"\((\d+), (\d+)\)", size).groups()
                x_mid, y_mid = int(x) + int(w) // 2, int(y) + int(h) // 2

                linearized_accessibility_tree.append(
                    "{:}\t{:}\t{:}\t{:}".format(node.tag, text, f"({x_mid}, {y_mid})", size)
                )
            except Exception as e:
                continue

        # Filter out similar nodes
        linearized_accessibility_tree = filter_similar_nodes("\n".join(linearized_accessibility_tree))
    except Exception as e:
        print(f"Error in linearize_accessibility_tree: {e}")
        linearized_accessibility_tree = ""

    return linearized_accessibility_tree


def trim_accessibility_tree(linearized_accessibility_tree, max_items):
    lines = linearized_accessibility_tree.strip().split("\n")
    if len(lines) > max_items:
        lines = lines[:max_items]
        linearized_accessibility_tree = "\n".join(lines)
        linearized_accessibility_tree += "\n..."
    return linearized_accessibility_tree
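The two entry points above are meant to be chained: linearize_accessibility_tree keeps only the active application's visible nodes and flattens them into a tab-separated table (tag, text, center position, size), and trim_accessibility_tree caps the number of rows. A minimal sketch, assuming raw_xml holds the AT-SPI accessibility-tree XML string taken from the environment observation:

# raw_xml: accessibility tree XML string from the env observation (assumed available)
table = linearize_accessibility_tree(raw_xml, platform="Ubuntu")
table = trim_accessibility_tree(table, max_items=300)
for row in table.split("\n")[:5]:
    tag, text, position, size = row.split("\t")
    print(tag, text, position, size)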
100  mm_agents/autoglm_v/prompt/deduplicate_node.py  (Normal file)
@@ -0,0 +1,100 @@
import re


def parse_line(line):
    # Parse a line of the form, e.g.: label Google Chrome (191, 13) (104, 17)
    pattern = r"^(\S+)\s+(.+?)\s+\((\d+), (\d+)\)\s+\((\d+), (\d+)\)"
    m = re.match(pattern, line)
    if not m:
        return None
    node_type, text, cx, cy, w, h = m.groups()
    cx, cy, w, h = map(int, (cx, cy, w, h))
    # bounding box as (x1, y1, x2, y2)
    x1 = cx - w // 2
    y1 = cy - h // 2
    x2 = x1 + w
    y2 = y1 + h
    return {
        "type": node_type,
        "text": text.strip(),
        "bbox": (x1, y1, x2, y2),
        "center": (cx, cy),
        "size": (w, h),
        "raw": line,
    }


def iou(box1, box2):
    # box: (x1, y1, x2, y2)
    xi1 = max(box1[0], box2[0])
    yi1 = max(box1[1], box2[1])
    xi2 = min(box1[2], box2[2])
    yi2 = min(box1[3], box2[3])
    inter_width = max(0, xi2 - xi1)
    inter_height = max(0, yi2 - yi1)
    inter_area = inter_width * inter_height
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])
    union = area1 + area2 - inter_area
    if union == 0:
        return 0
    return inter_area / union


def norm_text(s):
    # Normalize text: lowercase, strip whitespace, etc.
    return re.sub(r"\s+", "", s.lower())


def text_similarity(a, b):
    # Simple check: 1.0 if the normalized texts are identical, otherwise 0
    na, nb = norm_text(a), norm_text(b)
    if na == nb:
        return 1.0
    else:
        return 0


def filter_similar_nodes(linearized_accessibility_tree):
    lines = [ln for ln in linearized_accessibility_tree.split("\n") if ln.strip()]
    # parse all nodes
    nodes = []
    for ln in lines:
        node = parse_line(ln)
        if node:
            nodes.append(node)
        else:
            # Keep lines that cannot be parsed
            nodes.append({"raw": ln, "invalid": True})
    filtered = []
    removed = [False] * len(nodes)
    # Thresholds can be tuned as needed
    IOU_THRESH = 0.2
    TEXT_THRESH = 0.9
    for i, ni in enumerate(nodes):
        if ni.get("invalid"):
            filtered.append(ni["raw"])
            continue
        if removed[i]:
            continue
        for j in range(i + 1, len(nodes)):
            nj = nodes[j]
            if nj.get("invalid"):
                continue
            iou_val = iou(ni["bbox"], nj["bbox"])
            text_sim = text_similarity(ni["text"], nj["text"])
            if iou_val > IOU_THRESH and text_sim > TEXT_THRESH:
                # The two nodes are highly similar; remove the latter
                removed[j] = True
                # print(f"Removed: {nj['raw']} (highly similar to {ni['raw']})")
        # Keep nodes that were not marked as removed
        if not removed[i]:
            filtered.append(ni["raw"])
    return "\n".join(filtered)


# Example usage
if __name__ == "__main__":
    linearized_accessibility_tree = "tag\ttext\tposition (center x & y)\tsize (w & h)\nicon\t\t(1853, 1001)\t(64, 64)\nlabel\tHome\t(1853, 1045)\t(40, 17)\nlabel\tActivities\t(49, 13)\t(63, 17)\ntext\tActivities\t(49, 13)\t(63, 17)\nlabel\tApr 17 17∶04\t(995, 13)\t(117, 27)\ntext\tApr 17 17∶04\t(995, 13)\t(87, 18)\nmenu\tSystem\t(1867, 13)\t(106, 27)\npush-button\tGoogle Chrome\t(35, 65)\t(70, 64)\npush-button\tThunderbird Mail\t(35, 133)\t(70, 64)\npush-button\tVisual Studio Code\t(35, 201)\t(70, 64)\npush-button\tVLC media player\t(35, 269)\t(70, 64)\npush-button\tLibreOffice Writer\t(35, 337)\t(70, 64)\npush-button\tLibreOffice Calc\t(35, 405)\t(70, 64)\npush-button\tLibreOffice Impress\t(35, 473)\t(70, 64)\npush-button\tGNU Image Manipulation Program\t(35, 541)\t(70, 64)\npush-button\tFiles\t(35, 609)\t(70, 64)\npush-button\tUbuntu Software\t(35, 677)\t(70, 64)\npush-button\tHelp\t(35, 745)\t(70, 64)\npush-button\tTrash\t(35, 816)\t(70, 64)\ntoggle-button\tShow Applications\t(35, 1045)\t(70, 70)"
    result = filter_similar_nodes(linearized_accessibility_tree)
    print(result)
260  mm_agents/autoglm_v/prompt/grounding_agent.py  (Normal file)
@@ -0,0 +1,260 @@
import base64
import json
import logging
import os
import xml.etree.ElementTree as ET
from typing import Dict, List, Optional, Tuple

logger = logging.getLogger("desktopenv.agent")


def agent_action(func):
    func.is_agent_action = True
    return func


switch_window_code = """import subprocess;
import pyautogui;
pyautogui.press('escape');
time.sleep(0.5);
subprocess.run(['wmctrl', '-ia', 'WINDOW_ID'])
subprocess.run(['wmctrl', '-ir', 'WINDOW_ID', '-b', 'add,maximized_vert,maximized_horz'])
print('Switch to WINDOW_ID')"""

launch_app_commands = {
    # Web Browser
    "chrome": "google-chrome --remote-debugging-port=1337",
    # File Manager
    "files": "nautilus",
    # Terminal
    "terminal": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-terminal',
    # Utilities
    "gedit": "gedit",
    # Office
    "libreoffice writer": "libreoffice --writer",
    "libreoffice calc": "libreoffice --calc",
    "libreoffice impress": "libreoffice --impress",
    # System
    "settings": 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus" && gnome-control-center',
    # Multimedia
    "vlc": "vlc",
    "gimp": "gimp",
    # IDE
    "vs code": "code",
    # Email
    "thunderbird": "thunderbird",
}


class GroundingAgent:

    tool_list = {
        "libreoffice_calc": "CalcTools",
        "libreoffice_impress": "ImpressTools",
        "libreoffice_writer": "WriterTools",
        "code": "CodeTools",
        "vlc": "VLCTools",
        "google_chrome": "BrowserTools",
    }

    relative_coordinate = True  # whether the coordinates are relative (0-1000) or absolute (e.g. 1920x1080)

    @classmethod
    def tool_commands(cls, code: str, tool_name: str):
        command = f"from {tool_name} import *; "
        command += code

        tool_class = cls.tool_list[tool_name]
        command += f"; {tool_class}.print_result()"

        return [
            command,
        ]

    @classmethod
    @agent_action
    def click(
        cls,
        coordinate: List,
        num_clicks: int = 1,
        button_type: str = "left",
    ):
        """
        Click on the element

        Args:
            coordinate (List): [x, y], coordinate of the element to click on
            num_clicks (int): number of times to click the element
            button_type (str): which mouse button to press ("left", "middle", or "right")
        """
        command = ""
        x, y = coordinate
        if cls.relative_coordinate:
            x, y = round(x * 1920 / 1000), round(y * 1080 / 1000)
        command += f"""pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); print("Click Success")"""  # TODO: maximizing the window needs a separate call
        return command

    @classmethod
    @agent_action
    def type(
        cls,
        coordinate: Optional[List] = None,
        text: str = "",
        overwrite: bool = False,
        enter: bool = False,
    ):
        """
        Type text into the element

        Args:
            coordinate (List): [x, y], coordinate of the element to type into. If None, typing starts at current cursor location
            text (str): the text to type
            overwrite (bool): True to overwrite existing text, False otherwise
            enter (bool): True to press enter after typing, False otherwise
        """
        command = ""

        if coordinate is not None:
            # Start typing at the center of the element
            x, y = coordinate
            if cls.relative_coordinate:
                x, y = round(x * 1920 / 1000), round(y * 1080 / 1000)
            command += f"pyautogui.click({x}, {y}); "

        if overwrite:
            command += f"pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); "

        command += f"pyautogui.write({repr(text)}); "

        if enter:
            command += "pyautogui.press('enter'); "

        command += "print('Type Success')"

        return command

    @classmethod
    @agent_action
    def drag_and_drop(cls, drag_from_coordinate: List, drop_on_coordinate: List):
        """
        Drag element1 and drop it on element2

        Args:
            drag_from_coordinate (List): [x, y], coordinate of element to drag
            drop_on_coordinate (List): [x, y], coordinate of element to drop on
        """
        x1, y1 = drag_from_coordinate
        if cls.relative_coordinate:
            x1, y1 = round(x1 * 1920 / 1000), round(y1 * 1080 / 1000)
        x2, y2 = drop_on_coordinate
        if cls.relative_coordinate:
            x2, y2 = round(x2 * 1920 / 1000), round(y2 * 1080 / 1000)

        command = f"pyautogui.moveTo({x1}, {y1}); "
        # TODO: specified duration?
        command += f"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); "

        command += "print('Drag and Drop Success')"

        return command

    @classmethod
    @agent_action
    def scroll(cls, coordinate: List, direction: str):
        """
        Scroll the element in the specified direction

        Args:
            coordinate (List): [x, y], coordinate of the element to scroll in
            direction (str): the direction to scroll ("up" or "down")
        """
        x, y = coordinate
        if cls.relative_coordinate:
            x, y = round(x * 1920 / 1000), round(y * 1080 / 1000)
        amount = 100 if direction == "up" else -100
        return f"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({amount}); print('Scroll Success')"

    @classmethod
    @agent_action
    def open_app(cls, app_name: str):
        """
        Open a specified application

        Supported apps: chrome, files, terminal, gedit, libreoffice writer,
        libreoffice calc, libreoffice impress, vs code, vlc, gimp, settings, thunderbird

        Args:
            app_name (str): name of the application to open
        """
        app_name = app_name.lower().strip()

        if app_name not in launch_app_commands:
            command = f"print(f'{app_name} is not supported or recognized')"
        else:
            command = {
                "action_type": "OPEN_APP",
                "parameters": {"launch_app_command": launch_app_commands[app_name], "app_name": app_name},
            }

        return command

    @classmethod
    @agent_action
    def switch_window(cls, window_id: str):
        """
        Switch to the window with the given window id

        Args:
            window_id (str): the window id to switch to from the provided list of open windows
        """
        return switch_window_code.replace("WINDOW_ID", window_id)

    @classmethod
    @agent_action
    def hotkey(cls, keys: List):
        """
        Press a hotkey combination

        Args:
            keys (List): the keys to press in combination (e.g. ['ctrl', 'c'] for copy, ['prtsc'] for screenshot)
        """
        # add quotes around the keys
        keys = [f"'{key}'" for key in keys]
        key_str = ", ".join(keys).replace("'", "\\'")
        return f"import pyautogui; pyautogui.hotkey({', '.join(keys)}); print(f'Press Hotkey: {key_str}')"

    @classmethod
    @agent_action
    def quote(cls, content: str):
        """
        Quote information from the current page for memory

        Args:
            content (str): text summarized or copied from the page for later operation
        """
        return f'''print("""{content}""")'''

    @classmethod
    @agent_action
    def wait(cls):
        """
        Wait for a while
        """
        return "WAIT"

    @classmethod
    @agent_action
    def exit(cls, success: bool):
        """
        End the current task

        Args:
            success (bool): True if successfully finish a task, False otherwise
        """
        if success:
            return "DONE"
        else:
            return "FAIL"
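The methods above do not touch the VM themselves; each returns a pyautogui command string (or a WAIT/DONE/FAIL token, or an OPEN_APP dict) that the controller executes later, and with relative_coordinate set to True the model's 0-1000 coordinates are mapped onto the 1920x1080 screen before the string is built. A quick sketch of what the grounding step emits:

GroundingAgent.relative_coordinate = True
print(GroundingAgent.click([500, 500], num_clicks=2))
# pyautogui.click(960, 540, clicks=2, button='left'); print("Click Success")

GroundingAgent.relative_coordinate = False
print(GroundingAgent.click([100, 200], button_type="right"))
# pyautogui.click(100, 200, clicks=1, button='right'); print("Click Success")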
194
mm_agents/autoglm_v/prompt/procedural_memory.py
Normal file
194
mm_agents/autoglm_v/prompt/procedural_memory.py
Normal file
@@ -0,0 +1,194 @@
|
||||
import inspect
|
||||
import json
|
||||
import os
|
||||
import textwrap
|
||||
|
||||
current_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
|
||||
|
||||
def generate_func(json_data):
|
||||
# 收集所有类名和它们的函数
|
||||
class_funcs = {}
|
||||
no_class_funcs = []
|
||||
cls_name = ""
|
||||
|
||||
for item in json_data:
|
||||
if item["type"] == "function":
|
||||
func = item["function"]
|
||||
func_parts = func["name"].split(".")
|
||||
|
||||
if len(func_parts) == 2:
|
||||
class_name, func_name = func_parts
|
||||
if class_name not in class_funcs:
|
||||
class_funcs[class_name] = []
|
||||
class_funcs[class_name].append(item)
|
||||
else:
|
||||
no_class_funcs.append(item)
|
||||
|
||||
code = ""
|
||||
|
||||
# 生成有类的函数
|
||||
for class_name, funcs in class_funcs.items():
|
||||
code += f"class {class_name}:\n"
|
||||
cls_name = class_name
|
||||
for item in funcs:
|
||||
func = item["function"]
|
||||
func_name = func["name"].split(".")[-1]
|
||||
description = func["description"]
|
||||
params = func["parameters"]["properties"]
|
||||
required = func["parameters"].get("required", [])
|
||||
|
||||
# 构建参数列表
|
||||
param_list = ["cls"]
|
||||
# 首先添加必需参数
|
||||
for param_name in required:
|
||||
param_list.append(f"{param_name}")
|
||||
# 然后添加可选参数
|
||||
for param_name in params:
|
||||
if param_name not in required:
|
||||
param_list.append(f"{param_name}") # 可选参数默认值设为None
|
||||
|
||||
# 构建函数定义
|
||||
func_def = f" def {func_name}({', '.join(param_list)}):\n"
|
||||
|
||||
# 构建文档字符串
|
||||
docstring = f' """\n {description}\n\n Args:\n'
|
||||
if len(param_list) == 1: # 只有cls参数
|
||||
docstring += " None\n"
|
||||
else:
|
||||
# 首先记录必需参数
|
||||
for param_name in required:
|
||||
param_type = params[param_name]["type"]
|
||||
param_desc = params[param_name].get("description", "")
|
||||
docstring += f" {param_name} ({param_type}): {param_desc}\n"
|
||||
# 然后记录可选参数
|
||||
for param_name in params:
|
||||
if param_name not in required:
|
||||
param_type = params[param_name]["type"]
|
||||
param_desc = params[param_name].get("description", "")
|
||||
docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"
|
||||
|
||||
docstring += ' """\n'
|
||||
|
||||
code += func_def + docstring + "\n"
|
||||
|
||||
code += "\n"
|
||||
|
||||
# 生成没有类的函数
|
||||
for item in no_class_funcs:
|
||||
func = item["function"]
|
||||
func_name = func["name"]
|
||||
description = func["description"]
|
||||
params = func["parameters"]["properties"]
|
||||
required = func["parameters"].get("required", [])
|
||||
|
||||
# 构建参数列表
|
||||
param_list = []
|
||||
# 首先添加必需参数
|
||||
for param_name in required:
|
||||
param_list.append(f"{param_name}")
|
||||
# 然后添加可选参数
|
||||
for param_name in params:
|
||||
if param_name not in required:
|
||||
param_list.append(f"{param_name}")
|
||||
|
||||
# 构建函数定义
|
||||
func_def = f"def {func_name}({', '.join(param_list)}):\n"
|
||||
|
||||
# 构建文档字符串
|
||||
docstring = f' """\n {description}\n\n Args:\n'
|
||||
if not param_list:
|
||||
docstring += " None\n"
|
||||
else:
|
||||
# 首先记录必需参数
|
||||
for param_name in required:
|
||||
param_type = params[param_name]["type"]
|
||||
param_desc = params[param_name].get("description", "")
|
||||
docstring += f" {param_name} ({param_type}): {param_desc}\n"
|
||||
# 然后记录可选参数
|
||||
for param_name in params:
|
||||
if param_name not in required:
|
||||
param_type = params[param_name]["type"]
|
||||
param_desc = params[param_name].get("description", "")
|
||||
docstring += f" {param_name} ({param_type}, optional): {param_desc}\n"
|
||||
|
||||
docstring += ' """\n'
|
||||
|
||||
code += func_def + docstring + "\n"
|
||||
|
||||
return code.strip(), cls_name
|
||||
|
||||
|
||||
setup_prompt = """You are a GUI operation agent. You will be given a task and your action history, with current observation ({observation_list}). You should help me control the computer, output the best action step by step to accomplish the task.
|
||||
You should first generate a plan, reflect on the current observation, then generate actions to complete the task in python-style pseudo code using the predefined functions.
|
||||
|
||||
* Output Format:
|
||||
{format_hint}"""
|
||||
|
||||
func_def_template = """* Available Functions:
|
||||
```python
|
||||
{class_content}
|
||||
```"""
|
||||
|
||||
note_prompt = """* Note:
|
||||
- Your code should only be wrapped in ```python```.
|
||||
- Only **ONE-LINE-OF-CODE** at a time.
|
||||
- Each code block is context independent, and variables from the previous round cannot be used in the next round.
|
||||
{relative_coordinate_hint}- Return with `Agent.exit(success=True)` immediately after the task is completed.
|
||||
- The computer's environment is Linux, e.g., Desktop path is '/home/user/Desktop'
|
||||
- My computer's password is '{client_password}', feel free to use it when you need sudo rights"""
|
||||
|
||||
|
||||
class Prompt:
    @staticmethod
    def construct_procedural_memory(agent_class, app_name=None, client_password="password", with_image=True, with_atree=False, relative_coordinate=True, glm41v_format=True):
        agent_class_content = "Class Agent:"
        for attr_name in dir(agent_class):
            attr = getattr(agent_class, attr_name)
            if callable(attr) and hasattr(attr, "is_agent_action"):
                # Use inspect to get the full function signature
                signature = inspect.signature(attr)
                agent_class_content += f"""
    def {attr_name}{signature}:
        '''{attr.__doc__}'''
    """

        if app_name is not None:
            tool_path = os.path.join(current_dir, "tools", "apis", f"{app_name.lower()}.json")
            with open(tool_path, "r") as f:
                json_data = json.load(f)

            tool_class_content, tool_class_name = generate_func(json_data)

            agent_class_content += "\n\n{}".format(tool_class_content)

        func_def_prompt = func_def_template.format(class_content=agent_class_content.strip())

        # --- dynamic observation list ---
        obs_items = []
        if with_image:
            obs_items.append("screenshot")
        obs_items.append("current app name")
        if with_atree:
            obs_items.append("a11y tree (based on AT-SPI library)")
        obs_items.append("app info")
        obs_items.append("last action result")
        observation_list = ", ".join(obs_items)

        setup_prompt_formatted = setup_prompt.format(
            observation_list=observation_list,
            format_hint="<think>\n{**YOUR-PLAN-AND-THINKING**}</think>\n<answer>```python\n{**ONE-LINE-OF-CODE**}\n```</answer>" if glm41v_format else "<think>\n{**YOUR-PLAN-AND-THINKING**}\n</think>\n```python\n{**ONE-LINE-OF-CODE**}\n```"
        )

        note_prompt_formatted = note_prompt.format(
            relative_coordinate_hint="- The coordinate [x, y] should be normalized to 0-1000, which usually should be the center of a specific target element.\n" if relative_coordinate else "",
            client_password=client_password
        )

        return setup_prompt_formatted, func_def_prompt, note_prompt_formatted


if __name__ == "__main__":
    from grounding_agent import GroundingAgent

    print(Prompt.construct_procedural_memory(GroundingAgent, "vlc"))
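A minimal usage sketch (not part of this commit's diff) of how the three strings returned by `Prompt.construct_procedural_memory` might be stitched into one system prompt; the join order and the keyword arguments below are illustrative assumptions, not confirmed repository behavior.

```python
# Sketch only: assembling the returned prompt pieces; the "\n\n".join order is an assumption.
from grounding_agent import GroundingAgent

setup, func_defs, notes = Prompt.construct_procedural_memory(
    GroundingAgent, app_name="vlc", client_password="password"
)
system_prompt = "\n\n".join([setup, func_defs, notes])
print(system_prompt[:200])
```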
3
mm_agents/autoglm_v/tools/apis/__init__.py
Normal file
@@ -0,0 +1,3 @@
from .func import generate_func

__all__ = ["generate_func"]
236
mm_agents/autoglm_v/tools/apis/code.json
Normal file
@@ -0,0 +1,236 @@
[
  {"type": "function", "function": {"name": "CodeTools.launch_vscode", "description": "Launch VS Code with specified path", "parameters": {"type": "object", "properties": {"path": {"type": "string", "description": "File path or directory to open"}}, "required": ["path"]}}},
  {"type": "function", "function": {"name": "CodeTools.compare_files", "description": "Compare two files in VS Code", "parameters": {"type": "object", "properties": {"file1": {"type": "string", "description": "First file path"}, "file2": {"type": "string", "description": "Second file path"}}, "required": ["file1", "file2"]}}},
  {"type": "function", "function": {"name": "CodeTools.add_folder", "description": "Add folder to active VS Code window", "parameters": {"type": "object", "properties": {"folder": {"type": "string", "description": "Folder path to add"}}, "required": ["folder"]}}},
  {"type": "function", "function": {"name": "CodeTools.goto_file", "description": "Open file at specific position", "parameters": {"type": "object", "properties": {"file_path": {"type": "string", "description": "File path to open"}, "line": {"type": "integer", "description": "Line number", "default": 1}, "character": {"type": "integer", "description": "Character position", "default": 1}}, "required": ["file_path"]}}},
  {"type": "function", "function": {"name": "CodeTools.perform_merge", "description": "Perform three-way merge", "parameters": {"type": "object", "properties": {"path1": {"type": "string", "description": "First version file path"}, "path2": {"type": "string", "description": "Second version file path"}, "base": {"type": "string", "description": "Base version file path"}, "result": {"type": "string", "description": "Output file path"}}, "required": ["path1", "path2", "base", "result"]}}},
  {"type": "function", "function": {"name": "CodeTools.remove_folder", "description": "Remove folder from active VS Code window", "parameters": {"type": "object", "properties": {"folder": {"type": "string", "description": "Folder path to remove"}}, "required": ["folder"]}}},
  {"type": "function", "function": {"name": "CodeTools.install_extension", "description": "Install or update VS Code extension", "parameters": {"type": "object", "properties": {"extension_id": {"type": "string", "description": "Extension identifier"}, "pre_release": {"type": "boolean", "description": "Install pre-release version", "default": false}}, "required": ["extension_id"]}}},
  {"type": "function", "function": {"name": "CodeTools.uninstall_extension", "description": "Uninstall VS Code extension", "parameters": {"type": "object", "properties": {"extension_id": {"type": "string", "description": "Extension identifier"}}, "required": ["extension_id"]}}},
  {"type": "function", "function": {"name": "CodeTools.list_extensions", "description": "List installed extensions", "parameters": {"type": "object", "properties": {"show_versions": {"type": "boolean", "description": "Show extension versions", "default": false}, "category": {"type": "string", "description": "Filter by category"}}}}},
  {"type": "function", "function": {"name": "CodeTools.update_extensions", "description": "Update all extensions to latest version", "parameters": {"type": "object", "properties": {}}}},
  {"type": "function", "function": {"name": "CodeTools.disable_extension", "description": "Disable extension for next VS Code instance", "parameters": {"type": "object", "properties": {"extension_id": {"type": "string", "description": "Extension identifier"}}, "required": ["extension_id"]}}},
  {"type": "function", "function": {"name": "CodeTools.toggle_sync", "description": "Toggle VS Code synchronization", "parameters": {"type": "object", "properties": {"state": {"type": "string", "description": "Sync state", "enum": ["on", "off"]}}, "required": ["state"]}}}
]
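For context on how this spec is consumed: each `CodeTools.*` entry above names a classmethod implemented later in this commit (`mm_agents/autoglm_v/tools/package/code.py`) that shells out to the VS Code CLI. A minimal sketch of the `compare_files` mapping, assuming the `code` binary is on PATH and using hypothetical file paths:

```python
# Sketch only: the "CodeTools.compare_files" spec maps onto `code -d <file1> <file2>`,
# mirroring the implementation shown further down in this commit.
import subprocess

def compare_files(file1: str, file2: str) -> None:
    # `code -d` opens the two files side by side in VS Code's diff editor
    subprocess.run(["code", "-d", file1, file2], check=True)

compare_files("/home/user/Desktop/a.py", "/home/user/Desktop/b.py")  # hypothetical paths
```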
117
mm_agents/autoglm_v/tools/apis/func.py
Normal file
@@ -0,0 +1,117 @@
def generate_func(json_data):
    # Collect all class names and their functions
    class_funcs = {}
    no_class_funcs = []

    for item in json_data:
        if item["type"] == "function":
            func = item["function"]
            func_parts = func["name"].split(".")

            if len(func_parts) == 2:
                class_name, func_name = func_parts
                if class_name not in class_funcs:
                    class_funcs[class_name] = []
                class_funcs[class_name].append(item)
            else:
                no_class_funcs.append(item)

    code = ""

    # Generate the functions that belong to a class
    for class_name, funcs in class_funcs.items():
        code += f"class {class_name}:\n"
        for item in funcs:
            func = item["function"]
            func_name = func["name"].split(".")[-1]
            description = func["description"]
            params = func["parameters"]["properties"]
            required = func["parameters"].get("required", [])

            # Build the parameter list
            param_list = ["cls"]
            # Add required parameters first
            for param_name in required:
                param_list.append(f"{param_name}")
            # Then add optional parameters
            for param_name in params:
                if param_name not in required:
                    param_list.append(f"{param_name}")  # optional parameters default to None

            # Build the function definition
            func_def = f"    def {func_name}({', '.join(param_list)}):\n"

            # Build the docstring
            docstring = f'    """\n    {description}\n\n    Args:\n'
            if len(param_list) == 1:  # only the cls parameter
                docstring += "        None\n"
            else:
                # Document required parameters first
                for param_name in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f"        {param_name} ({param_type}): {param_desc}\n"
                # Then document optional parameters
                for param_name in params:
                    if param_name not in required:
                        param_type = params[param_name]["type"]
                        param_desc = params[param_name].get("description", "")
                        docstring += f"        {param_name} ({param_type}, optional): {param_desc}\n"

            docstring += '    """\n'

            code += func_def + docstring + "\n"

        code += "\n"

    # Generate the functions without a class
    for item in no_class_funcs:
        func = item["function"]
        func_name = func["name"]
        description = func["description"]
        params = func["parameters"]["properties"]
        required = func["parameters"].get("required", [])

        # Build the parameter list
        param_list = []
        # Add required parameters first
        for param_name in required:
            param_list.append(f"{param_name}")
        # Then add optional parameters
        for param_name in params:
            if param_name not in required:
                param_list.append(f"{param_name}")

        # Build the function definition
        func_def = f"def {func_name}({', '.join(param_list)}):\n"

        # Build the docstring
        docstring = f'    """\n    {description}\n\n    Args:\n'
        if not param_list:
            docstring += "        None\n"
        else:
            # Document required parameters first
            for param_name in required:
                param_type = params[param_name]["type"]
                param_desc = params[param_name].get("description", "")
                docstring += f"        {param_name} ({param_type}): {param_desc}\n"
            # Then document optional parameters
            for param_name in params:
                if param_name not in required:
                    param_type = params[param_name]["type"]
                    param_desc = params[param_name].get("description", "")
                    docstring += f"        {param_name} ({param_type}, optional): {param_desc}\n"

        docstring += '    """\n'

        code += func_def + docstring + "\n"

    return code.strip()


if __name__ == "__main__":
    import json

    with open("libreoffice_calc.json", "r") as f:
        json_data = json.load(f)
    print(generate_func(json_data))
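A small worked example (not part of the diff) of what `generate_func` emits for a single spec entry taken from `vlc.json`; the printed stub in the trailing comment is approximate, since the exact whitespace depends on the docstring template above.

```python
# Sketch only: one vlc.json entry fed through generate_func; output shown as comments.
entry = [{
    "type": "function",
    "function": {
        "name": "VLCTools.add_to_playlist",
        "description": "Add media file to playlist",
        "parameters": {
            "type": "object",
            "properties": {"uri": {"type": "string", "description": "Media file URI (file:// or https://)"}},
            "required": ["uri"],
        },
    },
}]
print(generate_func(entry))
# Prints roughly (whitespace approximate):
# class VLCTools:
#     def add_to_playlist(cls, uri):
#     """
#     Add media file to playlist
#
#     Args:
#         uri (string): Media file URI (file:// or https://)
#     """
```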
134
mm_agents/autoglm_v/tools/apis/google_chrome.json
Normal file
@@ -0,0 +1,134 @@
[
  {"type": "function", "function": {"name": "BrowserTools.open_profile_settings", "description": "Opens profile settings page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_password_settings", "description": "Opens password/autofill settings page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_privacy_settings", "description": "Opens privacy settings page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_appearance_settings", "description": "Opens appearance settings page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_search_engine_settings", "description": "Opens search engine settings page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.bring_back_last_tab", "description": "Restores last-closed tab (Ctrl+Shift+T).", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.print", "description": "Opens print dialog (Ctrl+P).", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.delete_browsing_data", "description": "Opens clear browsing data dialog (Ctrl+Shift+Del).", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_extensions", "description": "Opens extensions management page.", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.bookmark_page", "description": "Bookmarks current page (Ctrl+D).", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "BrowserTools.open_bookmarks", "description": "Opens bookmarks page.", "parameters": {"type": "object", "properties": {}, "required": []}}}
]
634
mm_agents/autoglm_v/tools/apis/libreoffice_calc.json
Normal file
@@ -0,0 +1,634 @@
[
  {"type": "function", "function": {"name": "CalcTools.get_workbook_info", "description": "Get workbook info: file path, name, sheets, and active sheet", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.save", "description": "Save workbook to current location", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.get_column_data", "description": "Get all data from specified column", "parameters": {"type": "object", "properties": {"column_name": {"type": "string", "description": "Column name (e.g. 'A', 'B')"}}, "required": ["column_name"]}}},
  {"type": "function", "function": {"name": "CalcTools.switch_active_sheet", "description": "Switch to sheet (creates if not exists)", "parameters": {"type": "object", "properties": {"sheet_name": {"type": "string", "description": "Sheet name"}}, "required": ["sheet_name"]}}},
  {"type": "function", "function": {"name": "CalcTools.set_column_values", "description": "Set values to column (values only, not formulas)", "parameters": {"type": "object", "properties": {"column_name": {"type": "string", "description": "Column name (e.g. 'A', 'B')"}, "data": {"type": "array", "description": "Values to write"}, "start_index": {"type": "integer", "description": "First row index (default: 2)"}}, "required": ["column_name", "data"]}}},
  {"type": "function", "function": {"name": "CalcTools.highlight_range", "description": "Highlight range with color", "parameters": {"type": "object", "properties": {"range_str": {"type": "string", "description": "Range (e.g. 'A1:B10')"}, "color": {"type": "integer", "description": "Color value (default: 0xFF0000)"}}, "required": ["range_str"]}}},
  {"type": "function", "function": {"name": "CalcTools.transpose_range", "description": "Transpose range and paste to target cell", "parameters": {"type": "object", "properties": {"source_range": {"type": "string", "description": "Source range (e.g. 'A1:B10')"}, "target_cell": {"type": "string", "description": "Target cell (e.g. 'A1')"}}, "required": ["source_range", "target_cell"]}}},
  {"type": "function", "function": {"name": "CalcTools.export_to_csv", "description": "Export to CSV with same path/name", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.sort_column", "description": "Sort column data", "parameters": {"type": "object", "properties": {"column_name": {"type": "string", "description": "Column name (e.g. 'A', 'B')"}, "ascending": {"type": "boolean", "description": "Sort ascending (default: true)"}, "start_index": {"type": "integer", "description": "First row index (default: 2)"}}, "required": ["column_name"]}}},
  {"type": "function", "function": {"name": "CalcTools.set_validation_list", "description": "Set validation list for column", "parameters": {"type": "object", "properties": {"column_name": {"type": "string", "description": "Column name (e.g. 'A', 'B')"}, "values": {"type": "array", "description": "Validation values"}}, "required": ["column_name", "values"]}}},
  {"type": "function", "function": {"name": "CalcTools.hide_row_data", "description": "Hide rows containing value", "parameters": {"type": "object", "properties": {"value": {"type": "string", "description": "Value to hide (default: 'N/A')"}}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.reorder_columns", "description": "Reorder columns by specified order", "parameters": {"type": "object", "properties": {"column_order": {"type": "array", "description": "Column names in desired order (e.g. ['A', 'B', 'C'])"}}, "required": ["column_order"]}}},
  {"type": "function", "function": {"name": "CalcTools.create_pivot_table", "description": "Create pivot table from source sheet", "parameters": {"type": "object", "properties": {"source_sheet": {"type": "string", "description": "Source sheet name"}, "table_name": {"type": "string", "description": "Pivot table name"}, "row_fields": {"type": "array", "description": "Row labels (e.g. ['A', 'B'])"}, "col_fields": {"type": "array", "description": "Column labels (e.g. ['A', 'B'])"}, "value_fields": {"type": "array", "description": "Value fields (e.g. ['A', 'B'])"}, "aggregation_function": {"type": "string", "description": "Aggregation function (sum, count, average, min, max)"}, "target_cell": {"type": "string", "description": "Target cell (default: 'A1')"}}, "required": ["source_sheet", "table_name", "value_fields"]}}},
  {"type": "function", "function": {"name": "CalcTools.merge_cells", "description": "Merge cells in range", "parameters": {"type": "object", "properties": {"range_str": {"type": "string", "description": "Cell range (e.g. 'A1:B10')"}}, "required": ["range_str"]}}},
  {"type": "function", "function": {"name": "CalcTools.set_cell_value", "description": "Set cell value", "parameters": {"type": "object", "properties": {"cell": {"type": "string", "description": "Cell reference (e.g. 'A1')"}, "value": {"type": "string", "description": "Cell value"}}, "required": ["cell", "value"]}}},
  {"type": "function", "function": {"name": "CalcTools.format_range", "description": "Apply formatting to range", "parameters": {"type": "object", "properties": {"range_str": {"type": "string", "description": "Range (e.g. 'A1:B10')"}, "background_color": {"type": "string", "description": "Background color (e.g. '#0000ff')"}, "font_color": {"type": "string", "description": "Font color (e.g. '#ffffff')"}, "bold": {"type": "boolean", "description": "Bold text"}, "alignment": {"type": "string", "description": "Text alignment (left, center, right)"}}, "required": ["range_str"]}}},
  {"type": "function", "function": {"name": "CalcTools.create_chart", "description": "Create chart from data range", "parameters": {"type": "object", "properties": {"chart_type": {"type": "string", "description": "Chart type (bar, column, line, pie, scatter, area)"}, "data_range": {"type": "string", "description": "Data range (e.g. 'A1:B10')"}, "title": {"type": "string", "description": "Chart title"}, "x_axis_title": {"type": "string", "description": "X axis title"}, "y_axis_title": {"type": "string", "description": "Y axis title"}}, "required": ["chart_type", "data_range"]}}},
  {"type": "function", "function": {"name": "CalcTools.freeze_panes", "description": "Freeze rows/columns", "parameters": {"type": "object", "properties": {"rows": {"type": "integer", "description": "Rows to freeze from top"}, "columns": {"type": "integer", "description": "Columns to freeze from left"}}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.rename_sheet", "description": "Rename worksheet", "parameters": {"type": "object", "properties": {"old_name": {"type": "string", "description": "Current sheet name"}, "new_name": {"type": "string", "description": "New sheet name"}}, "required": ["old_name", "new_name"]}}},
  {"type": "function", "function": {"name": "CalcTools.copy_sheet", "description": "Copy worksheet", "parameters": {"type": "object", "properties": {"source_sheet": {"type": "string", "description": "Source sheet name"}, "new_sheet_name": {"type": "string", "description": "New sheet name (optional)"}}, "required": ["source_sheet"]}}},
  {"type": "function", "function": {"name": "CalcTools.reorder_sheets", "description": "Change sheet order", "parameters": {"type": "object", "properties": {"sheet_name": {"type": "string", "description": "Sheet to move"}, "position": {"type": "integer", "description": "New position (0-based)"}}, "required": ["sheet_name", "position"]}}},
  {"type": "function", "function": {"name": "CalcTools.set_chart_legend_position", "description": "Set chart legend position", "parameters": {"type": "object", "properties": {"position": {"type": "string", "description": "Legend position (top, bottom, left, right, none)"}}, "required": ["position"]}}},
  {"type": "function", "function": {"name": "CalcTools.set_number_format", "description": "Apply number format to range", "parameters": {"type": "object", "properties": {"range_str": {"type": "string", "description": "Range (e.g. 'A1:B10')"}, "format_type": {"type": "string", "description": "Format type (general, number, currency, accounting, date, time, percentage, fraction, scientific, text)"}, "decimal_places": {"type": "integer", "description": "Decimal places (optional)"}}, "required": ["range_str", "format_type"]}}},
  {"type": "function", "function": {"name": "CalcTools.adjust_column_width", "description": "Adjust column width", "parameters": {"type": "object", "properties": {"columns": {"type": "string", "description": "Column range (e.g. 'A:C')"}, "width": {"type": "number", "description": "Width in characters"}, "autofit": {"type": "boolean", "description": "Autofit to content"}}, "required": ["columns"]}}},
  {"type": "function", "function": {"name": "CalcTools.adjust_row_height", "description": "Adjust row height", "parameters": {"type": "object", "properties": {"rows": {"type": "string", "description": "Row range (e.g. '1:10')"}, "height": {"type": "number", "description": "Height in points"}, "autofit": {"type": "boolean", "description": "Autofit to content"}}, "required": ["rows"]}}},
  {"type": "function", "function": {"name": "CalcTools.export_to_pdf", "description": "Export to PDF", "parameters": {"type": "object", "properties": {"file_path": {"type": "string", "description": "PDF save path (default: same as original)"}, "sheets": {"type": "array", "description": "Sheets to include (default: all)"}, "open_after_export": {"type": "boolean", "description": "Open PDF after export (default: false)"}}, "required": []}}},
  {"type": "function", "function": {"name": "CalcTools.set_zoom_level", "description": "Set worksheet zoom level", "parameters": {"type": "object", "properties": {"zoom_percentage": {"type": "integer", "description": "Zoom percentage (10-400)"}}, "required": ["zoom_percentage"]}}}
]
559
mm_agents/autoglm_v/tools/apis/libreoffice_impress.json
Normal file
@@ -0,0 +1,559 @@
[
  {"type": "function", "function": {"name": "ImpressTools.save", "description": "Save current presentation", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "ImpressTools.go_to_slide", "description": "Navigate to specific slide", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}}, "required": ["slide_index"]}}},
  {"type": "function", "function": {"name": "ImpressTools.get_slide_count", "description": "Get total slide count", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "ImpressTools.duplicate_slide", "description": "Duplicate slide and place at end", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index to duplicate (1-based)"}}, "required": ["slide_index"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_slide_font", "description": "Set font for all text in slide", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "font_name": {"type": "string", "description": "Font name (e.g., 'Arial', 'Times New Roman')"}}, "required": ["slide_index", "font_name"]}}},
  {"type": "function", "function": {"name": "ImpressTools.write_text", "description": "Write text to textbox", "parameters": {"type": "object", "properties": {"content": {"type": "string", "description": "Text content"}, "page_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "bold": {"type": "boolean", "description": "Bold text (default: false)"}, "italic": {"type": "boolean", "description": "Italic text (default: false)"}, "size": {"type": "integer", "description": "Font size"}, "append": {"type": "boolean", "description": "Append to existing text (default: false)"}}, "required": ["content", "page_index", "box_index"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_style", "description": "Set text style for textbox", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "bold": {"type": "boolean", "description": "Bold text"}, "italic": {"type": "boolean", "description": "Italic text"}, "underline": {"type": "boolean", "description": "Underline text"}}, "required": ["slide_index", "box_index"]}}},
  {"type": "function", "function": {"name": "ImpressTools.configure_auto_save", "description": "Configure auto-save settings", "parameters": {"type": "object", "properties": {"enabled": {"type": "boolean", "description": "Enable auto-save"}, "interval_minutes": {"type": "number", "description": "Auto-save interval in minutes (min: 1)"}}, "required": ["enabled", "interval_minutes"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_background_color", "description": "Set textbox background color", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "color": {"type": "string", "description": "Color name or hex code"}}, "required": ["slide_index", "box_index", "color"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_text_color", "description": "Set text color for textbox", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "color": {"type": "string", "description": "Color name or hex code"}}, "required": ["slide_index", "box_index", "color"]}}},
  {"type": "function", "function": {"name": "ImpressTools.delete_content", "description": "Delete textbox from slide", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}}, "required": ["slide_index", "box_index"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_slide_orientation", "description": "Set slide orientation", "parameters": {"type": "object", "properties": {"orientation": {"type": "string", "description": "Slide orientation", "enum": ["portrait", "landscape"]}}, "required": ["orientation"]}}},
  {"type": "function", "function": {"name": "ImpressTools.position_box", "description": "Position textbox or image on slide", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Box index (0-based)"}, "position": {"type": "string", "description": "Position on slide", "enum": ["left", "right", "center", "top", "bottom", "top-left", "top-right", "bottom-left", "bottom-right"]}}, "required": ["slide_index", "box_index", "position"]}}},
  {"type": "function", "function": {"name": "ImpressTools.insert_file", "description": "Insert video or audio file", "parameters": {"type": "object", "properties": {"file_path": {"type": "string", "description": "File path"}, "slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "position": {"type": "object", "description": "Position coordinates", "properties": {"x": {"type": "number", "description": "X position (% of slide width)"}, "y": {"type": "number", "description": "Y position (% of slide height)"}}}, "size": {"type": "object", "description": "Size dimensions", "properties": {"width": {"type": "number", "description": "Width (% of slide width)"}, "height": {"type": "number", "description": "Height (% of slide height)"}}}, "autoplay": {"type": "boolean", "description": "Auto-play media"}}, "required": ["file_path"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_slide_background", "description": "Set slide background color or image", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based). If not provided, applies to all slides"}, "color": {"type": "string", "description": "Background color"}, "image_path": {"type": "string", "description": "Background image path (overrides color)"}}, "required": []}}},
  {"type": "function", "function": {"name": "ImpressTools.save_as", "description": "Save document to specified location", "parameters": {"type": "object", "properties": {"file_path": {"type": "string", "description": "File save path with filename and extension"}, "overwrite": {"type": "boolean", "description": "Overwrite existing file (default: false)"}}, "required": ["file_path"]}}},
  {"type": "function", "function": {"name": "ImpressTools.insert_image", "description": "Insert image to slide", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "image_path": {"type": "string", "description": "Image file path"}, "width": {"type": "number", "description": "Image width in cm"}, "height": {"type": "number", "description": "Image height in cm"}, "position": {"type": "object", "description": "Position coordinates", "properties": {"x": {"type": "number", "description": "X position (% of slide width)"}, "y": {"type": "number", "description": "Y position (% of slide height)"}}}}, "required": ["slide_index", "image_path"]}}},
  {"type": "function", "function": {"name": "ImpressTools.configure_display_settings", "description": "Configure presentation display settings", "parameters": {"type": "object", "properties": {"use_presenter_view": {"type": "boolean", "description": "Use presenter view"}, "primary_monitor_only": {"type": "boolean", "description": "Use primary monitor only"}, "monitor_for_presentation": {"type": "integer", "description": "Monitor number for presentation"}}, "required": []}}},
  {"type": "function", "function": {"name": "ImpressTools.set_slide_number_color", "description": "Set slide number color", "parameters": {"type": "object", "properties": {"color": {"type": "string", "description": "Color name or hex code"}}, "required": ["color"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_text_strikethrough", "description": "Apply strikethrough formatting to text", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "line_numbers": {"type": "array", "items": {"type": "integer"}, "description": "Line numbers for strikethrough (1-based)"}, "apply": {"type": "boolean", "description": "Apply or remove strikethrough"}}, "required": ["slide_index", "box_index", "line_numbers", "apply"]}}},
  {"type": "function", "function": {"name": "ImpressTools.set_textbox_alignment", "description": "Set text alignment for textbox", "parameters": {"type": "object", "properties": {"slide_index": {"type": "integer", "description": "Slide index (1-based)"}, "box_index": {"type": "integer", "description": "Textbox index (0-based)"}, "alignment": {"type": "string", "description": "Text alignment", "enum": ["left", "center", "right", "justify"]}}, "required": ["slide_index", "box_index", "alignment"]}}},
  {"type": "function", "function": {"name": "ImpressTools.export_to_image", "description": "Export presentation or slide to image", "parameters": {"type": "object", "properties": {"file_path": {"type": "string", "description": "Image save path with filename and extension"}, "format": {"type": "string", "description": "Image format", "enum": ["png", "jpeg", "jpg", "gif", "bmp", "tiff"]}, "slide_index": {"type": "integer", "description": "Specific slide index (1-based). If not provided, exports all slides"}}, "required": ["file_path", "format"]}}}
]
412
mm_agents/autoglm_v/tools/apis/libreoffice_writer.json
Normal file
@@ -0,0 +1,412 @@
[
  {"type": "function", "function": {"name": "WriterTools.save", "description": "Save document to current location", "parameters": {"type": "object", "properties": {}, "required": []}}},
  {"type": "function", "function": {"name": "WriterTools.write_text", "description": "Write text at cursor position", "parameters": {"type": "object", "properties": {"text": {"type": "string", "description": "Text to write"}, "bold": {"type": "boolean", "description": "Apply bold formatting"}, "italic": {"type": "boolean", "description": "Apply italic formatting"}, "size": {"type": "number", "description": "Font size"}}, "required": ["text"]}}},
  {"type": "function", "function": {"name": "WriterTools.set_color", "description": "Change text color using regex pattern", "parameters": {"type": "object", "properties": {"pattern": {"type": "string", "description": "Regex pattern to match"}, "color": {"type": "number", "description": "Hex color code (e.g., 0x000000)"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["pattern", "color"]}}},
  {"type": "function", "function": {"name": "WriterTools.find_and_replace", "description": "Find and replace text using regex", "parameters": {"type": "object", "properties": {"pattern": {"type": "string", "description": "Regex pattern to find"}, "replacement": {"type": "string", "description": "Replacement text"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["pattern", "replacement"]}}},
  {"type": "function", "function": {"name": "WriterTools.set_font", "description": "Change font family", "parameters": {"type": "object", "properties": {"font_name": {"type": "string", "description": "Font name (e.g., 'Arial', 'Times New Roman')"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["font_name"]}}},
  {"type": "function", "function": {"name": "WriterTools.set_line_spacing", "description": "Set line spacing", "parameters": {"type": "object", "properties": {"spacing_value": {"type": "number", "description": "Spacing value (1.0=single, 2.0=double)"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["spacing_value"]}}},
  {"type": "function", "function": {"name": "WriterTools.remove_highlighting", "description": "Remove text highlighting", "parameters": {"type": "object", "properties": {"paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": []}}},
  {"type": "function", "function": {"name": "WriterTools.find_highlighted_text", "description": "Find text with specific highlight color", "parameters": {"type": "object", "properties": {"highlight_color": {"type": "string", "description": "Color name (e.g., 'yellow') or hex code"}}, "required": ["highlight_color"]}}},
  {"type": "function", "function": {"name": "WriterTools.insert_formula_at_cursor", "description": "Insert formula at cursor", "parameters": {"type": "object", "properties": {"formula": {"type": "string", "description": "Formula to insert"}}, "required": ["formula"]}}},
  {"type": "function", "function": {"name": "WriterTools.insert_image_at_cursor", "description": "Insert image at cursor", "parameters": {"type": "object", "properties": {"image_path": {"type": "string", "description": "Full path to image file"}, "width": {"type": "integer", "description": "Display width in pixels"}, "height": {"type": "integer", "description": "Display height in pixels"}}, "required": ["image_path"]}}},
  {"type": "function", "function": {"name": "WriterTools.set_strikethrough", "description": "Apply strikethrough formatting", "parameters": {"type": "object", "properties": {"pattern": {"type": "string", "description": "Regex pattern to match"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["pattern"]}}},
  {"type": "function", "function": {"name": "WriterTools.set_font_size", "description": "Change font size", "parameters": {"type": "object", "properties": {"font_size": {"type": "number", "description": "Font size in points"}, "pattern": {"type": "string", "description": "Regex pattern to match"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["font_size", "pattern"]}}},
  {"type": "function", "function": {"name": "WriterTools.export_to_pdf", "description": "Export document to PDF", "parameters": {"type": "object", "properties": {"output_path": {"type": "string", "description": "PDF save path"}, "output_filename": {"type": "string", "description": "PDF filename"}, "include_comments": {"type": "boolean", "description": "Include comments in PDF"}, "quality": {"type": "string", "description": "Export quality ('standard', 'high', 'print')"}}, "required": []}}},
  {"type": "function", "function": {"name": "WriterTools.set_paragraph_alignment", "description": "Set paragraph alignment", "parameters": {"type": "object", "properties": {"alignment": {"type": "string", "description": "Alignment type ('left', 'center', 'right', 'justify')"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["alignment"]}}},
  {"type": "function", "function": {"name": "WriterTools.capitalize_words", "description": "Capitalize first letter of each word", "parameters": {"type": "object", "properties": {"paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": []}}},
  {"type": "function", "function": {"name": "WriterTools.set_default_font", "description": "Set default font for new text", "parameters": {"type": "object", "properties": {"font_name": {"type": "string", "description": "Default font name"}, "font_size": {"type": "number", "description": "Default font size in points"}}, "required": ["font_name"]}}},
  {"type": "function", "function": {"name": "WriterTools.add_page_numbers", "description": "Add page numbers", "parameters": {"type": "object", "properties": {"position": {"type": "string", "description": "Position ('bottom_left', 'bottom_center', 'bottom_right', 'top_left', 'top_center', 'top_right')"}, "start_number": {"type": "integer", "description": "Starting page number"}, "format": {"type": "string", "description": "Number format (e.g., '1', 'Page 1', '1 of N')"}}, "required": ["position"]}}},
  {"type": "function", "function": {"name": "WriterTools.insert_page_break", "description": "Insert page break", "parameters": {"type": "object", "properties": {"position": {"type": "string", "description": "Insert location ('at_cursor', 'end_of_document')"}}, "required": []}}},
  {"type": "function", "function": {"name": "WriterTools.change_text_case", "description": "Change text case", "parameters": {"type": "object", "properties": {"case_type": {"type": "string", "description": "Case type ('lowercase', 'uppercase')"}, "pattern": {"type": "string", "description": "Regex pattern to match"}, "paragraph_indices": {"type": "array", "description": "Target paragraph indices (0-based). Applies to all if omitted"}}, "required": ["case_type", "pattern"]}}}
]
166
mm_agents/autoglm_v/tools/apis/vlc.json
Normal file
@@ -0,0 +1,166 @@
|
||||
[
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_playlist",
|
||||
"description": "Get current playlist with track info",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.play",
|
||||
"description": "Start playing current media",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.pause",
|
||||
"description": "Pause current media",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.next",
|
||||
"description": "Switch to next track",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.previous",
|
||||
"description": "Switch to previous track",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.add_to_playlist",
|
||||
"description": "Add media file to playlist",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"uri": {
|
||||
"type": "string",
|
||||
"description": "Media file URI (file:// or https://)"
|
||||
}
|
||||
},
|
||||
"required": ["uri"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_current_time",
|
||||
"description": "Get current playback position in seconds",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_media_duration",
|
||||
"description": "Get media duration in seconds",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.toggle_fullscreen",
|
||||
"description": "Toggle or set fullscreen mode",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"enable": {
|
||||
"type": "boolean",
|
||||
"description": "Force fullscreen on/off, omit to toggle"
|
||||
}
|
||||
},
|
||||
"required": []
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_settings",
|
||||
"description": "Get VLC settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {}
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.set_settings",
|
||||
"description": "Set VLC settings",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"field": {
|
||||
"type": "string",
|
||||
"description": "Setting name (e.g. qt-max-volume, qt-minimal-view)"
|
||||
},
|
||||
"value": {
|
||||
"type": "string",
|
||||
"description": "Setting value (use 0/1 for booleans)"
|
||||
}
|
||||
},
|
||||
"required": ["field", "value"]
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "VLCTools.get_media_files",
|
||||
"description": "Get media files from path",
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"path": {
|
||||
"type": "string",
|
||||
"description": "Directory path"
|
||||
},
|
||||
"suffix": {
|
||||
"type": "array",
|
||||
"description": "File extensions, default: ['mp4','avi','mkv','mov','mp3','m4a','wav']"
|
||||
}
|
||||
},
|
||||
"required": ["path"]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
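The tool schemas above follow the OpenAI function-calling format, with each `name` pointing at a `VLCTools` classmethod. Below is a minimal sketch of how a parsed tool call could be routed to the matching method; the dispatcher itself is illustrative and not part of this diff.

```python
import json

def dispatch_tool_call(tool_call, namespaces):
    # e.g. tool_call = {"name": "VLCTools.add_to_playlist",
    #                   "arguments": {"uri": "file:///home/user/song.mp3"}}
    cls_name, method_name = tool_call["name"].split(".", 1)
    method = getattr(namespaces[cls_name], method_name)
    args = tool_call.get("arguments", {})
    if isinstance(args, str):  # some models return arguments as a JSON string
        args = json.loads(args)
    return method(**args)
```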
260
mm_agents/autoglm_v/tools/package/code.py
Normal file
@@ -0,0 +1,260 @@
|
||||
import json
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class CodeTools:
|
||||
ret = ""
|
||||
|
||||
@classmethod
|
||||
def print_result(cls):
|
||||
"""打印执行结果"""
|
||||
print(cls.ret)
|
||||
|
||||
@classmethod
|
||||
def launch_vscode(cls, path):
|
||||
"""
|
||||
Launches Visual Studio Code with the specified file path or directory.
|
||||
在存在的窗口中打开一个文件或目录。
|
||||
|
||||
Args:
|
||||
path (str): 文件路径或目录。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "-r", path], check=True)
|
||||
cls.ret = "Successfully launched VS Code"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error launching VS Code: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def env_info(cls):
|
||||
cls.ret = "None"
|
||||
|
||||
@classmethod
|
||||
def compare_files(cls, file1, file2):
|
||||
"""
|
||||
Compares two files in VSCode.
|
||||
在VSCode中比较两个文件。
|
||||
|
||||
Args:
|
||||
file1 (str): 第一个文件的路径。
|
||||
file2 (str): 第二个文件的路径。
|
||||
"""
|
||||
try:
|
||||
# Open the diff (compare) view for the two files
|
||||
subprocess.run(["code", "-d", file1, file2], check=True)
|
||||
cls.ret = "The compared files are opened in VSCode"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error comparing files: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def add_folder(cls, folder):
|
||||
"""
|
||||
Adds a folder to the last active window in VSCode.
|
||||
向VSCode的最后一个活动窗口添加文件夹。
|
||||
|
||||
Args:
|
||||
folder (str): 文件夹路径。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "-a", folder], check=True)
|
||||
cls.ret = "Successfully added folder"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error adding folder: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def goto_file(cls, file_path, line=1, character=1):
|
||||
"""
|
||||
Opens a file at a specific line and character position.
|
||||
在特定行和字符的位置打开文件。
|
||||
|
||||
Args:
|
||||
file_path (str): 文件路径。
|
||||
line (int): 行号。
|
||||
character (int): 字符位置。
|
||||
"""
|
||||
try:
|
||||
command = f"{file_path}:{line}:{character}"
|
||||
subprocess.run(["code", "-g", command], check=True)
|
||||
cls.ret = "Successfully opened file, line: {}, character: {}".format(line, character)
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error going to file: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def perform_merge(cls, path1, path2, base, result):
|
||||
"""
|
||||
Perform a three-way merge.
|
||||
执行三方合并。
|
||||
|
||||
Args:
|
||||
path1 (str): 第一版本文件路径。
|
||||
path2 (str): 第二版本文件路径。
|
||||
base (str): 基础版本文件路径。
|
||||
result (str): 结果文件的保存路径。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "-m", path1, path2, base, result], check=True)
|
||||
cls.ret = "Successfully performed merge"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error performing merge: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def remove_folder(cls, folder):
|
||||
"""
|
||||
Removes a folder from the last active window in VSCode.
|
||||
在VSCode的最后一个活动窗口中移除文件夹。
|
||||
|
||||
Args:
|
||||
folder (str): 文件夹路径。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "--remove", folder], check=True)
|
||||
cls.ret = "Successfully removed folder"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error removing folder: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def install_extension(cls, extension_id, pre_release=False):
|
||||
"""
|
||||
Installs an extension or updates it in VSCode.
|
||||
安装或更新VSCode中的扩展。
|
||||
|
||||
Args:
|
||||
extension_id (str): 扩展的标识符。
|
||||
pre_release (bool): 是否安装预发布版本。
|
||||
"""
|
||||
try:
|
||||
command = ["code", "--install-extension", extension_id]
|
||||
if pre_release:
|
||||
command.append("--pre-release")
|
||||
subprocess.run(command, check=True)
|
||||
cls.ret = "Successfully installed extension"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error installing extension: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def uninstall_extension(cls, extension_id):
|
||||
"""
|
||||
Uninstalls an extension from VSCode.
|
||||
从VSCode中卸载扩展。
|
||||
|
||||
Args:
|
||||
extension_id (str): 扩展的标识符。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "--uninstall-extension", extension_id], check=True)
|
||||
cls.ret = "Successfully uninstalled extension"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error uninstalling extension: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def list_extensions(cls, show_versions=False, category=None):
|
||||
"""
|
||||
Lists installed extensions in VSCode.
|
||||
列出VSCode中安装的扩展。
|
||||
|
||||
Args:
|
||||
show_versions (bool): 是否显示扩展的版本。
|
||||
category (str): 按类别筛选扩展。
|
||||
"""
|
||||
try:
|
||||
command = ["code", "--list-extensions"]
|
||||
if show_versions:
|
||||
command.append("--show-versions")
|
||||
if category:
|
||||
command.extend(["--category", category])
|
||||
cls.ret = subprocess.run(command, check=True, capture_output=True, text=True).stdout
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error listing extensions: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def update_extensions(cls):
|
||||
"""
|
||||
Updates all installed extensions in VSCode to the latest version.
|
||||
更新VSCode中所有安装的扩展到最新版本。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "--update-extensions"], check=True)
|
||||
cls.ret = "Successfully updated extensions"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error updating extensions: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def disable_extension(cls, extension_id):
|
||||
"""
|
||||
Disables a specific extension for the next instance of VSCode.
|
||||
禁用在下一个VSCode窗口中的指定扩展。
|
||||
|
||||
Args:
|
||||
extension_id (str): 扩展的标识符。
|
||||
"""
|
||||
try:
|
||||
subprocess.run(["code", "--disable-extension", extension_id], check=True)
|
||||
cls.ret = "Successfully disabled extension"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error disabling extension: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def toggle_sync(cls, state):
|
||||
"""
|
||||
Toggles synchronization on or off in VSCode.
|
||||
在VSCode中开启或关闭同步。
|
||||
|
||||
Args:
|
||||
state (str): 'on' 或 'off' 表示开启或关闭。
|
||||
"""
|
||||
try:
|
||||
command = ["code", "--sync", state]
|
||||
subprocess.run(command, check=True)
|
||||
cls.ret = "Successfully toggled sync"
|
||||
except subprocess.CalledProcessError as e:
|
||||
cls.ret = f"Error toggling sync: {e}"
|
||||
except Exception as e:
|
||||
cls.ret = f"Unexpected error: {e}"
|
||||
|
||||
return cls.ret
|
||||
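`CodeTools` simply shells out to the `code` CLI, so here is a quick usage sketch, assuming the VS Code command-line launcher is on `PATH` inside the VM and that the import path mirrors the file location; the paths and extension id are illustrative.

```python
from mm_agents.autoglm_v.tools.package.code import CodeTools

CodeTools.launch_vscode("/home/user/project")             # reuse the current window for this folder
CodeTools.goto_file("/home/user/project/main.py", 42, 5)  # jump to line 42, column 5
CodeTools.install_extension("ms-python.python")           # install the Python extension
CodeTools.print_result()                                  # print the message from the last call
```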
107
mm_agents/autoglm_v/tools/package/google_chrome.py
Normal file
@@ -0,0 +1,107 @@
|
||||
class BrowserTools:
|
||||
ret = ""
|
||||
|
||||
@classmethod
|
||||
def print_result(cls):
|
||||
print(cls.ret)
|
||||
|
||||
@classmethod
|
||||
def env_info(cls):
|
||||
cls.ret = "None"
|
||||
|
||||
# @classmethod
|
||||
# def show_all_tabs(cls):
|
||||
# cls.ret = "Browser not found"
|
||||
# for attempt in range(3):
|
||||
# with sync_playwright() as p:
|
||||
# try:
|
||||
# browser = p.chromium.connect_over_cdp(cls.remote_debugging_url)
|
||||
# if not browser:
|
||||
# continue
|
||||
# context = browser.contexts[0]
|
||||
# # Collect the title and URL of every open tab
|
||||
# cls.ret = 'Browser Tabs: '
|
||||
# for idx, page in enumerate(context.pages):
|
||||
# cls.ret += f"{idx}. {page.title()} ({page.url})" + '\n'
|
||||
# return cls.ret
|
||||
# except TimeoutError:
|
||||
# cls.ret = 'Failed to get browser tabs'
|
||||
# return None
|
||||
# return None
|
||||
|
||||
@classmethod
|
||||
def open_profile_settings(cls):
|
||||
"""
|
||||
Open the profile settings page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://settings/people"]}}
|
||||
|
||||
@classmethod
|
||||
def open_password_settings(cls):
|
||||
"""
|
||||
Open the password settings page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://settings/autofill"]}}
|
||||
|
||||
@classmethod
|
||||
def open_privacy_settings(cls):
|
||||
"""
|
||||
Open the privacy settings page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://settings/privacy"]}}
|
||||
|
||||
@classmethod
|
||||
def open_appearance_settings(cls):
|
||||
"""
|
||||
Open the appearance settings page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://settings/appearance"]}}
|
||||
|
||||
@classmethod
|
||||
def open_search_engine_settings(cls):
|
||||
"""
|
||||
Open the search engine settings page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://settings/search"]}}
|
||||
|
||||
@classmethod
|
||||
def bring_back_last_tab(cls):
|
||||
"""
|
||||
Bring back the last tab in the browser.
|
||||
"""
|
||||
return f"import pyautogui; pyautogui.hotkey('ctrl', 'shift', 't'); print('Brought back last tab')"
|
||||
|
||||
@classmethod
|
||||
def print(cls):
|
||||
"""
|
||||
Open the print option in current page.
|
||||
"""
|
||||
return f"import pyautogui; pyautogui.hotkey('ctrl', 'p'); print('Opened print option')"
|
||||
|
||||
@classmethod
|
||||
def delete_browsing_data(cls):
|
||||
"""
|
||||
Delete browsing data in the browser.
|
||||
"""
|
||||
return f"import pyautogui; pyautogui.hotkey('ctrl', 'shift', 'del'); print('Deleted browsing data')"
|
||||
|
||||
@classmethod
|
||||
def open_extensions(cls):
|
||||
"""
|
||||
Open the extensions page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://extensions"]}}
|
||||
|
||||
@classmethod
|
||||
def bookmark_page(cls):
|
||||
"""
|
||||
Bookmark the current page in the browser.
|
||||
"""
|
||||
return f"import pyautogui; pyautogui.hotkey('ctrl', 'd'); print('Bookmarked page')"
|
||||
|
||||
@classmethod
|
||||
def open_bookmarks(cls):
|
||||
"""
|
||||
Open the bookmarks page in the browser.
|
||||
"""
|
||||
return {"action_type": "OPEN_CHROME_TAB", "parameters": {"urls_to_open": ["chrome://bookmarks"]}}
|
||||
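`BrowserTools` methods return two kinds of payloads: a dict describing an `OPEN_CHROME_TAB` action, or a string of pyautogui code to run inside the VM. Below is a sketch of how a caller might branch on the return type; the handling code is an assumption rather than part of this diff, and the pyautogui branch needs a running display.

```python
from mm_agents.autoglm_v.tools.package.google_chrome import BrowserTools

result = BrowserTools.open_extensions()          # -> {"action_type": "OPEN_CHROME_TAB", ...}
if isinstance(result, dict):
    urls = result["parameters"]["urls_to_open"]  # hand the URLs to the environment controller
else:
    exec(result)                                 # pyautogui one-liners, e.g. from bookmark_page()
```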
1322
mm_agents/autoglm_v/tools/package/libreoffice_calc.py
Normal file
File diff suppressed because it is too large
1424
mm_agents/autoglm_v/tools/package/libreoffice_impress.py
Normal file
File diff suppressed because it is too large
753
mm_agents/autoglm_v/tools/package/libreoffice_writer.py
Normal file
@@ -0,0 +1,753 @@
|
||||
import os
|
||||
import re
|
||||
|
||||
import uno
|
||||
from com.sun.star.awt.FontSlant import ITALIC, NONE, OBLIQUE
|
||||
from com.sun.star.awt.FontWeight import BOLD, NORMAL
|
||||
from com.sun.star.beans import PropertyValue
|
||||
from com.sun.star.style.ParagraphAdjust import CENTER, LEFT, RIGHT
|
||||
from com.sun.star.text.ControlCharacter import PARAGRAPH_BREAK
|
||||
from com.sun.star.text.TextContentAnchorType import AS_CHARACTER
|
||||
|
||||
|
||||
class WriterTools:
|
||||
localContext = uno.getComponentContext()
|
||||
resolver = localContext.ServiceManager.createInstanceWithContext("com.sun.star.bridge.UnoUrlResolver", localContext)
|
||||
ctx = resolver.resolve("uno:socket,host=localhost,port=2002;urp;StarOffice.ComponentContext")
|
||||
desktop = ctx.ServiceManager.createInstanceWithContext("com.sun.star.frame.Desktop", ctx)
|
||||
doc = desktop.getCurrentComponent()
|
||||
text = doc.Text
|
||||
cursor = text.createTextCursor()
|
||||
ret = ""
|
||||
|
||||
@classmethod
|
||||
def close_other_window(cls):
|
||||
"""关闭除当前文档外的所有文档"""
|
||||
components = cls.desktop.getComponents().createEnumeration()
|
||||
current_url = cls.doc.getURL()
|
||||
while components.hasMoreElements():
|
||||
doc = components.nextElement()
|
||||
if doc.getURL() != current_url:
|
||||
doc.close(True)
|
||||
|
||||
@classmethod
|
||||
def save(cls):
|
||||
"""保存文档到当前位置"""
|
||||
try:
|
||||
if cls.doc.hasLocation():
|
||||
cls.doc.store()
|
||||
else:
|
||||
raise Exception("文档没有保存位置,请使用另存为功能")
|
||||
return True
|
||||
except Exception as e:
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def maximize_window(cls):
|
||||
"""
|
||||
将窗口设置为工作区最大尺寸
|
||||
使用工作区域大小(考虑任务栏等)
|
||||
"""
|
||||
window = cls.doc.getCurrentController().getFrame().getContainerWindow()
|
||||
toolkit = window.getToolkit()
|
||||
device = toolkit.createScreenCompatibleDevice(0, 0)
|
||||
workarea = toolkit.getWorkArea()
|
||||
window.setPosSize(workarea.X, workarea.Y, workarea.Width, workarea.Height, 15)
|
||||
|
||||
@classmethod
|
||||
def print_result(cls):
|
||||
print(cls.ret)
|
||||
|
||||
@classmethod
|
||||
def write_text(cls, text, bold=False, italic=False, size=None):
|
||||
"""写入文本"""
|
||||
cls.cursor.CharWeight = 150 if bold else 100
|
||||
cls.cursor.CharPosture = ITALIC if italic else NONE
|
||||
if size:
|
||||
cls.cursor.CharHeight = size
|
||||
cls.text.insertString(cls.cursor, text, False)
|
||||
cls.ret = "Success"
|
||||
|
||||
@classmethod
|
||||
def get_paragraphs(cls, start_index=0, count=None):
|
||||
"""Retrieves paragraphs from the document as a list."""
|
||||
text = cls.doc.getText()
|
||||
paragraphs = text.createEnumeration()
|
||||
paragraph_list = []
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraph_list.append(paragraph.getString())
|
||||
if start_index < 0:
|
||||
start_index = 0
|
||||
elif start_index >= len(paragraph_list):
|
||||
cls.ret = []
|
||||
if count is not None:
|
||||
end_index = min(start_index + count, len(paragraph_list))
|
||||
cls.ret = paragraph_list[start_index:end_index]
|
||||
else:
|
||||
cls.ret = paragraph_list[start_index:]
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def env_info(cls):
|
||||
paras = cls.get_paragraphs()
|
||||
para_str = ""
|
||||
for i, para in enumerate(paras):
|
||||
para = para[:500] + "..." if len(para) > 500 else para
|
||||
para_str += "Paragraph " + str(i) + ": " + para.strip() + "\n"
|
||||
cls.ret = para_str
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_color(cls, pattern, color, paragraph_indices=None):
|
||||
"""
|
||||
Changes the color of matched text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
pattern (str): Regular expression pattern to match text
|
||||
color (int): Hex color code (e.g., 0x000000 for black)
|
||||
paragraph_indices (list, optional): List of paragraph indices to modify (0-based).
|
||||
If None, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
enum = cls.doc.Text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
if not paragraph_indices:
|
||||
paragraphs_to_process = range(len(paragraphs))
|
||||
else:
|
||||
paragraphs_to_process = paragraph_indices
|
||||
regex = re.compile(pattern)
|
||||
for idx in paragraphs_to_process:
|
||||
if idx < 0 or idx >= len(paragraphs):
|
||||
continue
|
||||
paragraph = paragraphs[idx]
|
||||
if not paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
continue
|
||||
para_text = paragraph.getString()
|
||||
matches = regex.finditer(para_text)
|
||||
for match in matches:
|
||||
para_cursor = cls.text.createTextCursorByRange(paragraph.getStart())
|
||||
para_cursor.goRight(match.start(), False)
|
||||
para_cursor.goRight(match.end() - match.start(), True)
|
||||
para_cursor.CharColor = color
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def find_and_replace(cls, pattern, replacement, paragraph_indices=None):
|
||||
"""
|
||||
Finds all occurrences of a specified text pattern and replaces them with another text in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The pattern to match in the document, should be a regular expression
|
||||
replacement (str): The text to replace the found text with
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing)
|
||||
|
||||
Returns:
|
||||
str: Success message with number of replacements made
|
||||
"""
|
||||
try:
|
||||
enum = cls.doc.Text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
total_replacements = 0
|
||||
if not paragraph_indices:
|
||||
paragraphs_to_process = list(range(len(paragraphs)))
|
||||
else:
|
||||
paragraphs_to_process = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
|
||||
regex = re.compile(pattern)
|
||||
for idx in paragraphs_to_process:
|
||||
if idx >= len(paragraphs):
|
||||
continue
|
||||
paragraph = paragraphs[idx]
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
text_content = paragraph.getString()
|
||||
new_text, count = regex.subn(replacement, text_content)
|
||||
if count > 0:
|
||||
paragraph.setString(new_text)
|
||||
total_replacements += count
|
||||
cls.ret = f"Successfully made {total_replacements} replacements"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error during find and replace: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_font(cls, font_name, paragraph_indices=None):
|
||||
"""
|
||||
Changes the font of text in the document or specified paragraphs.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to apply (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraphs.append(enum.nextElement())
|
||||
if not paragraph_indices:
|
||||
paragraph_indices = range(len(paragraphs))
|
||||
for idx in paragraph_indices:
|
||||
if 0 <= idx < len(paragraphs):
|
||||
paragraph = paragraphs[idx]
|
||||
cursor = text.createTextCursorByRange(paragraph)
|
||||
cursor.CharFontName = font_name
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def set_line_spacing(cls, spacing_value, paragraph_indices=None):
|
||||
"""
|
||||
Sets the line spacing for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
spacing_value (float): The line spacing value to apply (1.0 for single spacing, 2.0 for double spacing, etc.)
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
paragraph_enum = text.createEnumeration()
|
||||
line_spacing_value = int(spacing_value * 100)
|
||||
current_index = 0
|
||||
|
||||
while paragraph_enum.hasMoreElements():
|
||||
paragraph = paragraph_enum.nextElement()
|
||||
|
||||
if not paragraph_indices or current_index in paragraph_indices:
|
||||
line_spacing = uno.createUnoStruct("com.sun.star.style.LineSpacing")
|
||||
line_spacing.Mode = 0
|
||||
line_spacing.Height = line_spacing_value
|
||||
paragraph.ParaLineSpacing = line_spacing
|
||||
|
||||
if paragraph.String.strip():
|
||||
current_index += 1
|
||||
|
||||
cls.ret = "Success"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def remove_highlighting(cls, paragraph_indices=None):
|
||||
"""
|
||||
Removes ALL highlighting from text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
paragraphs = text.createEnumeration()
|
||||
target_indices = set(paragraph_indices) if paragraph_indices else None
|
||||
current_index = 0
|
||||
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if target_indices is None or current_index in target_indices:
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
para_cursor = text.createTextCursorByRange(paragraph)
|
||||
# Remove all highlighting by setting back color to -1
|
||||
para_cursor.CharBackColor = -1
|
||||
|
||||
# Additional cleanup for individual text portions (optional)
|
||||
text_portions = paragraph.createEnumeration()
|
||||
while text_portions.hasMoreElements():
|
||||
text_portion = text_portions.nextElement()
|
||||
if hasattr(text_portion, "CharBackColor"):
|
||||
portion_cursor = text.createTextCursorByRange(text_portion)
|
||||
portion_cursor.CharBackColor = -1
|
||||
current_index += 1
|
||||
|
||||
cls.ret = "Successfully removed all highlighting"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error removing highlighting: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def find_highlighted_text(cls, highlight_color):
|
||||
"""
|
||||
Finds all text in the document that has a specific highlight color applied to it.
|
||||
|
||||
Args:
|
||||
highlight_color (str): The highlight color to search for. Can be a color name (e.g., 'yellow', 'green') or hex code.
|
||||
|
||||
Returns:
|
||||
list: A list of strings containing all text segments with the specified highlight color.
|
||||
"""
|
||||
color_map = {
|
||||
"yellow": 16776960,
|
||||
"green": 65280,
|
||||
"blue": 255,
|
||||
"red": 16711680,
|
||||
"cyan": 65535,
|
||||
"magenta": 16711935,
|
||||
"black": 0,
|
||||
"white": 16777215,
|
||||
"gray": 8421504,
|
||||
"lightgray": 12632256,
|
||||
}
|
||||
target_color = None
|
||||
if highlight_color.lower() in color_map:
|
||||
target_color = color_map[highlight_color.lower()]
|
||||
elif highlight_color.startswith("#") and len(highlight_color) == 7:
|
||||
try:
|
||||
hex_color = highlight_color[1:]
|
||||
r = int(hex_color[0:2], 16)
|
||||
g = int(hex_color[2:4], 16)
|
||||
b = int(hex_color[4:6], 16)
|
||||
target_color = (r << 16) + (g << 8) + b
|
||||
except ValueError:
|
||||
cls.ret = f"Invalid hex color format: {highlight_color}"
|
||||
return []
|
||||
else:
|
||||
cls.ret = f"Unsupported color format: {highlight_color}"
|
||||
return []
|
||||
highlighted_text = []
|
||||
text = cls.doc.getText()
|
||||
enum_paragraphs = text.createEnumeration()
|
||||
while enum_paragraphs.hasMoreElements():
|
||||
paragraph = enum_paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
enum_portions = paragraph.createEnumeration()
|
||||
while enum_portions.hasMoreElements():
|
||||
text_portion = enum_portions.nextElement()
|
||||
if hasattr(text_portion, "CharBackColor") and text_portion.CharBackColor == target_color:
|
||||
if text_portion.getString().strip():
|
||||
highlighted_text.append(text_portion.getString())
|
||||
cls.ret = f"Found {len(highlighted_text)} text segments with highlight color {highlight_color}"
|
||||
return highlighted_text
|
||||
|
||||
@classmethod
|
||||
def insert_formula_at_cursor(cls, formula):
|
||||
"""
|
||||
Inserts a formula at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
formula (str): The formula to insert at the current cursor position.
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise
|
||||
"""
|
||||
try:
|
||||
embedded_obj = cls.doc.createInstance("com.sun.star.text.TextEmbeddedObject")
|
||||
embedded_obj.setPropertyValue("CLSID", "078B7ABA-54FC-457F-8551-6147e776a997")
|
||||
embedded_obj.setPropertyValue("AnchorType", AS_CHARACTER)
|
||||
cls.text.insertTextContent(cls.cursor, embedded_obj, False)
|
||||
math_obj = embedded_obj.getEmbeddedObject()
|
||||
math_obj.Formula = formula
|
||||
cls.ret = "Formula inserted successfully"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error inserting formula: {str(e)}"
|
||||
return False
|
||||
|
||||
@classmethod
|
||||
def insert_image_at_cursor(cls, image_path, width=None, height=None):
|
||||
"""
|
||||
Inserts an image at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
image_path (str): Full path to the image file to insert
|
||||
width (int, optional): Width to display the image in pixels
|
||||
height (int, optional): Height to display the image in pixels
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
if image_path.startswith("~"):
|
||||
image_path = os.path.expanduser(image_path)
|
||||
if not os.path.exists(image_path):
|
||||
cls.ret = f"Error: Image file not found at {image_path}"
|
||||
return cls.ret
|
||||
image_path = os.path.abspath(image_path)
|
||||
if os.name == "nt":
|
||||
file_url = "file:///" + image_path.replace("\\", "/")
|
||||
else:
|
||||
file_url = "file://" + image_path
|
||||
graphic = cls.doc.createInstance("com.sun.star.text.GraphicObject")
|
||||
graphic.GraphicURL = file_url
|
||||
graphic.AnchorType = AS_CHARACTER
|
||||
if width is not None:
|
||||
graphic.Width = width * 100
|
||||
if height is not None:
|
||||
graphic.Height = height * 100
|
||||
cls.text.insertTextContent(cls.cursor, graphic, False)
|
||||
cls.ret = "Success: Image inserted"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_strikethrough(cls, pattern, paragraph_indices=None):
|
||||
"""
|
||||
Sets the strikethrough formatting for text matching the specified pattern in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The regular expression pattern to match in the document
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error information
|
||||
"""
|
||||
try:
|
||||
paragraphs = cls.doc.getText().createEnumeration()
|
||||
para_index = 0
|
||||
found_matches = 0
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
if paragraph_indices and para_index not in paragraph_indices:
|
||||
para_index += 1
|
||||
continue
|
||||
para_text = paragraph.getString()
|
||||
matches = list(re.finditer(pattern, para_text))
|
||||
for match in matches:
|
||||
text_range = paragraph.getStart()
|
||||
cursor = cls.doc.getText().createTextCursorByRange(text_range)
|
||||
cursor.goRight(match.start(), False)
|
||||
cursor.goRight(match.end() - match.start(), True)
|
||||
cursor.CharStrikeout = 1
|
||||
found_matches += 1
|
||||
para_index += 1
|
||||
cls.ret = f"Successfully applied strikethrough to {found_matches} matches of pattern: {pattern}"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error applying strikethrough: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_font_size(cls, font_size, pattern, paragraph_indices=None):
|
||||
"""
|
||||
Changes the font size of specified text in the document.
|
||||
|
||||
Args:
|
||||
font_size (float): The font size to apply (in points).
|
||||
pattern (str): The pattern to match in the document, should be a regular expression.
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Result message indicating success or failure.
|
||||
"""
|
||||
try:
|
||||
regex = re.compile(pattern)
|
||||
paragraphs = cls.doc.getText().createEnumeration()
|
||||
current_index = 0
|
||||
while paragraphs.hasMoreElements():
|
||||
paragraph = paragraphs.nextElement()
|
||||
if paragraph_indices and current_index not in paragraph_indices:
|
||||
current_index += 1
|
||||
continue
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
para_cursor = cls.text.createTextCursorByRange(paragraph)
|
||||
para_text = paragraph.getString()
|
||||
matches = list(regex.finditer(para_text))
|
||||
for match in reversed(matches):
|
||||
start_pos = match.start()
|
||||
end_pos = match.end()
|
||||
para_cursor.gotoStart(False)
|
||||
para_cursor.goRight(start_pos, False)
|
||||
para_cursor.goRight(end_pos - start_pos, True)
|
||||
para_cursor.CharHeight = font_size
|
||||
current_index += 1
|
||||
cls.ret = f"Successfully changed font size to {font_size} for text matching '{pattern}'"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error changing font size: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def export_to_pdf(cls, output_path=None, output_filename=None, include_comments=False, quality="standard"):
|
||||
"""
|
||||
Exports the current document to PDF format.
|
||||
|
||||
Args:
|
||||
output_path (str, optional): The full path where the PDF should be saved.
|
||||
If not provided, uses the same location as the original document.
|
||||
output_filename (str, optional): The filename to use for the PDF.
|
||||
If not provided, uses the original document's filename with .pdf extension.
|
||||
include_comments (bool, optional): Whether to include comments in the exported PDF.
|
||||
Defaults to False.
|
||||
quality (str, optional): The quality of the PDF export ('standard', 'high', 'print').
|
||||
Defaults to 'standard'.
|
||||
|
||||
Returns:
|
||||
str: Path to the exported PDF file or error message
|
||||
"""
|
||||
try:
|
||||
doc_url = cls.doc.getURL()
|
||||
if not doc_url and not output_path:
|
||||
return "Error: Document has not been saved and no output path provided"
|
||||
if doc_url:
|
||||
doc_path = uno.fileUrlToSystemPath(os.path.dirname(doc_url))
|
||||
doc_filename = os.path.basename(doc_url)
|
||||
doc_name = os.path.splitext(doc_filename)[0]
|
||||
else:
|
||||
doc_path = ""
|
||||
doc_name = "export"
|
||||
final_path = output_path if output_path else doc_path
|
||||
final_filename = output_filename if output_filename else f"{doc_name}.pdf"
|
||||
if not final_filename.lower().endswith(".pdf"):
|
||||
final_filename += ".pdf"
|
||||
full_output_path = os.path.join(final_path, final_filename)
|
||||
output_url = uno.systemPathToFileUrl(full_output_path)
|
||||
export_props = []
|
||||
if quality == "high":
|
||||
export_props.append(PropertyValue(Name="SelectPdfVersion", Value=1))
|
||||
elif quality == "print":
|
||||
export_props.append(PropertyValue(Name="SelectPdfVersion", Value=2))
|
||||
else:
|
||||
export_props.append(PropertyValue(Name="SelectPdfVersion", Value=0))
|
||||
export_props.append(PropertyValue(Name="ExportNotes", Value=include_comments))
|
||||
export_props.extend(
|
||||
[
|
||||
PropertyValue(Name="FilterName", Value="writer_pdf_Export"),
|
||||
PropertyValue(Name="Overwrite", Value=True),
|
||||
]
|
||||
)
|
||||
cls.doc.storeToURL(output_url, tuple(export_props))
|
||||
cls.ret = f"PDF exported to: {full_output_path}"
|
||||
return full_output_path
|
||||
except Exception as e:
|
||||
cls.ret = f"Error exporting to PDF: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_paragraph_alignment(cls, alignment, paragraph_indices=None):
|
||||
"""
|
||||
Sets the text alignment for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
alignment (str): The alignment to apply ('left', 'center', 'right', 'justify').
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
alignment_map = {"left": LEFT, "center": CENTER, "right": RIGHT, "justify": 3}
|
||||
if alignment.lower() not in alignment_map:
|
||||
cls.ret = f"Error: Invalid alignment '{alignment}'. Use 'left', 'center', 'right', or 'justify'."
|
||||
return cls.ret
|
||||
alignment_value = alignment_map[alignment.lower()]
|
||||
text = cls.doc.getText()
|
||||
paragraph_enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while paragraph_enum.hasMoreElements():
|
||||
paragraph = paragraph_enum.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraphs.append(paragraph)
|
||||
if paragraph_indices:
|
||||
valid_indices = [i for i in paragraph_indices if 0 <= i < len(paragraphs)]
|
||||
if len(valid_indices) != len(paragraph_indices):
|
||||
cls.ret = f"Warning: Some paragraph indices were out of range (0-{len(paragraphs) - 1})"
|
||||
for idx in valid_indices:
|
||||
paragraphs[idx].ParaAdjust = alignment_value
|
||||
else:
|
||||
for paragraph in paragraphs:
|
||||
paragraph.ParaAdjust = alignment_value
|
||||
cls.ret = f"Successfully applied '{alignment}' alignment to paragraphs"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error setting paragraph alignment: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def capitalize_words(cls, paragraph_indices=None):
|
||||
"""
|
||||
Capitalizes the first letter of each word for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
text = cls.doc.getText()
|
||||
enum = text.createEnumeration()
|
||||
paragraphs = []
|
||||
while enum.hasMoreElements():
|
||||
paragraph = enum.nextElement()
|
||||
if paragraph.supportsService("com.sun.star.text.Paragraph"):
|
||||
paragraphs.append(paragraph)
|
||||
if not paragraph_indices:
|
||||
target_paragraphs = list(range(len(paragraphs)))
|
||||
else:
|
||||
target_paragraphs = paragraph_indices
|
||||
valid_indices = [idx for idx in target_paragraphs if 0 <= idx < len(paragraphs)]
|
||||
for idx in valid_indices:
|
||||
paragraph = paragraphs[idx]
|
||||
text_content = paragraph.getString()
|
||||
if not text_content.strip():
|
||||
continue
|
||||
capitalized_text = " ".join(word.capitalize() if word else "" for word in text_content.split(" "))
|
||||
para_cursor = text.createTextCursorByRange(paragraph.getStart())
|
||||
para_cursor.gotoRange(paragraph.getEnd(), True)
|
||||
para_cursor.setString(capitalized_text)
|
||||
cls.ret = f"Successfully capitalized words in {len(valid_indices)} paragraphs"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error capitalizing words: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_default_font(cls, font_name, font_size=None):
|
||||
"""
|
||||
Sets the default font for new text in the document without changing existing text.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to set as default (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
font_size (float, optional): The default font size in points.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
style_families = cls.doc.getStyleFamilies()
|
||||
paragraph_styles = style_families.getByName("ParagraphStyles")
|
||||
default_style_names = ["Default", "Standard", "Normal"]
|
||||
standard_style = None
|
||||
for style_name in default_style_names:
|
||||
if paragraph_styles.hasByName(style_name):
|
||||
standard_style = paragraph_styles.getByName(style_name)
|
||||
break
|
||||
if standard_style is None:
|
||||
style_names = paragraph_styles.getElementNames()
|
||||
if style_names:
|
||||
standard_style = paragraph_styles.getByName(style_names[0])
|
||||
else:
|
||||
raise Exception("Could not find default paragraph style")
|
||||
standard_style.setPropertyValue("CharFontName", font_name)
|
||||
standard_style.setPropertyValue("CharFontNameAsian", font_name)
|
||||
standard_style.setPropertyValue("CharFontNameComplex", font_name)
|
||||
if font_size is not None:
|
||||
standard_style.setPropertyValue("CharHeight", float(font_size))
|
||||
standard_style.setPropertyValue("CharHeightAsian", float(font_size))
|
||||
standard_style.setPropertyValue("CharHeightComplex", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharFontName", font_name)
|
||||
cls.cursor.setPropertyValue("CharFontNameAsian", font_name)
|
||||
cls.cursor.setPropertyValue("CharFontNameComplex", font_name)
|
||||
if font_size is not None:
|
||||
cls.cursor.setPropertyValue("CharHeight", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharHeightAsian", float(font_size))
|
||||
cls.cursor.setPropertyValue("CharHeightComplex", float(font_size))
|
||||
cls.ret = f"Default font set to '{font_name}'" + (f" with size {font_size}pt" if font_size else "")
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error setting default font: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def add_page_numbers(cls, position, start_number=1, format=None):
|
||||
"""
|
||||
Adds page numbers to the document at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Position of the page numbers ('bottom_left', 'bottom_center', 'bottom_right',
|
||||
'top_left', 'top_center', 'top_right')
|
||||
start_number (int, optional): The starting page number. Defaults to 1.
|
||||
format (str, optional): Format of the page numbers (e.g., '1', 'Page 1', '1 of N').
|
||||
Defaults to simple number format.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message
|
||||
"""
|
||||
try:
|
||||
page_styles = cls.doc.StyleFamilies.getByName("PageStyles")
|
||||
default_style = page_styles.getByName("Standard")
|
||||
try:
|
||||
default_style.setPropertyValue("PageNumberOffset", start_number)
|
||||
except:
|
||||
pass
|
||||
if position.startswith("top"):
|
||||
default_style.HeaderIsOn = True
|
||||
target = default_style.HeaderText
|
||||
else:
|
||||
default_style.FooterIsOn = True
|
||||
target = default_style.FooterText
|
||||
cursor = target.createTextCursor()
|
||||
cursor.gotoStart(False)
|
||||
cursor.gotoEnd(True)
|
||||
cursor.setString("")
|
||||
cursor.gotoStart(False)
|
||||
if position.endswith("_left"):
|
||||
cursor.ParaAdjust = LEFT
|
||||
elif position.endswith("_center"):
|
||||
cursor.ParaAdjust = CENTER
|
||||
elif position.endswith("_right"):
|
||||
cursor.ParaAdjust = RIGHT
|
||||
if not format or format == "1":
|
||||
page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
|
||||
page_number.NumberingType = 4
|
||||
target.insertTextContent(cursor, page_number, False)
|
||||
elif format == "Page 1" or "Page" in format and "of" not in format:
|
||||
target.insertString(cursor, "Page ", False)
|
||||
page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
|
||||
page_number.NumberingType = 4
|
||||
target.insertTextContent(cursor, page_number, False)
|
||||
elif format == "1 of N" or format == "Page {page} of {total}" or "of" in format:
|
||||
if "Page" in format:
|
||||
target.insertString(cursor, "Page ", False)
|
||||
page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
|
||||
page_number.NumberingType = 4
|
||||
target.insertTextContent(cursor, page_number, False)
|
||||
target.insertString(cursor, " of ", False)
|
||||
page_count = cls.doc.createInstance("com.sun.star.text.TextField.PageCount")
|
||||
page_count.NumberingType = 4
|
||||
target.insertTextContent(cursor, page_count, False)
|
||||
else:
|
||||
page_number = cls.doc.createInstance("com.sun.star.text.TextField.PageNumber")
|
||||
page_number.NumberingType = 4
|
||||
target.insertTextContent(cursor, page_number, False)
|
||||
cls.ret = "Successfully added page numbers"
|
||||
return cls.ret
|
||||
except Exception as e:
|
||||
cls.ret = f"Error adding page numbers: {str(e)}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def insert_page_break(cls, position="at_cursor"):
|
||||
"""
|
||||
Inserts a page break at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Where to insert the page break: 'at_cursor' for current cursor position,
|
||||
'end_of_document' for end of document. Defaults to 'at_cursor'.
|
||||
"""
|
||||
try:
|
||||
if position == "end_of_document":
|
||||
cls.cursor.gotoEnd(False)
|
||||
cls.text.insertControlCharacter(cls.cursor, PARAGRAPH_BREAK, False)
|
||||
cls.cursor.gotoStartOfParagraph(True)
|
||||
cls.cursor.BreakType = uno.Enum("com.sun.star.style.BreakType", "PAGE_BEFORE")
|
||||
cls.ret = "Page break inserted successfully"
|
||||
return True
|
||||
except Exception as e:
|
||||
cls.ret = f"Error inserting page break: {str(e)}"
|
||||
return False
|
||||
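`WriterTools` resolves `uno:socket,host=localhost,port=2002` at import time, so LibreOffice must already be listening on that socket before the module is imported, and the interpreter needs the `uno` bridge (on Ubuntu typically the system Python with the python3-uno package). A minimal sketch, with an illustrative document path:

```python
import subprocess
import time

# Start Writer with a UNO listener on the port WriterTools expects.
subprocess.Popen([
    "soffice", "--writer", "/home/user/draft.odt",
    "--accept=socket,host=localhost,port=2002;urp;StarOffice.ServiceManager",
])
time.sleep(5)  # give the office process time to come up

from mm_agents.autoglm_v.tools.package.libreoffice_writer import WriterTools

WriterTools.find_and_replace(r"colour", "color")    # regex replace across all paragraphs
WriterTools.set_paragraph_alignment("center", [0])  # center the first paragraph
WriterTools.print_result()
```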
233
mm_agents/autoglm_v/tools/package/vlc.py
Normal file
@@ -0,0 +1,233 @@
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import xml.etree.ElementTree as ET
|
||||
from pathlib import Path
|
||||
from urllib.parse import quote
|
||||
|
||||
import requests
|
||||
from requests.auth import HTTPBasicAuth
|
||||
|
||||
|
||||
class VLCTools:
|
||||
host = "localhost"
|
||||
port = 8080
|
||||
base_url = f"http://{host}:{port}/requests"
|
||||
password = "password"
|
||||
auth = HTTPBasicAuth("", password)
|
||||
ret = ""
|
||||
|
||||
@classmethod
|
||||
def print_result(cls):
|
||||
print(cls.ret)
|
||||
|
||||
@classmethod
|
||||
def _make_request(cls, endpoint, params=None):
|
||||
url = f"{cls.base_url}/{endpoint}"
|
||||
try:
|
||||
response = requests.get(url, params=params, auth=cls.auth)
|
||||
response.raise_for_status()
|
||||
return response
|
||||
except requests.exceptions.RequestException as e:
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def _get_status(cls):
|
||||
response = cls._make_request("status.xml")
|
||||
if response:
|
||||
return ET.fromstring(response.content)
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def env_info(cls):
|
||||
cls.ret = "None"
|
||||
|
||||
@classmethod
|
||||
def get_playlist(cls):
|
||||
response = cls._make_request("playlist.xml")
|
||||
if response:
|
||||
info = ET.fromstring(response.content)
|
||||
playlist_node = info.find('.//node[@name="Playlist"]')
|
||||
if playlist_node is not None:
|
||||
playlist_items = []
|
||||
for leaf in playlist_node.findall("leaf"):
|
||||
item = {"name": leaf.get("name"), "uri": leaf.get("uri"), "duration": leaf.get("duration") + "s"}
|
||||
playlist_items.append(item)
|
||||
cls.ret = f"Playlist: {playlist_items}"
|
||||
return cls.ret
|
||||
cls.ret = "Error getting playlist"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def play(cls):
|
||||
response = cls._make_request("status.xml", {"command": "pl_play"})
|
||||
if response:
|
||||
cls.ret = "Start playing the media"
|
||||
return cls.ret
|
||||
cls.ret = "Error playing the media"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def pause(cls):
|
||||
response = cls._make_request("status.xml", {"command": "pl_pause"})
|
||||
if response:
|
||||
cls.ret = "Pause the media"
|
||||
return cls.ret
|
||||
cls.ret = "Error pausing the media"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def next(cls):
|
||||
response = cls._make_request("status.xml", {"command": "pl_next"})
|
||||
if response:
|
||||
cls.ret = "Switch to next media"
|
||||
return cls.ret
|
||||
cls.ret = "Error switching to next media"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def previous(cls):
|
||||
response = cls._make_request("status.xml", {"command": "pl_previous"})
|
||||
if response:
|
||||
cls.ret = "Switch to previous media"
|
||||
return cls.ret
|
||||
cls.ret = "Error switching to previous media"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def add_to_playlist(cls, uri):
|
||||
if uri.startswith("http"):
|
||||
encoded_uri = uri
|
||||
else:
|
||||
encoded_uri = "file://" + quote(uri.replace("file://", ""))
|
||||
|
||||
response = cls._make_request("status.xml", {"command": "in_play", "input": encoded_uri})
|
||||
if response:
|
||||
cls.ret = f"Add {uri} to playlist"
|
||||
return cls.ret
|
||||
cls.ret = f"Error adding {uri} to playlist"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_current_time(cls):
|
||||
status = cls._get_status()
|
||||
if status is not None:
|
||||
time = status.find("time")
|
||||
cls.ret = int(time.text) if time is not None else None
|
||||
return cls.ret
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_media_duration(cls):
|
||||
status = cls._get_status()
|
||||
if status is not None:
|
||||
length = status.find("length")
|
||||
if length is not None:
|
||||
cls.ret = f"Media duration: {length.text} seconds"
|
||||
return cls.ret
|
||||
cls.ret = "Error getting media duration"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_settings(cls):
|
||||
settings = {}
|
||||
with open(Path.home() / ".config/vlc/vlcrc", "r") as f:
|
||||
for line in f:
|
||||
if line:
|
||||
try:
|
||||
key, value = line.split("=", 1)  # split only on the first '=' so values may contain '='
|
||||
if key.strip().startswith("#"):
|
||||
continue
|
||||
settings[key.strip()] = value.strip()
|
||||
except:
|
||||
continue
|
||||
cls.ret = json.dumps(settings, indent=4, ensure_ascii=False)
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def set_settings(cls, field, value):
|
||||
with open(Path.home() / ".config/vlc/vlcrc", "r") as rf:
|
||||
settings = rf.read()
|
||||
|
||||
# Regex that matches an existing (possibly commented-out) entry for this field
|
||||
pattern = re.compile(r"#? *" + re.escape(field) + r"=.*")
|
||||
# Check whether an entry for this field already exists
|
||||
if pattern.search(settings):
|
||||
settings = pattern.sub(f"{field}={value}", settings)
|
||||
else:
|
||||
settings += f"{field}={value}\n"
|
||||
|
||||
with open(Path.home() / ".config/vlc/vlcrc", "w") as wf:
|
||||
wf.write(settings)
|
||||
|
||||
cls.ret = f"Set {field} to {value}"
|
||||
return cls.ret
|
||||
|
||||
@classmethod
|
||||
def toggle_fullscreen(cls, enable=None):
|
||||
"""
|
||||
Toggle fullscreen mode or set it explicitly based on the enable parameter.
|
||||
|
||||
Args:
|
||||
enable (bool, optional): If provided, explicitly set fullscreen mode (True for fullscreen, False for windowed)
|
||||
|
||||
Returns:
|
||||
str: Success or error message
|
||||
"""
|
||||
if enable is not None:
|
||||
command = "fullscreen" if enable else "fullscreen off"
|
||||
else:
|
||||
command = "fullscreen"
|
||||
response = cls._make_request("status.xml", {"command": command})
|
||||
if response:
|
||||
action = "enabled" if enable is True else "disabled" if enable is False else "toggled"
|
||||
cls.ret = f"Fullscreen mode {action}"
|
||||
return cls.ret
|
||||
cls.ret = "Error changing fullscreen mode"
|
||||
return None
|
||||
|
||||
@classmethod
|
||||
def get_media_files(cls, path, suffix=None):
|
||||
"""
|
||||
Gets the media files for the specified path.
|
||||
|
||||
Args:
|
||||
path (str): The path to the media files
|
||||
suffix (List[str], optional): The suffix of the media files.
|
||||
Defaults to ['mp4', 'avi', 'mkv', 'mov', 'mp3', 'm4a', 'wav']
|
||||
"""
|
||||
# Set default suffix if not provided
|
||||
if suffix is None:
|
||||
suffix = ["mp4", "avi", "mkv", "mov", "mp3", "m4a", "wav"]
|
||||
|
||||
# Validate path
|
||||
if not path:
|
||||
cls.ret = "Path cannot be empty"
|
||||
return None
|
||||
|
||||
if not os.path.exists(path):
|
||||
cls.ret = f"Path not found: {path}"
|
||||
return None
|
||||
|
||||
# Initialize result list
|
||||
media_files = []
|
||||
|
||||
# Convert suffix list to lowercase for case-insensitive comparison
|
||||
suffix = [s.lower() for s in suffix]
|
||||
|
||||
# Walk through directory
|
||||
try:
|
||||
for root, _, files in os.walk(path):
|
||||
for file in files:
|
||||
# Check if file extension matches any of the specified suffixes
|
||||
if any(file.lower().endswith(f".{s}") for s in suffix):
|
||||
# Add full path of the file to results
|
||||
full_path = os.path.join(root, file)
|
||||
media_files.append(full_path)
|
||||
|
||||
except Exception as e:
|
||||
cls.ret = f"Error while scanning directory: {str(e)}"
|
||||
return None
|
||||
|
||||
cls.ret = media_files
|
||||
return cls.ret
|
||||
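`VLCTools` drives VLC through its HTTP interface on `localhost:8080` with the password `password`, so that interface has to be enabled before any of the calls above will succeed. A sketch, assuming the stock `vlc` binary is on `PATH` (flag names per the standard VLC CLI; the media path is illustrative):

```python
import subprocess
import time

# Launch VLC with the web interface enabled on its default port 8080.
subprocess.Popen(["vlc", "--extraintf", "http", "--http-password", "password"])
time.sleep(3)  # let the HTTP interface come up

from mm_agents.autoglm_v.tools.package.vlc import VLCTools

VLCTools.add_to_playlist("/home/user/Videos/clip.mp4")  # local paths are quoted into file:// URIs
VLCTools.play()
print(VLCTools.get_current_time())                      # playback position in seconds, or None
```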
70
mm_agents/aworldguiagent/README.md
Normal file
@@ -0,0 +1,70 @@
|
||||
# aworldGUIAgent-v1
|
||||
|
||||
aworldGUIAgent-v1 is built on the [AWorld Framework](https://github.com/inclusionAI/AWorld) and is specifically designed to tackle complex desktop automation tasks within the [OSWorld-verified](https://os-world.github.io/) benchmark.
|
||||
|
||||
The core logic for our agent's perception and reasoning is adapted from the great work of the [Agent-S project](https://github.com/simular-ai/Agent-S). We have built upon their foundation by introducing a suite of new executable tools that enhance the agent's ability to interact with the OS environment.
|
||||
|
||||
## Quick Start
|
||||
|
||||
Follow these steps to set up the environment and reproduce our results.
|
||||
|
||||
1. **Create Environment & Set Up OSWorld**:
|
||||
* First, create a dedicated Conda environment with **Python 3.11**.
|
||||
```bash
|
||||
conda create -n osworld_env python=3.11
|
||||
conda activate osworld_env
|
||||
```
|
||||
* Next, follow the official setup guide in the [OSWorld README](https://github.com/xlang-ai/OSWorld) to install OSWorld and its dependencies.
|
||||
|
||||
2. **Install AWorld Framework**:
|
||||
* Install the specific version of the AWorld Framework into the **same environment**.
|
||||
```bash
|
||||
# Make sure your osworld_env is still activated
|
||||
git clone https://github.com/inclusionAI/AWorld.git
|
||||
cd AWorld
|
||||
git checkout osworld_benchmark
|
||||
python setup.py install
|
||||
```
|
||||
|
||||
3. **Run the Evaluation Script**:
|
||||
* Our results were achieved using `openai/o3` for reasoning and `bytedance/ui-tars-1.5-7b` for visual grounding, both accessed via OpenRouter.
|
||||
* Remember to replace placeholders like `YOUR_OPENROUTER_API_KEY` and `/path/to/your/vm/Ubuntu.vmx` with your actual credentials and paths.
|
||||
|
||||
```bash
|
||||
# Activate your OSWorld conda environment (e.g., osworld_env)
|
||||
conda activate osworld_env
|
||||
|
||||
# Run the evaluation with the recommended settings
|
||||
python run_multienv_aworldguiagent.py \
|
||||
--headless \
|
||||
--ground_url YOUR_BASE_URL \
|
||||
--ground_api_key YOUR_API_KEY \
|
||||
--ground_model bytedance/ui-tars-1.5-7b \
|
||||
--ground_provider open_router \
|
||||
--model_url YOUR_BASE_URL \
|
||||
--model_api_key YOUR_API_KEY \
|
||||
--model_temperature 1.0 \
|
||||
--provider_name vmware \
|
||||
--path_to_vm /path/to/your/vm/Ubuntu.vmx \
|
||||
--max_steps 50 \
|
||||
--model_provider open_router \
|
||||
--model openai/o3 \
|
||||
--grounding_width 1920 \
|
||||
--grounding_height 1080 \
|
||||
--test_all_meta_path evaluation_examples/test_all.json \
|
||||
--result_dir ./results \
|
||||
--observation_type screenshot \
|
||||
--num_envs 1 \
|
||||
--region us-east-1 \
|
||||
--client_password osworld-public-evaluation
|
||||
```
|
||||
|
||||
## Acknowledgements
|
||||
|
||||
This work would not have been possible without building upon the foundations of several incredible open-source projects.
|
||||
|
||||
- **AWorld Framework**: We thank the developers of the [AWorld Framework](https://github.com/inclusionAI/AWorld) for providing a powerful and flexible platform for agent development. The AWorld Framework is designed for agent training and is especially suited for complex multi-agent scenarios. If you have requirements for designing or experimenting with multi-agent systems, we highly recommend you explore the AWorld Framework further.
|
||||
|
||||
- **Agent-S**: We extend our sincere gratitude to the creators of the [Agent-S project](https://github.com/simular-ai/Agent-S). The core agent logic in our implementation is adapted and enhanced from their codebase. We built upon their work by adding a suite of executable tools to improve the agent's interaction with the OS environment, which effectively boosted the stability and capability of our CUA Agent.
|
||||
|
||||
- **OSWorld Benchmark**: We are grateful to the creators of the [OSWorld Benchmark](https://os-world.github.io/) for developing a challenging and comprehensive testbed for GUI agents.
|
||||
99
mm_agents/aworldguiagent/agent.py
Normal file
@@ -0,0 +1,99 @@
|
||||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import logging
|
||||
import platform
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from mm_agents.aworldguiagent.grounding import ACI
|
||||
from mm_agents.aworldguiagent.workflow import Worker
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class UIAgent:
|
||||
"""Base class for UI automation agents"""
|
||||
|
||||
""""""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine_params: Dict,
|
||||
grounding_agent: ACI,
|
||||
platform: str = platform.system().lower(),
|
||||
):
|
||||
"""Initialize UIAgent
|
||||
|
||||
Args:
|
||||
engine_params: Configuration parameters for the LLM engine
|
||||
grounding_agent: Instance of ACI class for UI interaction
|
||||
platform: Operating system platform (macos, linux, windows)
|
||||
"""
|
||||
self.engine_params = engine_params
|
||||
self.grounding_agent = grounding_agent
|
||||
self.platform = platform
|
||||
|
||||
def reset(self) -> None:
|
||||
"""Reset agent state"""
|
||||
pass
|
||||
|
||||
def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
|
||||
"""Generate next action prediction
|
||||
|
||||
Args:
|
||||
instruction: Natural language instruction
|
||||
observation: Current UI state observation
|
||||
|
||||
Returns:
|
||||
Tuple containing agent info dictionary and list of actions
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
class AworldGUIAgent(UIAgent):
|
||||
"""Agent that uses no hierarchy for less inference time"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
engine_params: Dict,
|
||||
grounding_agent: ACI,
|
||||
platform: str = platform.system().lower(),
|
||||
max_trajectory_length: int = 8,
|
||||
enable_reflection: bool = True,
|
||||
):
|
||||
"""Initialize a minimalist AgentS2 without hierarchy
|
||||
|
||||
Args:
|
||||
engine_params: Configuration parameters for the LLM engine
|
||||
grounding_agent: Instance of ACI class for UI interaction
|
||||
platform: Operating system platform (darwin, linux, windows)
|
||||
max_trajectory_length: Maximum number of image turns to keep
|
||||
enable_reflection: Whether to create a reflection agent that assists the worker agent
|
||||
"""
|
||||
|
||||
super().__init__(engine_params, grounding_agent, platform)
|
||||
self.max_trajectory_length = max_trajectory_length
|
||||
self.enable_reflection = enable_reflection
|
||||
self.reset()
|
||||
|
||||
def reset(self) -> None:
|
||||
"""Reset agent state and initialize components"""
|
||||
self.executor = Worker(
|
||||
engine_params=self.engine_params,
|
||||
grounding_agent=self.grounding_agent,
|
||||
platform=self.platform,
|
||||
max_trajectory_length=self.max_trajectory_length,
|
||||
enable_reflection=self.enable_reflection,
|
||||
)
|
||||
|
||||
def predict(self, instruction: str, observation: Dict) -> Tuple[Dict, List[str]]:
|
||||
# Query the executor for the next action
|
||||
executor_info, actions = self.executor.generate_next_action(
|
||||
instruction=instruction, obs=observation
|
||||
)
|
||||
|
||||
# Only the executor info is produced here; copy it defensively
info = dict(executor_info or {})
|
||||
|
||||
return info, actions
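A minimal usage sketch of the agent above (assumptions: the ACI grounding class can be constructed without arguments, the grounding.py diff is suppressed below so its real constructor should be checked there, the first observation only needs raw screenshot bytes, and the file path and API key are placeholders):

```python
from mm_agents.aworldguiagent.agent import AworldGUIAgent
from mm_agents.aworldguiagent.grounding import ACI

# Keys below are the ones Worker reads from engine_params in workflow.py.
engine_params = {
    "engine_type": "openai",
    "model": "openai/o3",
    "base_url": "https://openrouter.ai/api/v1",
    "api_key": "YOUR_API_KEY",   # placeholder
    "temperature": 1.0,
}

agent = AworldGUIAgent(engine_params=engine_params, grounding_agent=ACI(), platform="linux")

with open("screenshot.png", "rb") as f:   # hypothetical path to a desktop screenshot
    obs = {"screenshot": f.read()}

info, actions = agent.predict("Open the Downloads folder", obs)
print(actions)
```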
|
||||
5252
mm_agents/aworldguiagent/grounding.py
Normal file
5252
mm_agents/aworldguiagent/grounding.py
Normal file
File diff suppressed because it is too large
Load Diff
947
mm_agents/aworldguiagent/prompt.py
Normal file
947
mm_agents/aworldguiagent/prompt.py
Normal file
@@ -0,0 +1,947 @@
|
||||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
GENERATOR_SYS_PROMPT = """You are an expert in graphical user interfaces and Python code. You are responsible for executing the task: `TASK_DESCRIPTION`.
|
||||
You are working in Ubuntu.
|
||||
You are provided with:
|
||||
1. A screenshot of the current time step.
|
||||
2. The history of your previous interactions with the UI.
|
||||
3. Access to the following class and methods to interact with the UI:
|
||||
class Agent:
|
||||
|
||||
def click(self, element_description: str, num_clicks: int = 1, button_type: str = 'left', hold_keys: List = []):
|
||||
'''Click on the element
|
||||
Args:
|
||||
element_description:str, a detailed description of which element to click on. This description should be at least a full sentence.
|
||||
num_clicks:int, number of times to click the element
|
||||
button_type:str, which mouse button to press; can be "left", "middle", or "right"
|
||||
hold_keys:List, list of keys to hold while clicking
|
||||
'''
|
||||
|
||||
def done(self, return_value: Union[Dict, str, List, Tuple, int, float, bool, NoneType] = None):
|
||||
'''End the current task with a success and the required return value'''
|
||||
|
||||
def drag_and_drop(self, starting_description: str, ending_description: str, hold_keys: List = []):
|
||||
'''Drag from the starting description to the ending description
|
||||
Args:
|
||||
starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
|
||||
ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
|
||||
hold_keys:List, list of keys to hold while dragging
|
||||
'''
|
||||
|
||||
def fail(self):
|
||||
'''End the current task with a failure, and replan the whole task.'''
|
||||
|
||||
def hold_and_press(self, hold_keys: List, press_keys: List):
|
||||
'''Hold a list of keys and press a list of keys
|
||||
Args:
|
||||
hold_keys:List, list of keys to hold
|
||||
press_keys:List, list of keys to press in a sequence
|
||||
'''
|
||||
|
||||
def hotkey(self, keys: List):
|
||||
'''Press a hotkey combination
|
||||
Args:
|
||||
keys:List, the keys to press in combination in a list format (e.g. ['ctrl', 'c'])
|
||||
'''
|
||||
|
||||
def open(self, app_or_filename: str):
|
||||
'''Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop; do not open them manually.
|
||||
Args:
|
||||
app_or_filename:str, the name of the application or filename to open
|
||||
'''
|
||||
|
||||
def save_to_knowledge(self, text: List[str]):
|
||||
'''Save facts, elements, texts, etc. to a long-term knowledge bank for reuse during this task. Can be used for copy-pasting text, saving elements, etc.
|
||||
Args:
|
||||
text:List[str] the text to save to the knowledge
|
||||
'''
|
||||
|
||||
def scroll(self, element_description: str, clicks: int, shift: bool = False):
|
||||
'''Scroll the element in the specified direction
|
||||
Args:
|
||||
element_description:str, a very detailed description of which element to scroll in. This description should be at least a full sentence.
|
||||
clicks:int, the number of clicks to scroll; can be positive (up) or negative (down).
|
||||
shift:bool, whether to use shift+scroll for horizontal scrolling
|
||||
'''
|
||||
|
||||
def set_cell_values(self, cell_values: Dict[str, Any], app_name: str, sheet_name: str):
|
||||
'''Use this to set individual cell values in a spreadsheet. For example, setting A2 to "hello" would be done by passing {"A2": "hello"} as cell_values. The sheet must be opened before this command can be used.
|
||||
Args:
|
||||
cell_values: Dict[str, Any], A dictionary of cell values to set in the spreadsheet. The keys are the cell coordinates in the format "A1", "B2", etc.
|
||||
Supported value types include: float, int, string, bool, formulas.
|
||||
app_name: str, The name of the spreadsheet application. For example, "Some_sheet.xlsx".
|
||||
sheet_name: str, The name of the sheet in the spreadsheet. For example, "Sheet1".
|
||||
'''
|
||||
|
||||
def switch_applications(self, app_code):
|
||||
'''Switch to a different application that is already open
|
||||
Args:
|
||||
app_code:str, the code name of the application to switch to from the provided list of open applications
|
||||
'''
|
||||
|
||||
def type(self, element_description: str, text: str = '', overwrite: bool = False, enter: bool = False):
|
||||
'''Type text into a specific element
|
||||
Args:
|
||||
element_description:str, a detailed description of which element to enter text in. This description should be at least a full sentence.
|
||||
text:str, the text to type
|
||||
overwrite:bool, assign it to True if the text should overwrite the existing text, otherwise assign it to False. Setting it to True clears all existing text in the element.
|
||||
enter:bool, Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.
|
||||
'''
|
||||
|
||||
def wait(self, time: float):
|
||||
'''Wait for a specified amount of time
|
||||
Args:
|
||||
time:float, the amount of time to wait in seconds
|
||||
'''
|
||||
|
||||
def code_launch_vscode(self, path):
|
||||
'''Launches Visual Studio Code with the specified file path or directory.
|
||||
Opens a file or directory in an existing window.

Args:
path (str): File path or directory.'''
|
||||
|
||||
def code_compare_files(self, file1, file2):
|
||||
'''Compares two files in VSCode.
|
||||
Args:
file1 (str): Path to the first file.
file2 (str): Path to the second file.'''
|
||||
|
||||
def code_add_folder(self, folder):
|
||||
'''Adds a folder to the last active window in VSCode.
|
||||
Args:
folder (str): Folder path.'''
|
||||
|
||||
def code_goto_file(self, file_path, line=1, character=1):
|
||||
'''Opens a file at a specific line and character position.
|
||||
Args:
file_path (str): File path.
line (int): Line number.
character (int): Character position.'''
|
||||
|
||||
def code_perform_merge(self, path1, path2, base, result):
|
||||
'''Perform a three-way merge.
|
||||
Args:
path1 (str): Path to the first version of the file.
path2 (str): Path to the second version of the file.
base (str): Path to the base version of the file.
result (str): Path where the merge result is saved.'''
|
||||
|
||||
def code_remove_folder(self, folder):
|
||||
'''Removes a folder from the last active window in VSCode.
|
||||
Args:
folder (str): Folder path.'''
|
||||
|
||||
def code_install_extension(self, extension_id, pre_release=False):
|
||||
'''Installs an extension or updates it in VSCode.
|
||||
Args:
extension_id (str): Identifier of the extension.
pre_release (bool): Whether to install the pre-release version.'''
|
||||
|
||||
def code_uninstall_extension(self, extension_id):
|
||||
'''Uninstalls an extension from VSCode.
|
||||
Args:
extension_id (str): Identifier of the extension.'''
|
||||
|
||||
def code_list_extensions(self, show_versions=False, category=None):
|
||||
'''Lists installed extensions in VSCode.
|
||||
Args:
show_versions (bool): Whether to show extension versions.
category (str): Filter extensions by category.'''
|
||||
|
||||
def code_update_extensions(self):
|
||||
'''Updates all installed extensions in VSCode to the latest version.'''
|
||||
|
||||
def code_disable_extension(self, extension_id):
|
||||
'''Disables a specific extension for the next instance of VSCode.
|
||||
Args:
extension_id (str): Identifier of the extension.'''
|
||||
|
||||
def code_toggle_sync(self, state):
|
||||
'''Toggles synchronization on or off in VSCode.
|
||||
Args:
state (str): 'on' or 'off' to turn synchronization on or off.'''
|
||||
|
||||
|
||||
def libreoffice_calc_save(self):
|
||||
'''Save the current workbook to its current location
|
||||
|
||||
Returns:
|
||||
bool: True if save successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_get_workbook_info(self):
|
||||
'''Get workbook information
|
||||
|
||||
Args:
|
||||
None
|
||||
|
||||
Returns:
|
||||
dict: Workbook information, including file path, file name, sheets and active sheet'''
|
||||
|
||||
def libreoffice_calc_get_column_data(self, column_name):
|
||||
'''Get data from the specified column
|
||||
|
||||
Args:
|
||||
column_name (str): Name of the column to read
|
||||
|
||||
Returns:
|
||||
list: List of values in the specified column'''
|
||||
|
||||
def libreoffice_calc_set_column_as_text(self, column_name):
|
||||
|
||||
'''
|
||||
Set the specified column format as text type.
|
||||
This will convert all numeric values in the column to text format and apply text formatting.
|
||||
|
||||
Args:
|
||||
column_name (str): The column name to format as text (e.g., 'A', 'B', 'C')
|
||||
|
||||
Returns:
|
||||
str: Success message or error description
|
||||
|
||||
Example:
|
||||
"Successfully set column A as text format"
|
||||
'''
|
||||
|
||||
def libreoffice_calc_get_active_sheet_data(self):
|
||||
|
||||
'''
|
||||
Get all data from the currently active sheet with detailed coordinate information.
|
||||
Returns data with cell addresses, values, row/column info, and empty cell indicators.
|
||||
|
||||
Returns:
|
||||
dict: Complete sheet data with detailed cell information
|
||||
|
||||
Example:
|
||||
{
|
||||
"data": [
|
||||
[
|
||||
{"address": "A1", "value": "", "row": 1, "col": 1, "col_name": "A", "is_empty": true},
|
||||
{"address": "B1", "value": "Age", "row": 1, "col": 2, "col_name": "B", "is_empty": false}
|
||||
],
|
||||
[
|
||||
{"address": "A2", "value": "Ryan", "row": 2, "col": 1, "col_name": "A", "is_empty": false},
|
||||
{"address": "B2", "value": 5.0, "row": 2, "col": 2, "col_name": "B", "is_empty": false}
|
||||
],
|
||||
[
|
||||
{"address": "A3", "value": "Jack", "row": 3, "col": 1, "col_name": "A", "is_empty": false},
|
||||
{"address": "B3", "value": 6.0, "row": 3, "col": 2, "col_name": "B", "is_empty": false}
|
||||
]
|
||||
],
|
||||
"rows": 3,
|
||||
"columns": 2,
|
||||
"range": "A1:B3"
|
||||
}
|
||||
'''
|
||||
|
||||
def libreoffice_calc_switch_active_sheet(self, sheet_name):
|
||||
'''Switch to the specified sheet and make it active, create if not exist
|
||||
|
||||
Args:
|
||||
sheet_name (str): Name of the sheet to switch to or create
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_set_column_values(self, column_name, data, start_index=2):
|
||||
'''Set data to the specified column
|
||||
|
||||
Args:
|
||||
column_name (str): Name of the column to write
|
||||
data (list): List of values to write to the column
|
||||
start_index (int): The index of the first row to write to, default is 2 (skip the first row)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_highlight_range(self, range_str, color=0xFF0000):
|
||||
'''highlight the specified range with the specified color
|
||||
|
||||
Args:
|
||||
range_str (str): Range to highlight, in the format of "A1:B10"
|
||||
color (int): Color to highlight with, default is 0xFF0000 (red)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_transpose_range(self, source_range, target_cell):
|
||||
'''Transpose the specified range and paste it to the target cell
|
||||
|
||||
Args:
|
||||
source_range (str): Range to transpose, in the format of "A1:B10"
|
||||
target_cell (str): Target cell to paste the transposed data, in the format of "A1"
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_export_to_csv(self):
|
||||
'''Export the current document to a CSV file
|
||||
|
||||
Args:
|
||||
None
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_sort_column(self, column_name, ascending=True, start_index=2):
|
||||
'''Sorts the data in the specified column in ascending or descending order
|
||||
|
||||
Args:
|
||||
column_name (str): The name of the column to sort (e.g. 'A') or the title
|
||||
ascending (bool): Whether to sort in ascending order (default True)
|
||||
start_index (int): The index of the first row to sort, default is 2 (skip the header row)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_set_validation_list(self, column_name, values):
|
||||
'''Set a validation list for the specified column
|
||||
|
||||
Args:
|
||||
column_name (str): The name of the column to set the validation list for
|
||||
values (list): The list of values to use for the validation list
|
||||
|
||||
Returns:
|
||||
None'''
|
||||
|
||||
def libreoffice_calc_hide_row_data(self, value="N/A"):
|
||||
'''Hide rows that contain the specified value
|
||||
|
||||
Args:
|
||||
value (str): The value to hide rows for, default is 'N/A'
|
||||
|
||||
Returns:
|
||||
None'''
|
||||
|
||||
def libreoffice_calc_reorder_columns(self, column_order):
|
||||
'''Reorder the columns in the sheet according to the specified order
|
||||
|
||||
Args:
|
||||
column_order (list): A list of column names in the desired order
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_create_pivot_table(self,
|
||||
source_sheet,
|
||||
table_name,
|
||||
row_fields=None,
|
||||
col_fields=None,
|
||||
value_fields=None,
|
||||
aggregation_function="sum",
|
||||
target_cell="A1",
|
||||
):
|
||||
'''Create a pivot table in the active worksheet based on data from the active sheet.'''
|
||||
|
||||
def libreoffice_calc_merge_cells(self, sheet_name, range_str):
|
||||
'''Merges a specified range of cells within a specific worksheet.
|
||||
|
||||
This function connects to a running LibreOffice Calc instance,
|
||||
selects a worksheet by its name, and merges the cells defined
|
||||
by the given range string.
|
||||
|
||||
Args:
|
||||
sheet_name (str): The name of the worksheet where the cells will be
|
||||
merged, e.g., 'Sheet1' or 'Q4_Report'.
|
||||
range_str (str): The cell range to merge, specified in A1 notation,
|
||||
e.g., 'A1:B10'.
|
||||
|
||||
Returns:
|
||||
bool: True if the cells were successfully merged, False if an
|
||||
error occurred.
|
||||
'''
|
||||
|
||||
def libreoffice_calc_set_cell_value(self, cell, value):
|
||||
'''Set a value to a specific cell in the active worksheet.
|
||||
|
||||
Args:
|
||||
cell (str): Cell reference (e.g., 'A1')
|
||||
value (str): Value to set in the cell
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_format_range(self, range_str, background_color=None, font_color=None, bold=None, alignment=None):
|
||||
'''Apply formatting to the specified range in the active worksheet
|
||||
|
||||
Args:
|
||||
range_str (str): Range to format, in the format of 'A1:B10'
|
||||
background_color (str, optional): Background color in hex format (e.g., '#0000ff')
|
||||
font_color (str, optional): Font color in hex format (e.g., '#ffffff')
|
||||
bold (bool, optional): Whether to make the text bold
|
||||
italic (bool, optional): Whether to make the text italic
|
||||
alignment (str, optional): Text alignment (left, center, right)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_freeze_panes(self, rows=0, columns=0):
|
||||
'''Freeze rows and/or columns in the active sheet

Args:
rows (int): Number of rows to freeze from the top
columns (int): Number of columns to freeze from the left

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_rename_sheet(self, old_name, new_name):
|
||||
'''Rename a sheet

Args:
old_name (str): Current name of the sheet to rename
new_name (str): New name for the sheet

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_copy_sheet(self, source_sheet, new_sheet_name=None):
|
||||
'''Create a copy of an existing sheet in the workbook

Args:
source_sheet (str): Name of the sheet to copy
new_sheet_name (str, optional): Name for the new copy; auto-generated if not provided

Returns:
str: Name of the newly created sheet, or None on failure'''
|
||||
|
||||
def libreoffice_calc_reorder_sheets(self, sheet_name, position):
|
||||
'''Reorder a sheet's position within the workbook

Args:
sheet_name (str): Name of the sheet to move
position (int): Position to move it to (0-based index)

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_set_chart_legend_position(self, position):
|
||||
'''Set the position of the legend in a chart in the active worksheet.
|
||||
|
||||
Args:
|
||||
position (str): Position of the legend ('top', 'bottom', 'left', 'right', 'none')
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_set_number_format(self, range_str, format_type, decimal_places=None):
|
||||
'''Apply a specific number format to a range of cells in the active worksheet.
|
||||
|
||||
Args:
|
||||
range_str (str): Range to format, in the format of 'A1:B10'
|
||||
format_type (str): Type of number format to apply
|
||||
decimal_places (int, optional): Number of decimal places to display
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_adjust_column_width(self, columns, width=None, autofit=False):
|
||||
'''Adjust the width of the specified columns in the active sheet

Args:
columns (str): Column range to adjust, e.g. 'A:C' for columns A through C
width (float, optional): Width to set (in characters)
autofit (bool, optional): Whether to auto-fit the column width to the content

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_adjust_row_height(self, rows, height=None, autofit=False):
|
||||
'''Adjust the height of the specified rows in the active sheet

Args:
rows (str): Row range to adjust, e.g. '1:10' for rows 1 through 10
height (float, optional): Height to set (in points)
autofit (bool, optional): Whether to auto-fit the row height to the content

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_export_to_pdf(self, file_path=None, sheets=None, open_after_export=False):
|
||||
'''Export the current document or specified sheets to a PDF file

Args:
file_path (str, optional): Path to save the PDF file; defaults to the current document path
sheets (list, optional): List of sheet names to include in the PDF; all sheets if not specified
open_after_export (bool, optional): Whether to open the PDF file after exporting

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_calc_set_zoom_level(self, zoom_percentage):
|
||||
'''Adjust the zoom level of the current sheet so cells appear larger or smaller

Args:
zoom_percentage (int): Zoom level as a percentage (e.g., 75 for 75%, 100 for normal size, 150 to zoom in).
The valid range is typically 10-400.

Returns:
bool: True if successful, False otherwise'''
|
||||
|
||||
|
||||
def libreoffice_impress_save(self):
|
||||
'''Save the document to its current location'''
|
||||
|
||||
def libreoffice_impress_go_to_slide(self, slide_index):
|
||||
'''Navigates to a specific slide in the presentation based on its index.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide to navigate to (1-based indexing)
|
||||
|
||||
Returns:
|
||||
bool: True if navigation was successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_get_slide_count(self):
|
||||
'''Gets the total number of slides in the current presentation.
|
||||
:return: The total number of slides as an integer'''
|
||||
|
||||
def libreoffice_impress_duplicate_slide(self, slide_index):
|
||||
'''Creates a duplicate of a specific slide and places it at the end of the presentation.
|
||||
|
||||
:param slide_index: The index of the slide to duplicate (1-based indexing)
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_slide_font(self, slide_index, font_name):
|
||||
'''Sets the font style for all text elements in a specific slide, including the title.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide to modify (1-based indexing)
|
||||
font_name (str): The name of the font to apply (e.g., 'Arial', 'Times New Roman', 'Calibri')
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_write_text(self, content, page_index, box_index, bold=False, italic=False, size=None, append=False):
|
||||
'''Writes text to a specific textbox on a slide
|
||||
|
||||
:param content: The text content to add
|
||||
:param page_index: The index of the slide (1-based indexing)
|
||||
:param box_index: The index of the textbox to modify (0-based indexing)
|
||||
:param bold: Whether to make the text bold, default is False
|
||||
:param italic: Whether to make the text italic, default is False
|
||||
:param size: The size of the text. If None, uses the box's current font size.
|
||||
:param append: Whether to append the text, default is False. Set it to True if you want to keep the original text or preserve existing formatting (such as a leading bullet).
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_style(self, slide_index, box_index, bold=None, italic=None, underline=None):
|
||||
'''Sets the style properties for the specified textbox on a slide.
|
||||
|
||||
:param slide_index: The index of the slide to modify (1-based indexing)
|
||||
:param box_index: The index of the textbox to modify (0-based indexing)
|
||||
:param bold: Whether to make the text bold
|
||||
:param italic: Whether to make the text italic
|
||||
:param underline: Whether to underline the text
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_configure_auto_save(self, enabled, interval_minutes):
|
||||
'''Enables or disables auto-save functionality for the current document and sets the auto-save interval.
|
||||
|
||||
:param enabled: Whether to enable (True) or disable (False) auto-save
|
||||
:param interval_minutes: The interval in minutes between auto-saves (minimum 1 minute)
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_background_color(self, slide_index, box_index, color):
|
||||
'''Sets the background color for the specified textbox on a slide.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide containing the textbox (1-based indexing)
|
||||
box_index (int): The index of the textbox to modify (0-based indexing)
|
||||
color (str): The color to apply to the textbox (e.g., 'red', 'green', 'blue', 'yellow', or hex color code)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_text_color(self, slide_index, box_index, color):
|
||||
'''Sets the text color for the specified textbox on a slide.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide to modify (1-based indexing)
|
||||
box_index (int): The index of the textbox to modify (0-based indexing)
|
||||
color (str): The color to apply to the text (e.g., 'red', 'green', 'blue', 'black', or hex color code)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_delete_content(self, slide_index, box_index):
|
||||
'''Deletes the specified textbox from a slide.
|
||||
|
||||
:param slide_index: The index of the slide to modify (1-based indexing)
|
||||
:param box_index: The index of the textbox to modify (0-based indexing)
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_slide_orientation(self, orientation):
|
||||
'''Changes the orientation of slides in the presentation between portrait (upright) and landscape (sideways).
|
||||
|
||||
:param orientation: The desired orientation for the slides ('portrait' or 'landscape')
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_position_box(self, slide_index, box_index, position):
|
||||
'''Positions a textbox or image on a slide at a specific location or predefined position.
|
||||
|
||||
:param slide_index: The index of the slide containing the box (1-based indexing)
|
||||
:param box_index: The index of the box to position (0-based indexing)
|
||||
:param position: Predefined position on the slide (left, right, center, top, bottom, etc.)
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_insert_file(self, file_path, slide_index=None, position=None, size=None, autoplay=False):
|
||||
'''Inserts a video file into the current or specified slide in the presentation.
|
||||
|
||||
Args:
|
||||
file_path (str): The full path to the video file to be inserted
|
||||
slide_index (int, optional): The index of the slide to insert the video into (1-based indexing).
|
||||
If not provided, inserts into the current slide.
|
||||
position (dict, optional): The position coordinates for the video as percentages of slide dimensions
|
||||
{'x': float, 'y': float}
|
||||
size (dict, optional): The size dimensions for the video as percentages of slide dimensions
|
||||
{'width': float, 'height': float}
|
||||
autoplay (bool, optional): Whether the video should automatically play when the slide is shown
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_slide_background(self, slide_index=None, color=None, image_path=None):
|
||||
'''Sets the background color or image for a specific slide or all slides.
|
||||
|
||||
Args:
|
||||
slide_index (int, optional): The index of the slide to modify (1-based indexing).
|
||||
If not provided, applies to all slides.
|
||||
color (str, optional): The background color to apply (e.g., 'red', 'green', 'blue', or hex color code)
|
||||
image_path (str, optional): Path to an image file to use as background. If provided, overrides color.
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_save_as(self, file_path, overwrite=False):
|
||||
'''Saves the current document to a specified location with a given filename.
|
||||
|
||||
:param file_path: The full path where the file should be saved, including the filename and extension
|
||||
:param overwrite: Whether to overwrite the file if it already exists (default: False)
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_insert_image(self, slide_index, image_path, width=None, height=None, position=None):
|
||||
'''Inserts an image to a specific slide in the presentation.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide to add the image to (1-based indexing)
|
||||
image_path (str): The full path to the image file to be added
|
||||
width (float, optional): The width of the image in centimeters
|
||||
height (float, optional): The height of the image in centimeters
|
||||
position (dict, optional): The position coordinates for the image as percentages
|
||||
{
|
||||
'x': float, # The x-coordinate as a percentage of slide width
|
||||
'y': float # The y-coordinate as a percentage of slide height
|
||||
}
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_configure_display_settings(self, use_presenter_view=None, primary_monitor_only=None, monitor_for_presentation=None
|
||||
):
|
||||
'''Configures the display settings for LibreOffice Impress presentations.
|
||||
|
||||
Args:
|
||||
use_presenter_view (bool, optional): Whether to use presenter view. Set to false to disable presenter view.
|
||||
primary_monitor_only (bool, optional): Whether to use only the primary monitor for the presentation.
|
||||
monitor_for_presentation (int, optional): Specify which monitor to use (1 for primary, 2 for secondary, etc.)
|
||||
|
||||
Returns:
|
||||
bool: True if settings were successfully applied, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_text_strikethrough(self, slide_index, box_index, line_numbers, apply):
|
||||
'''Applies or removes strike-through formatting to specific text content in a slide.
|
||||
|
||||
Args:
|
||||
slide_index (int): The index of the slide containing the text (1-based indexing)
|
||||
box_index (int): The index of the textbox containing the text (0-based indexing)
|
||||
line_numbers (list): The line numbers to apply strike-through formatting to (1-based indexing)
|
||||
apply (bool): Whether to apply (true) or remove (false) strike-through formatting
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_textbox_alignment(self, slide_index, box_index, alignment):
|
||||
'''Sets the text alignment for the specified textbox on a slide.
|
||||
|
||||
:param slide_index: The index of the slide to modify (1-based indexing)
|
||||
:param box_index: The index of the textbox to modify (0-based indexing)
|
||||
:param alignment: The text alignment to apply ('left', 'center', 'right', or 'justify')
|
||||
:return: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_set_slide_number_color(self, color):
|
||||
'''Sets the color of the slide number in the presentation.
|
||||
|
||||
Args:
|
||||
color (str): The color to apply to slide numbers (e.g., 'red', 'green', 'blue', 'black', or hex color code)
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_impress_export_to_image(self, file_path, format, slide_index=None):
|
||||
'''Exports the current presentation or a specific slide to an image file format.
|
||||
|
||||
Args:
|
||||
file_path (str): The full path where the image file should be saved, including the filename and extension
|
||||
format (str): The image format to export to (e.g., 'png', 'jpeg', 'gif')
|
||||
slide_index (int, optional): The index of the specific slide to export (1-based indexing).
|
||||
If not provided, exports the entire presentation as a series of images.
|
||||
|
||||
Returns:
|
||||
bool: True if export was successful, False otherwise'''
|
||||
|
||||
|
||||
def libreoffice_writer_save(self):
|
||||
'''Save the document to its current location'''
|
||||
|
||||
def libreoffice_writer_write_text(self, text, bold=False, italic=False, size=None):
|
||||
'''Write text into the document'''
|
||||
|
||||
def libreoffice_writer_set_color(self, pattern, color, paragraph_indices=None):
|
||||
'''Changes the color of matched text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
pattern (str): Regular expression pattern to match text
|
||||
color (int): Hex color code (e.g., 0x000000 for black)
|
||||
paragraph_indices (list, optional): List of paragraph indices to modify (0-based).
|
||||
If None, applies to all paragraphs.'''
|
||||
|
||||
def libreoffice_writer_find_and_replace(self, pattern, replacement, paragraph_indices=None):
|
||||
'''Finds all occurrences of a specified text pattern and replaces them with another text in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The pattern to match in the document, should be a regular expression
|
||||
replacement (str): The text to replace the found text with
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing)
|
||||
|
||||
Returns:
|
||||
str: Success message with number of replacements made'''
|
||||
|
||||
def libreoffice_writer_set_font(self, font_name, paragraph_indices=None):
|
||||
'''Changes the font of text in the document or specified paragraphs.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to apply (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.'''
|
||||
|
||||
def libreoffice_writer_set_line_spacing(self, spacing_value, paragraph_indices=None):
|
||||
'''Sets the line spacing for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
spacing_value (float): The line spacing value to apply (1.0 for single spacing, 2.0 for double spacing, etc.)
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.'''
|
||||
|
||||
def libreoffice_writer_remove_highlighting(self, paragraph_indices=None):
|
||||
'''Removes ALL highlighting from text in the document for specified paragraphs.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_find_highlighted_text(self, highlight_color):
|
||||
'''Finds all text in the document that has a specific highlight color applied to it.
|
||||
|
||||
Args:
|
||||
highlight_color (str): The highlight color to search for. Can be a color name (e.g., 'yellow', 'green') or hex code.
|
||||
|
||||
Returns:
|
||||
list: A list of strings containing all text segments with the specified highlight color.'''
|
||||
|
||||
def libreoffice_writer_insert_formula_at_cursor(self, formula):
|
||||
'''Inserts a formula at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
formula (str): The formula to insert at the current cursor position.
|
||||
|
||||
Returns:
|
||||
bool: True if successful, False otherwise'''
|
||||
|
||||
def libreoffice_writer_insert_image_at_cursor(self, image_path, width=None, height=None):
|
||||
'''Inserts an image at the current cursor position in the document.
|
||||
|
||||
Args:
|
||||
image_path (str): Full path to the image file to insert
|
||||
width (int, optional): Width to display the image in pixels
|
||||
height (int, optional): Height to display the image in pixels
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_set_strikethrough(self, pattern, paragraph_indices=None):
|
||||
'''Sets the strikethrough formatting for text matching the specified pattern in the document.
|
||||
|
||||
Args:
|
||||
pattern (str): The regular expression pattern to match in the document
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error information'''
|
||||
|
||||
def libreoffice_writer_set_font_size(self, font_size, pattern, paragraph_indices=None):
|
||||
'''Changes the font size of specified text in the document.
|
||||
|
||||
Args:
|
||||
font_size (float): The font size to apply (in points).
|
||||
pattern (str): The pattern to match in the document, should be a regular expression.
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Result message indicating success or failure.'''
|
||||
|
||||
def libreoffice_writer_export_to_pdf(self, output_path=None, output_filename=None, include_comments=False, quality="standard"):
|
||||
'''Exports the current document to PDF format.
|
||||
|
||||
Args:
|
||||
output_path (str, optional): The full path where the PDF should be saved.
|
||||
If not provided, uses the same location as the original document.
|
||||
output_filename (str, optional): The filename to use for the PDF.
|
||||
If not provided, uses the original document's filename with .pdf extension.
|
||||
include_comments (bool, optional): Whether to include comments in the exported PDF.
|
||||
Defaults to False.
|
||||
quality (str, optional): The quality of the PDF export ('standard', 'high', 'print').
|
||||
Defaults to 'standard'.
|
||||
|
||||
Returns:
|
||||
str: Path to the exported PDF file or error message'''
|
||||
|
||||
def libreoffice_writer_set_paragraph_alignment(self, alignment, paragraph_indices=None):
|
||||
'''Sets the text alignment for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
alignment (str): The alignment to apply ('left', 'center', 'right', 'justify').
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_capitalize_words(self, paragraph_indices=None):
|
||||
'''Capitalizes the first letter of each word for specified paragraphs in the document.
|
||||
|
||||
Args:
|
||||
paragraph_indices (list, optional): Indices of paragraphs to modify (0-based indexing).
|
||||
If not provided, applies to all paragraphs.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_set_default_font(self, font_name, font_size=None):
|
||||
'''Sets the default font for new text in the document without changing existing text.
|
||||
|
||||
Args:
|
||||
font_name (str): The name of the font to set as default (e.g., 'Times New Roman', 'Arial', 'Calibri')
|
||||
font_size (float, optional): The default font size in points.
|
||||
|
||||
Returns:
|
||||
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_add_page_numbers(self, position, start_number=1, format=None):
|
||||
'''Adds page numbers to the document at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Position of the page numbers ('bottom_left', 'bottom_center', 'bottom_right',
|
||||
'top_left', 'top_center', 'top_right')
|
||||
start_number (int, optional): The starting page number. Defaults to 1.
format (str, optional): Format of the page numbers (e.g., '1', 'Page 1', '1 of N').
Defaults to simple number format.

Returns:
str: Success message or error message'''
|
||||
|
||||
def libreoffice_writer_insert_page_break(self, position="at_cursor"):
|
||||
'''Inserts a page break at the specified position.
|
||||
|
||||
Args:
|
||||
position (str): Where to insert the page break: 'at_cursor' for current cursor position,
|
||||
'end_of_document' for end of document. Defaults to 'at_cursor'.'''
|
||||
|
||||
Your response should be formatted like this:
|
||||
(Previous action verification)
|
||||
Carefully analyze based on the screenshot if the previous action was successful. If the previous action was not successful, provide a reason for the failure.
|
||||
|
||||
(Screenshot Analysis)
|
||||
Closely examine and describe the current state of the desktop along with the currently open applications.
|
||||
|
||||
(Next Action)
|
||||
Based on the current screenshot and the history of your previous interaction with the UI, decide on the next action in natural language to accomplish the given task.
|
||||
|
||||
(Grounded Action)
|
||||
Translate the next action into code using the provided API methods. Format the code like this:
|
||||
```python
|
||||
agent.click("The menu button at the top right of the window", 1, "left")
|
||||
```
|
||||
Note for the code:
|
||||
1. Only perform one action at a time.
|
||||
2. Do not put anything other than python code in the block. You can only use one function call at a time. Do not put more than one function call in the block.
|
||||
3. You must use only the available methods provided above to interact with the UI, do not invent new methods.
|
||||
4. Only return one code block every time. There must be a single line of code in the code block.
|
||||
5. Do not do anything other than the exact specified task. Return with `agent.done()` immediately after the subtask is completed or `agent.fail()` if it cannot be completed.
|
||||
6. Whenever possible, your grounded action should use hot-keys with the agent.hotkey() action instead of clicking or dragging.
|
||||
7. My computer's password is 'osworld-public-evaluation', feel free to use it when you need sudo rights.
|
||||
8. Before performing any calculations on elements in a table or inserting charts, always use libreoffice_calc_get_column_data or libreoffice_calc_get_active_sheet_data to obtain accurate column coordinates and element values from the table, ensuring precise execution of subsequent calculations or chart insertions.
|
||||
9. Generate agent.fail() as your grounded action if you are completely stuck on the task and believe it is impossible.
|
||||
10. Generate agent.done() as your grounded action when you believe the task is fully complete.
|
||||
11. Do not use the "command" + "tab" hotkey on MacOS.
|
||||
"""
|
||||
|
||||
|
||||
REFLECTION_SYS_PROMPT = """
|
||||
You are an expert computer use agent designed to reflect on the trajectory of a task and provide feedback on what has happened so far.
|
||||
You have access to the Task Description and the Current Trajectory of another computer agent. The Current Trajectory is a sequence of a desktop image, chain-of-thought reasoning, and a desktop action for each time step. The last image is the screen's display after the last action.
|
||||
Your task is to generate a reflection. Your generated reflection must fall under one of the cases listed below:
|
||||
|
||||
**Your judgment must be based solely on a critical comparison between the agent's stated plan/reasoning and the visual evidence presented in the screenshot history.** Do not take the agent's claims of success at face value. **If there is no visual proof in the screenshot, the action did not happen.**
|
||||
|
||||
Case 1. The trajectory is not going according to plan. This occurs when there is a mismatch between the intended action and the visual outcome, when the agent hallucinates information, or when it is stuck. You must trigger Case 1 if you detect any of the following:
|
||||
Failed Action: The previous action did not produce its expected visual change on the screen (e.g., a window failed to open, text was not pasted).
|
||||
Unsupported Conclusion (Hallucination): The agent makes a claim or states a result (like a number or a fact) that is not visibly supported by the current or any previous screenshot. This is a critical failure.
|
||||
Repetitive Cycle: The agent is repeating actions without making meaningful progress.
|
||||
Case 2. The trajectory is going according to plan. In this case, simply tell the agent to continue proceeding as planned. DO NOT encourage a specific action in particular.
|
||||
Case 3. You believe the current task has been completed. In this case, tell the agent that the task has been successfully completed.
|
||||
|
||||
To be successful, you must follow the rules below:
|
||||
- **Your output MUST be based on one of the case options above**.
|
||||
- DO NOT suggest any specific future plans or actions. Your only goal is to provide a reflection, not an actual plan or action.
|
||||
- Any response that falls under Case 1 should explain why the trajectory is not going according to plan. You should especially look out for cycles of actions that are continually repeated with no progress.
|
||||
- Any response that falls under Case 2 should be concise, since you just need to affirm the agent to continue with the current trajectory.
|
||||
"""
|
||||
194
mm_agents/aworldguiagent/utils.py
Normal file
194
mm_agents/aworldguiagent/utils.py
Normal file
@@ -0,0 +1,194 @@
|
||||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import re
|
||||
import base64
|
||||
from aworld.core.common import Observation, ActionModel
|
||||
from aworld.models.model_response import ModelResponse
|
||||
from aworld.core.agent.base import AgentResult
|
||||
from aworld.memory.main import InMemoryMemoryStore
|
||||
|
||||
def encode_image(image_content):
|
||||
# image_content may be a filesystem path to an image or raw bytes; check its type to decide how to handle it
|
||||
if isinstance(image_content, str):
|
||||
with open(image_content, "rb") as image_file:
|
||||
return base64.b64encode(image_file.read()).decode("utf-8")
|
||||
else:
|
||||
return base64.b64encode(image_content).decode("utf-8")
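A small sketch of the two input forms encode_image accepts (the path is hypothetical; workflow.py passes the raw bytes from obs["screenshot"] directly):

```python
b64_from_bytes = encode_image(b"\x89PNG\r\n\x1a\n")     # bytes (PNG signature only, for illustration) are base64-encoded directly
b64_from_path = encode_image("/tmp/screenshot.png")     # a str is treated as a file path and read from disk
data_url = "data:image/png;base64," + b64_from_bytes    # how workflow.py builds the image URL
```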
|
||||
|
||||
|
||||
def extract_first_agent_function(code_string):
|
||||
# Regular expression pattern to match 'agent' functions with any arguments, including nested parentheses
|
||||
pattern = r'agent\.[a-zA-Z_]+\((?:[^()\'"]|\'[^\']*\'|"[^"]*")*\)'
|
||||
|
||||
# Find all matches in the string
|
||||
matches = re.findall(pattern, code_string)
|
||||
|
||||
# Return the first match if found, otherwise return None
|
||||
return matches[0] if matches else None
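An illustrative call (the reply text is made up): the regex pulls the first agent.* invocation out of a longer model reply:

```python
reply = 'I will submit the form now.\nagent.click("The blue Submit button at the bottom of the form", 1, "left")\nThat is all.'
print(extract_first_agent_function(reply))
# -> agent.click("The blue Submit button at the bottom of the form", 1, "left")
```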
|
||||
|
||||
|
||||
def parse_single_code_from_string(input_string):
|
||||
input_string = input_string.strip()
|
||||
if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
|
||||
return input_string.strip()
|
||||
|
||||
# This regular expression will match both ```code``` and ```python code```
|
||||
# and capture the `code` part. It uses a non-greedy match for the content inside.
|
||||
pattern = r"```(?:\w+\s+)?(.*?)```"
|
||||
# Find all non-overlapping matches in the string
|
||||
matches = re.findall(pattern, input_string, re.DOTALL)
|
||||
|
||||
# The regex above captures the content inside the triple backticks.
|
||||
# The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
|
||||
# so the code inside backticks can span multiple lines.
|
||||
|
||||
# matches now contains all the captured code snippets
|
||||
|
||||
codes = []
|
||||
|
||||
for match in matches:
|
||||
match = match.strip()
|
||||
commands = [
|
||||
"WAIT",
|
||||
"DONE",
|
||||
"FAIL",
|
||||
] # FIXME: update this part when we have more commands
|
||||
|
||||
if match in commands:
|
||||
codes.append(match.strip())
|
||||
elif match.split("\n")[-1] in commands:
|
||||
if len(match.split("\n")) > 1:
|
||||
codes.append("\n".join(match.split("\n")[:-1]))
|
||||
codes.append(match.split("\n")[-1])
|
||||
else:
|
||||
codes.append(match)
|
||||
|
||||
if len(codes) <= 0:
|
||||
return "fail"
|
||||
return codes[0]
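An illustrative call (the reply text is made up): fenced blocks, with or without a language tag, are unwrapped, and bare control tokens such as WAIT/DONE/FAIL pass through unchanged:

```python
fence = "`" * 3  # avoids embedding a literal triple backtick in this example
reply = f"(Grounded Action)\n{fence}python\nagent.hotkey(['ctrl', 's'])\n{fence}"
print(parse_single_code_from_string(reply))   # -> agent.hotkey(['ctrl', 's'])
print(parse_single_code_from_string("DONE"))  # -> DONE
```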
|
||||
|
||||
|
||||
def sanitize_code(code):
|
||||
# This pattern captures the outermost double-quoted text
|
||||
if "\n" in code:
|
||||
pattern = r'(".*?")'
|
||||
# Find all matches in the text
|
||||
matches = re.findall(pattern, code, flags=re.DOTALL)
|
||||
if matches:
|
||||
# Replace the first occurrence only
|
||||
first_match = matches[0]
|
||||
code = code.replace(first_match, f'"""{first_match[1:-1]}"""', 1)
|
||||
return code
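An illustrative call: when a grounded action spans multiple lines, the first double-quoted argument is re-wrapped in triple quotes so that eval() can accept the raw newline inside it:

```python
code = 'agent.save_to_knowledge(["first line\nsecond line"])'  # the \n becomes a real newline
print(sanitize_code(code))  # the first double-quoted argument is now wrapped in triple quotes
```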
|
||||
|
||||
def prune_image_messages(memory_store: InMemoryMemoryStore, max_trajectory_length: int):
|
||||
"""
|
||||
Inspect the messages in memory_store and keep only the latest max_trajectory_length messages that contain images.
For older messages that contain images, the image parts are removed from their content.

Args:
memory_store (InMemoryMemoryStore): The in-memory store instance.
max_trajectory_length (int): Maximum number of image-bearing messages to keep.
|
||||
"""
|
||||
# Step 1: Fetch all messages via memory_store.get_all()
|
||||
all_items = memory_store.get_all()
|
||||
|
||||
# Step 2: Collect all messages whose content contains an image
|
||||
image_messages = []
|
||||
for item in all_items:
|
||||
if isinstance(item.content, list):
|
||||
if any(isinstance(part, dict) and part.get('type') == 'image_url' for part in item.content):
|
||||
image_messages.append(item)
|
||||
|
||||
# Step 3: Check whether the number of image messages exceeds the limit
|
||||
if len(image_messages) <= max_trajectory_length:
|
||||
print("Number of image messages does not exceed the limit. No pruning needed.")
|
||||
return
|
||||
|
||||
# Step 4: Determine the older messages whose images should be removed
# get_all() returns items in insertion order, so the earliest items are the oldest
|
||||
num_to_prune = len(image_messages) - max_trajectory_length
|
||||
messages_to_prune = image_messages[:num_to_prune]
|
||||
|
||||
print(f"Found {len(image_messages)} image messages. Pruning the oldest {num_to_prune}.")
|
||||
|
||||
# Step 5: Walk the messages to prune, rewrite their content, and persist via the store's update method
|
||||
for item_to_prune in messages_to_prune:
|
||||
|
||||
# Build a new content list containing only the non-image parts
|
||||
new_content = [
|
||||
part for part in item_to_prune.content
|
||||
if not (isinstance(part, dict) and part.get('type') == 'image_url')
|
||||
]
|
||||
|
||||
# Optional: if only a single text element remains, collapse it to a plain string
|
||||
if len(new_content) == 1 and new_content[0].get('type') == 'text':
|
||||
final_content = new_content[0].get('text', '')
|
||||
else:
|
||||
final_content = new_content
|
||||
|
||||
# Update the message object's content attribute
|
||||
item_to_prune.content = final_content
|
||||
|
||||
# Persist the change back to the store via memory_store.update
|
||||
memory_store.update(item_to_prune)
|
||||
|
||||
print(f"Pruned image from message with ID: {item_to_prune.id}")
|
||||
|
||||
def reps_action_result(resp: ModelResponse) -> AgentResult:
|
||||
try:
|
||||
full_response = resp.content
|
||||
# Extract thoughts section
|
||||
thoughts_match = re.search(
|
||||
r"<thoughts>(.*?)</thoughts>", full_response, re.DOTALL
|
||||
)
|
||||
thoughts = thoughts_match.group(1).strip()
|
||||
# Extract answer section
|
||||
answer_match = re.search(r"<answer>(.*?)</answer>", full_response, re.DOTALL)
|
||||
answer = answer_match.group(1).strip()
|
||||
action = ActionModel(action_name=answer, policy_info=thoughts)
|
||||
return AgentResult(actions=[action], current_state=None)
|
||||
except Exception as e:
|
||||
action = ActionModel(action_name=resp.content, policy_info="")
|
||||
return AgentResult(actions=[action], current_state=None)
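A minimal sketch of the expected model output, using SimpleNamespace as a stand-in for ModelResponse (only the .content attribute is read above):

```python
from types import SimpleNamespace

fake_resp = SimpleNamespace(content=(
    "<thoughts>The dialog is open, so the OK button can be clicked.</thoughts>"
    "<answer>agent.click(\"The OK button in the dialog\", 1, \"left\")</answer>"
))
result = reps_action_result(fake_resp)
# result.actions[0].action_name -> 'agent.click("The OK button in the dialog", 1, "left")'
# result.actions[0].policy_info -> 'The dialog is open, so the OK button can be clicked.'
```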
|
||||
|
||||
def parse_single_code_from_string(input_string):
|
||||
input_string = input_string.strip()
|
||||
if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
|
||||
return input_string.strip()
|
||||
|
||||
# This regular expression will match both ```code``` and ```python code```
|
||||
# and capture the `code` part. It uses a non-greedy match for the content inside.
|
||||
pattern = r"```(?:\w+\s+)?(.*?)```"
|
||||
# Find all non-overlapping matches in the string
|
||||
matches = re.findall(pattern, input_string, re.DOTALL)
|
||||
|
||||
# The regex above captures the content inside the triple backticks.
|
||||
# The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
|
||||
# so the code inside backticks can span multiple lines.
|
||||
|
||||
# matches now contains all the captured code snippets
|
||||
|
||||
codes = []
|
||||
|
||||
for match in matches:
|
||||
match = match.strip()
|
||||
commands = [
|
||||
"WAIT",
|
||||
"DONE",
|
||||
"FAIL",
|
||||
] # FIXME: update this part when we have more commands
|
||||
|
||||
if match in commands:
|
||||
codes.append(match.strip())
|
||||
elif match.split("\n")[-1] in commands:
|
||||
if len(match.split("\n")) > 1:
|
||||
codes.append("\n".join(match.split("\n")[:-1]))
|
||||
codes.append(match.split("\n")[-1])
|
||||
else:
|
||||
codes.append(match)
|
||||
|
||||
if len(codes) <= 0:
|
||||
return "fail"
|
||||
return codes[0]
|
||||
230
mm_agents/aworldguiagent/workflow.py
Normal file
230
mm_agents/aworldguiagent/workflow.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
This code is adapted from AgentS2 (https://github.com/simular-ai/Agent-S)
|
||||
with modifications to suit specific requirements.
|
||||
"""
|
||||
import logging
|
||||
import textwrap
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
from aworld.config.conf import AgentConfig
|
||||
from aworld.agents.llm_agent import Agent
|
||||
from aworld.core.common import Observation
|
||||
|
||||
from aworld.core.task import Task
|
||||
from aworld.core.context.base import Context
|
||||
from aworld.core.event.base import Message
|
||||
from aworld.models.llm import get_llm_model
|
||||
from aworld.utils.common import sync_exec
|
||||
|
||||
from mm_agents.aworldguiagent.grounding import ACI
|
||||
from mm_agents.aworldguiagent.prompt import GENERATOR_SYS_PROMPT, REFLECTION_SYS_PROMPT
|
||||
from mm_agents.aworldguiagent.utils import encode_image, extract_first_agent_function, parse_single_code_from_string, sanitize_code
|
||||
from mm_agents.aworldguiagent.utils import prune_image_messages, reps_action_result
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
class Worker:
|
||||
def __init__(
|
||||
self,
|
||||
engine_params: Dict,
|
||||
grounding_agent: ACI,
|
||||
platform: str = "ubuntu",
|
||||
max_trajectory_length: int = 16,
|
||||
enable_reflection: bool = True,
|
||||
):
|
||||
"""
|
||||
Worker receives the main task and generates actions, without the need for hierarchical planning
|
||||
Args:
|
||||
engine_params: Dict
|
||||
Parameters for the multimodal engine
|
||||
grounding_agent: ACI
|
||||
The grounding agent to use
|
||||
platform: str
|
||||
OS platform the agent runs on (darwin, linux, windows)
|
||||
max_trajectory_length: int
|
||||
The number of image turns to keep
|
||||
enable_reflection: bool
|
||||
Whether to enable reflection
|
||||
"""
|
||||
# super().__init__(engine_params, platform)
|
||||
|
||||
self.grounding_agent = grounding_agent
|
||||
self.max_trajectory_length = max_trajectory_length
|
||||
self.enable_reflection = enable_reflection
|
||||
self.use_thinking = engine_params.get("model", "") in [
|
||||
"claude-3-7-sonnet-20250219"
|
||||
]
|
||||
|
||||
self.generator_agent_config = AgentConfig(
|
||||
llm_provider=engine_params.get("engine_type", "openai"),
|
||||
llm_model_name=engine_params.get("model", "openai/o3"),
|
||||
llm_temperature=engine_params.get("temperature", 1.0),
|
||||
llm_base_url=engine_params.get("base_url", "https://openrouter.ai/api/v1"),
|
||||
llm_api_key=engine_params.get("api_key", ""),
|
||||
)
|
||||
|
||||
self.reset()
|
||||
|
||||
def reset(self):
|
||||
|
||||
self.generator_agent = Agent(
|
||||
name="generator_agent",
|
||||
conf=self.generator_agent_config,
|
||||
system_prompt=GENERATOR_SYS_PROMPT,
|
||||
resp_parse_func=reps_action_result
|
||||
)
|
||||
|
||||
self.reflection_agent = Agent(
|
||||
name="reflection_agent",
|
||||
conf=self.generator_agent_config,
|
||||
system_prompt=REFLECTION_SYS_PROMPT,
|
||||
resp_parse_func=reps_action_result
|
||||
)
|
||||
|
||||
self.turn_count = 0
|
||||
self.worker_history = []
|
||||
self.reflections = []
|
||||
self.cost_this_turn = 0
|
||||
self.screenshot_inputs = []
|
||||
|
||||
self.dummy_task = Task()
|
||||
self.dummy_context = Context()
|
||||
self.dummy_context.set_task(self.dummy_task)
|
||||
self.dummy_message = Message(headers={'context': self.dummy_context})
|
||||
|
||||
self.planning_model = get_llm_model(self.generator_agent_config)
|
||||
|
||||
self.first_done = False
|
||||
self.first_image = None
|
||||
|
||||
def generate_next_action(
|
||||
self,
|
||||
instruction: str,
|
||||
obs: Dict,
|
||||
) -> Tuple[Dict, List]:
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
"""
|
||||
agent = self.grounding_agent
|
||||
generator_message = (
|
||||
""
|
||||
if self.turn_count > 0
|
||||
else "The initial screen is provided. No action has been taken yet."
|
||||
)
|
||||
|
||||
# Load the task into the system prompt
|
||||
if self.turn_count == 0:
|
||||
self.generator_agent.system_prompt = self.generator_agent.system_prompt.replace(
|
||||
"TASK_DESCRIPTION", instruction)
|
||||
|
||||
# Get the per-step reflection
|
||||
reflection = None
|
||||
reflection_thoughts = None
|
||||
if self.enable_reflection:
|
||||
# Load the initial message
|
||||
if self.turn_count == 0:
|
||||
text_content = textwrap.dedent(
|
||||
f"""
|
||||
Task Description: {instruction}
|
||||
Current Trajectory below:
|
||||
"""
|
||||
)
|
||||
updated_sys_prompt = (
|
||||
self.reflection_agent.system_prompt + "\n" + text_content
|
||||
)
|
||||
self.reflection_agent.system_prompt = updated_sys_prompt
|
||||
|
||||
image_content = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"The initial screen is provided. No action has been taken yet."
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "data:image/png;base64," + encode_image(obs["screenshot"])
|
||||
}
|
||||
}
|
||||
]
|
||||
self.reflection_agent._init_context(context=self.dummy_context)
|
||||
|
||||
sync_exec(
|
||||
self.reflection_agent._add_human_input_to_memory,
|
||||
image_content,
|
||||
self.dummy_context,
|
||||
"message"
|
||||
)
|
||||
|
||||
# Load the latest action
|
||||
else:
|
||||
|
||||
image = "data:image/png;base64," + encode_image(obs["screenshot"])
|
||||
reflection_message = self.worker_history[-1] + "\n" + f"Here is the function execution result: {obs['action_response']}.\n"
|
||||
|
||||
reflection_observation = Observation(content=reflection_message, image=image)
|
||||
|
||||
self.reflection_agent._init_context(context=self.dummy_context)
|
||||
reflection_actions = self.reflection_agent.policy(reflection_observation, message=self.dummy_message)
|
||||
|
||||
reflection = reflection_actions[0].action_name
|
||||
reflection_thoughts = reflection_actions[0].policy_info
|
||||
|
||||
self.reflections.append(reflection)
|
||||
|
||||
generator_message += f"Here is your function execute result: {obs['action_response']}.\n"
|
||||
|
||||
generator_message += f"REFLECTION: You may use this reflection on the previous action and overall trajectory:\n{reflection}\n"
|
||||
logger.info("REFLECTION: %s", reflection)
|
||||
|
||||
if self.first_done:
|
||||
pass
|
||||
|
||||
else:
|
||||
# Add finalized message to conversation
|
||||
generator_message += f"\nCurrent Text Buffer = [{','.join(agent.notes)}]\n"
|
||||
|
||||
image = "data:image/png;base64," + encode_image(obs["screenshot"])
|
||||
generator_observation = Observation(content=generator_message, image=image)
|
||||
|
||||
self.generator_agent._init_context(context=self.dummy_context)
|
||||
generator_actions = self.generator_agent.policy(generator_observation, message=self.dummy_message)
|
||||
|
||||
plan = generator_actions[0].action_name
|
||||
plan_thoughts = generator_actions[0].policy_info
|
||||
|
||||
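# Keep only the most recent image messages (16 here) in each agent's memory;
# prune_image_messages is assumed to drop older screenshots to bound prompt size.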
prune_image_messages(self.generator_agent.memory.memory_store, 16)
|
||||
prune_image_messages(self.reflection_agent.memory.memory_store, 16)
|
||||
|
||||
self.worker_history.append(plan)
|
||||
|
||||
logger.info("FULL PLAN:\n %s", plan)
|
||||
|
||||
# self.generator_agent.add_message(plan, role="assistant")
|
||||
# Use the grounding agent to convert agent_action("desc") into agent_action([x, y])
|
||||
|
||||
try:
|
||||
agent.assign_coordinates(plan, obs)
|
||||
plan_code = parse_single_code_from_string(plan.split("Grounded Action")[-1])
|
||||
plan_code = sanitize_code(plan_code)
|
||||
plan_code = extract_first_agent_function(plan_code)
|
||||
exec_code = eval(plan_code)
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Error in parsing plan code: %s", e)
|
||||
plan_code = "agent.wait(1.0)"
|
||||
exec_code = eval(plan_code)
|
||||
|
||||
executor_info = {
|
||||
"full_plan": plan,
|
||||
"executor_plan": plan,
|
||||
"plan_thoughts": plan_thoughts,
|
||||
"plan_code": plan_code,
|
||||
"reflection": reflection,
|
||||
"reflection_thoughts": reflection_thoughts,
|
||||
}
|
||||
self.turn_count += 1
|
||||
|
||||
self.screenshot_inputs.append(obs["screenshot"])
|
||||
|
||||
return executor_info, [exec_code]
|
||||
1068
mm_agents/mano_agent.py
Normal file
1068
mm_agents/mano_agent.py
Normal file
File diff suppressed because it is too large
Load Diff
585
mm_agents/qwen3vl_agent.py
Normal file
585
mm_agents/qwen3vl_agent.py
Normal file
@@ -0,0 +1,585 @@
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import os
|
||||
from io import BytesIO
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
import backoff
|
||||
import openai
|
||||
from PIL import Image
|
||||
from requests.exceptions import SSLError
|
||||
from google.api_core.exceptions import (
|
||||
InvalidArgument,
|
||||
ResourceExhausted,
|
||||
InternalServerError,
|
||||
BadRequest,
|
||||
)
|
||||
from mm_agents.utils.qwen_vl_utils import smart_resize
|
||||
|
||||
|
||||
logger = None
|
||||
|
||||
MAX_RETRY_TIMES = 5
|
||||
|
||||
|
||||
def encode_image(image_content):
|
||||
return base64.b64encode(image_content).decode("utf-8")
|
||||
|
||||
|
||||
def process_image(image_bytes):
|
||||
"""
|
||||
Process an image for Qwen VL models (thinking variant).
|
||||
Uses a tighter resize cap consistent with the thinking DUN agent.
|
||||
"""
|
||||
image = Image.open(BytesIO(image_bytes))
|
||||
width, height = image.size
|
||||
|
||||
resized_height, resized_width = smart_resize(
|
||||
height=height,
|
||||
width=width,
|
||||
factor=32,
|
||||
max_pixels=16 * 16 * 4 * 1280,
|
||||
)
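# Note: 16 * 16 * 4 * 1280 = 1,310,720 pixels (~1.3 MP), the resize cap applied above.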
|
||||
|
||||
image = image.resize((resized_width, resized_height))
|
||||
|
||||
buffer = BytesIO()
|
||||
image.save(buffer, format="PNG")
|
||||
processed_bytes = buffer.getvalue()
|
||||
|
||||
return base64.b64encode(processed_bytes).decode("utf-8")
|
||||
|
||||
|
||||
class Qwen3VLAgent:
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
platform: str = "ubuntu",
|
||||
model: str = "qwen3-vl",
|
||||
max_tokens: int = 1500,
|
||||
top_p: float = 0.9,
|
||||
temperature: float = 0.0,
|
||||
action_space: str = "pyautogui",
|
||||
observation_type: str = "screenshot",
|
||||
history_n: int = 4,
|
||||
add_thought_prefix: bool = False,
|
||||
coordinate_type: str = "relative",
|
||||
):
|
||||
self.platform = platform
|
||||
self.model = model
|
||||
self.max_tokens = max_tokens
|
||||
self.top_p = top_p
|
||||
self.temperature = temperature
|
||||
self.action_space = action_space
|
||||
self.observation_type = observation_type
|
||||
self.history_n = history_n
|
||||
self.add_thought_prefix = add_thought_prefix
|
||||
self.coordinate_type = coordinate_type
|
||||
|
||||
assert action_space in ["pyautogui"], "Invalid action space"
|
||||
assert observation_type in ["screenshot"], "Invalid observation type"
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.responses = []
|
||||
self.screenshots = []
|
||||
|
||||
def predict(self, instruction: str, obs: Dict) -> List:
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
Returns (response, pyautogui_code).
|
||||
"""
|
||||
screenshot_bytes = obs["screenshot"]
|
||||
|
||||
image = Image.open(BytesIO(screenshot_bytes))
|
||||
width, height = image.size
|
||||
print(f"Original screen resolution: {width}x{height}")
|
||||
|
||||
processed_image = process_image(screenshot_bytes)
|
||||
processed_img = Image.open(
|
||||
BytesIO(base64.b64decode(processed_image))
|
||||
)
|
||||
processed_width, processed_height = processed_img.size
|
||||
print(
|
||||
"Processed image resolution: "
|
||||
f"{processed_width}x{processed_height}"
|
||||
)
|
||||
|
||||
self.screenshots.append(processed_image)
|
||||
|
||||
current_step = len(self.actions)
|
||||
history_start_idx = max(0, current_step - self.history_n)
|
||||
|
||||
previous_actions = []
|
||||
for i in range(history_start_idx):
|
||||
if i < len(self.actions):
|
||||
previous_actions.append(f"Step {i+1}: {self.actions[i]}")
|
||||
previous_actions_str = (
|
||||
"\n".join(previous_actions) if previous_actions else "None"
|
||||
)
|
||||
|
||||
description_prompt_lines = [
|
||||
"Use a mouse and keyboard to interact with a computer, and take screenshots.",
|
||||
"* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.",
|
||||
"* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.",
|
||||
(
|
||||
f"* The screen's resolution is {processed_width}x{processed_height}."
|
||||
if self.coordinate_type == "absolute"
|
||||
else "* The screen's resolution is 1000x1000."
|
||||
),
|
||||
"* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.",
|
||||
"* If you tried clicking on a program or link but it failed to load even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.",
|
||||
"* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.",
|
||||
]
|
||||
description_prompt = "\n".join(description_prompt_lines)
|
||||
|
||||
action_description_prompt = """
|
||||
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
|
||||
* `type`: Type a string of text on the keyboard.
|
||||
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
|
||||
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.
|
||||
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
|
||||
* `scroll`: Performs a scroll of the mouse scroll wheel.
|
||||
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
|
||||
* `wait`: Wait specified seconds for the change to happen.
|
||||
* `terminate`: Terminate the current task and report its completion status.
|
||||
* `answer`: Answer a question.
|
||||
"""
|
||||
|
||||
tools_def = {
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name_for_human": "computer_use",
|
||||
"name": "computer_use",
|
||||
"description": description_prompt,
|
||||
"parameters": {
|
||||
"properties": {
|
||||
"action": {
|
||||
"description": action_description_prompt,
|
||||
"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag",
|
||||
"right_click", "middle_click", "double_click", "scroll", "wait", "terminate"],
|
||||
"type": "string"
|
||||
},
|
||||
"keys": {"description": "Required only by `action=key`.", "type": "array"},
|
||||
"text": {"description": "Required only by `action=type`.", "type": "string"},
|
||||
"coordinate": {"description": "The x,y coordinates for mouse actions.", "type": "array"},
|
||||
"pixels": {"description": "The amount of scrolling.", "type": "number"},
|
||||
"time": {"description": "The seconds to wait.", "type": "number"},
|
||||
"status": {
|
||||
"description": "The status of the task.",
|
||||
"type": "string",
|
||||
"enum": ["success", "failure"]
|
||||
}
|
||||
},
|
||||
"required": ["action"],
|
||||
"type": "object"
|
||||
},
|
||||
"args_format": "Format the arguments as a JSON object."
|
||||
}
|
||||
}
|
||||
|
||||
system_prompt = """# Tools
|
||||
|
||||
You may call one or more functions to assist with the user query.
|
||||
|
||||
You are provided with function signatures within <tools></tools> XML tags:
|
||||
<tools>
|
||||
""" + json.dumps(tools_def) + """
|
||||
</tools>
|
||||
|
||||
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
||||
<tool_call>
|
||||
{"name": <function-name>, "arguments": <args-json-object>}
|
||||
</tool_call>
|
||||
|
||||
# Response format
|
||||
|
||||
Response format for every step:
|
||||
1) Action: a short imperative describing what to do in the UI.
|
||||
2) A single <tool_call>...</tool_call> block containing only the JSON: {"name": <function-name>, "arguments": <args-json-object>}.
|
||||
|
||||
Rules:
|
||||
- Output exactly in the order: Action, <tool_call>.
|
||||
- Be brief: one sentence for Action.
|
||||
- Do not output anything else outside those parts.
|
||||
- If finishing, use action=terminate in the tool call."""
|
||||
|
||||
instruction_prompt = f"""
|
||||
Please generate the next move according to the UI screenshot, instruction and previous actions.
|
||||
|
||||
Instruction: {instruction}
|
||||
|
||||
Previous actions:
|
||||
{previous_actions_str}"""
|
||||
|
||||
messages = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{"type": "text", "text": system_prompt},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
history_len = min(self.history_n, len(self.responses))
|
||||
if history_len > 0:
|
||||
history_responses = self.responses[-history_len:]
|
||||
history_screenshots = self.screenshots[-history_len - 1:-1]
|
||||
|
||||
for idx in range(history_len):
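# The first history turn pairs its screenshot with the instruction prompt;
# later turns carry only the screenshot, since the instruction is already in context.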
|
||||
if idx < len(history_screenshots):
|
||||
screenshot_b64 = history_screenshots[idx]
|
||||
if idx == 0:
|
||||
img_url = f"data:image/png;base64,{screenshot_b64}"
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": img_url},
|
||||
},
|
||||
{"type": "text", "text": instruction_prompt},
|
||||
],
|
||||
}
|
||||
)
|
||||
else:
|
||||
img_url = f"data:image/png;base64,{screenshot_b64}"
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": img_url},
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
messages.append(
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{"type": "text", "text": f"{history_responses[idx]}"},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
curr_img_url = f"data:image/png;base64,{processed_image}"
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": curr_img_url},
|
||||
}
|
||||
],
|
||||
}
|
||||
)
|
||||
else:
|
||||
curr_img_url = f"data:image/png;base64,{processed_image}"
|
||||
messages.append(
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": curr_img_url},
|
||||
},
|
||||
{"type": "text", "text": instruction_prompt},
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
# Debug: save messages before sending to model
|
||||
try:
|
||||
draft_dir = "./draft/message_cache"
|
||||
os.makedirs(draft_dir, exist_ok=True)
|
||||
message_file_path = os.path.join(
|
||||
draft_dir, f"messages_step_{current_step}.json"
|
||||
)
|
||||
with open(message_file_path, "w") as f:
|
||||
json.dump(messages, f)
|
||||
except Exception as _e: # do not fail prediction due to debug IO
|
||||
pass
|
||||
|
||||
response = self.call_llm(
|
||||
{
|
||||
"model": self.model,
|
||||
"messages": messages,
|
||||
"max_tokens": self.max_tokens,
|
||||
"top_p": self.top_p,
|
||||
"temperature": self.temperature,
|
||||
},
|
||||
self.model,
|
||||
)
|
||||
|
||||
logger.info(f"Qwen3VL Output: {response}")
|
||||
|
||||
self.responses.append(response)
|
||||
|
||||
low_level_instruction, pyautogui_code = self.parse_response(
|
||||
response,
|
||||
width,
|
||||
height,
|
||||
processed_width,
|
||||
processed_height,
|
||||
)
|
||||
|
||||
logger.info(f"Low level instruction: {low_level_instruction}")
|
||||
logger.info(f"Pyautogui code: {pyautogui_code}")
|
||||
|
||||
self.actions.append(low_level_instruction)
|
||||
|
||||
return response, pyautogui_code
|
||||
|
||||
def parse_response(
|
||||
self,
|
||||
response: str,
|
||||
original_width: int = None,
|
||||
original_height: int = None,
|
||||
processed_width: int = None,
|
||||
processed_height: int = None,
|
||||
) -> Tuple[str, List[str]]:
|
||||
"""
|
||||
Parse LLM response and convert it to low level action and pyautogui code.
|
||||
"""
|
||||
low_level_instruction = ""
|
||||
pyautogui_code: List[str] = []
|
||||
|
||||
if response is None or not response.strip():
|
||||
return low_level_instruction, pyautogui_code
|
||||
|
||||
def adjust_coordinates(x: float, y: float) -> Tuple[int, int]:
|
||||
if not (original_width and original_height):
|
||||
return int(x), int(y)
|
||||
if self.coordinate_type == "absolute":
|
||||
# scale from processed pixels to original
|
||||
if processed_width and processed_height:
|
||||
x_scale = original_width / processed_width
|
||||
y_scale = original_height / processed_height
|
||||
return int(x * x_scale), int(y * y_scale)
|
||||
return int(x), int(y)
|
||||
# relative: scale from 0..999 grid
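# e.g. with a 1920x1080 screen, a model output of (500, 500) on the 0..999 grid
# maps to roughly (960, 540) after scaling (illustrative values).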
|
||||
x_scale = original_width / 999
|
||||
y_scale = original_height / 999
|
||||
return int(x * x_scale), int(y * y_scale)
|
||||
|
||||
def process_tool_call(json_str: str) -> None:
|
||||
try:
|
||||
tool_call = json.loads(json_str)
|
||||
if tool_call.get("name") == "computer_use":
|
||||
args = tool_call["arguments"]
|
||||
action = args["action"]
|
||||
|
||||
if action == "left_click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(f"pyautogui.click({adj_x}, {adj_y})")
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.click()")
|
||||
|
||||
elif action == "right_click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.rightClick({adj_x}, {adj_y})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.rightClick()")
|
||||
|
||||
elif action == "middle_click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.middleClick({adj_x}, {adj_y})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.middleClick()")
|
||||
|
||||
elif action == "double_click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.doubleClick({adj_x}, {adj_y})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.doubleClick()")
|
||||
|
||||
elif action == "type":
|
||||
text = args.get("text", "")
|
||||
pyautogui_code.append(f"pyautogui.typewrite('{text}')")
|
||||
|
||||
elif action == "key":
|
||||
keys = args.get("keys", [])
|
||||
if isinstance(keys, list):
|
||||
cleaned_keys = []
|
||||
for key in keys:
|
||||
if isinstance(key, str):
|
||||
if key.startswith("keys=["):
|
||||
key = key[6:]
|
||||
if key.endswith("]"):
|
||||
key = key[:-1]
|
||||
if key.startswith("['") or key.startswith('["'):
|
||||
key = key[2:] if len(key) > 2 else key
|
||||
if key.endswith("']") or key.endswith('"]'):
|
||||
key = key[:-2] if len(key) > 2 else key
|
||||
key = key.strip()
|
||||
cleaned_keys.append(key)
|
||||
else:
|
||||
cleaned_keys.append(key)
|
||||
keys = cleaned_keys
|
||||
|
||||
keys_str = ", ".join([f"'{key}'" for key in keys])
|
||||
if len(keys) > 1:
|
||||
pyautogui_code.append(f"pyautogui.hotkey({keys_str})")
|
||||
else:
|
||||
pyautogui_code.append(f"pyautogui.press({keys_str})")
|
||||
|
||||
elif action == "scroll":
|
||||
pixels = args.get("pixels", 0)
|
||||
pyautogui_code.append(f"pyautogui.scroll({pixels})")
|
||||
|
||||
elif action == "wait":
|
||||
pyautogui_code.append("WAIT")
|
||||
|
||||
elif action == "terminate":
|
||||
pyautogui_code.append("DONE")
|
||||
|
||||
elif action == "mouse_move":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.moveTo({adj_x}, {adj_y})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.moveTo(0, 0)")
|
||||
|
||||
elif action == "left_click_drag":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
duration = args.get("duration", 0.5)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.dragTo({adj_x}, {adj_y}, duration={duration})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.dragTo(0, 0)")
|
||||
except (json.JSONDecodeError, KeyError) as e:
|
||||
logger.error(f"Failed to parse tool call: {e}")
|
||||
|
||||
lines = response.split("\n")
|
||||
inside_tool_call = False
|
||||
current_tool_call: List[str] = []
|
||||
|
||||
for line in lines:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
|
||||
if line.lower().startswith("action:"):
|
||||
if not low_level_instruction:
|
||||
low_level_instruction = line.split(":", 1)[-1].strip()
|
||||
continue
|
||||
|
||||
if line.startswith("<tool_call>"):
|
||||
inside_tool_call = True
|
||||
continue
|
||||
elif line.startswith("</tool_call>"):
|
||||
if current_tool_call:
|
||||
process_tool_call("\n".join(current_tool_call))
|
||||
current_tool_call = []
|
||||
inside_tool_call = False
|
||||
continue
|
||||
|
||||
if inside_tool_call:
|
||||
current_tool_call.append(line)
|
||||
continue
|
||||
|
||||
if line.startswith("{") and line.endswith("}"):
|
||||
try:
|
||||
json_obj = json.loads(line)
|
||||
if "name" in json_obj and "arguments" in json_obj:
|
||||
process_tool_call(line)
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
if current_tool_call:
|
||||
process_tool_call("\n".join(current_tool_call))
|
||||
|
||||
if not low_level_instruction and pyautogui_code and "." in pyautogui_code[0]:
|
||||
action_type = pyautogui_code[0].split(".", 1)[1].split("(", 1)[0]
|
||||
low_level_instruction = f"Performing {action_type} action"
|
||||
|
||||
return low_level_instruction, pyautogui_code
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.constant,
|
||||
(
|
||||
SSLError,
|
||||
openai.RateLimitError,
|
||||
openai.BadRequestError,
|
||||
openai.InternalServerError,
|
||||
InvalidArgument,
|
||||
ResourceExhausted,
|
||||
InternalServerError,
|
||||
BadRequest,
|
||||
),
|
||||
interval=30,
|
||||
max_tries=5,
|
||||
)
|
||||
def call_llm(self, payload, model):
|
||||
messages = payload["messages"]
|
||||
|
||||
base_url = "https://poc-dashscope.aliyuncs.com/compatible-mode/v1"
|
||||
api_key = "sk-123"
|
||||
client = openai.OpenAI(base_url=base_url, api_key=api_key)
|
||||
|
||||
for _ in range(MAX_RETRY_TIMES):
|
||||
logger.info("Generating content with Qwen model: %s", model)
|
||||
try:
|
||||
response = client.chat.completions.create(
|
||||
model=model,
|
||||
messages=messages,
|
||||
max_tokens=self.max_tokens,
|
||||
temperature=self.temperature,
|
||||
top_p=self.top_p,
|
||||
)
|
||||
return response.choices[0].message.content
|
||||
except Exception as e:
|
||||
logger.error(f"Error calling Qwen model: {e}")
|
||||
time.sleep(5)
|
||||
continue
|
||||
return ""
|
||||
|
||||
def reset(self, _logger=None):
|
||||
global logger
|
||||
logger = (
|
||||
_logger if _logger is not None
|
||||
else logging.getLogger("desktopenv.qwen3vl_agent")
|
||||
)
|
||||
|
||||
self.thoughts = []
|
||||
self.action_descriptions = []
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.responses = []
|
||||
self.screenshots = []
|
||||
|
||||
|
||||
71
mm_agents/uipath/README.md
Normal file
71
mm_agents/uipath/README.md
Normal file
@@ -0,0 +1,71 @@
|
||||
# UiPath Screen Agent
|
||||
|
||||
We propose a simple yet effective implementation of a Computer Use Agent, which achieves **53.6%** on the **OSWorld** benchmark with 50 steps, demonstrating competitive results with a relatively lightweight setup and UI-only actions.
|
||||
|
||||
Our system builds upon recent approaches in agentic computer use and follows the literature in adopting a two-stage architecture that separates high-level reasoning from low-level execution. Specifically, the system is composed of:
|
||||
|
||||
- **Action Planner (GPT-5)**: Responsible for generating high-level action sequences, reasoning about task goals and observing modifications in the environment.
|
||||
- **Grounder (UI-TARS 1.5 + Internal UI Predictor)**: This component translates abstract plans into concrete interactions with the user interface. UI-TARS 1.5 serves as the grounding mechanism, mapping planned actions to locations on screen, while the Internal UI Predictor helps resolve ambiguous predictions, increasing robustness and the likelihood that predictions fall within UI elements.
|
||||
|
||||

|
||||
|
||||
## Run
|
||||
```
|
||||
python run_multienv_uipath.py \
|
||||
--provider_name docker \
|
||||
--observation_type screenshot \
|
||||
--model uipath_gpt_5 \
|
||||
--sleep_after_execution 3 \
|
||||
--max_steps 50 \
|
||||
--num_envs 10 \
|
||||
--action_space computer_13 \
|
||||
--client_password password \
|
||||
--test_all_meta_path evaluation_examples/test_all.json \
|
||||
--uipath_model_name gpt-5-2025-08-07
|
||||
```
|
||||
|
||||
## Action Planner
|
||||
The Action Planner receives the current screenshot, a task description, and a history of previous steps - including past screenshots, observations, internal reasoning, and predicted actions. Its role is to plan the next steps toward achieving the task goal, anticipate changes in the environment, and determine the next action, providing clear reasoning for each decision.
|
||||
|
||||
The interaction history is structured as a conversation: the user turns report the task and the executed actions, supply recent screenshots (up to the last two), and note the observed outcomes of the agent's previous predictions, while the assistant turns consist of the agent's previously predicted responses. We adopt this conversational format because it mirrors the dialogue-style data the model was originally trained on, making the setup both natural and effective.
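For illustration, a single planning request roughly takes the following shape (contents abbreviated; the exact construction is in `build_messages_chat` in `mm_agents/uipath/action_planner.py`):

```
messages = [
    {"role": "system", "content": "<planner system prompt>"},
    {"role": "user", "content": "<task description + current date>"},
    # for each previous step (screenshots attached only for the last two):
    {"role": "user", "content": "<step screenshot>"},
    {"role": "assistant", "content": "<planner response for that step>"},
    {"role": "user", "content": "<performed actions + observation>"},
    # finally:
    {"role": "user", "content": "<current screenshot + task reminder + output format>"},
]
```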
|
||||
|
||||
By combining the current state with this structured history, the Action Planner generates context-aware, informed predictions at every step: it can reconstruct the sequence of actions that led to the current point, notice failures, and plan the subsequent steps.
|
||||
|
||||
We support a concise set of actions for interacting with the environment, focusing specifically on UI-related activities:
|
||||
- Click (left, right, double click)
|
||||
- Type
|
||||
- Scroll
|
||||
- Drag
|
||||
- Mouse move
|
||||
- Key press
|
||||
- Extract data – a pseudo-action used to capture information for later steps
|
||||
- Finish
|
||||
|
||||
To help the planner model understand how to apply each action effectively, we provide few-shot examples.
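For instance, a typed-input action produced by the planner looks like the following (values illustrative):

```
{
    "type": "type",
    "description": "Type 'John' in the first name input box.",
    "parameters": {"text": "John"}
}
```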
|
||||
|
||||
We intentionally exclude more complex actions to isolate and evaluate the capabilities of a UI-focused agent, since certain advanced actions may not be applicable across all applications.
|
||||
|
||||
## Grounder
|
||||
The Grounder maps the action to a specific point on the screen when needed (for actions such as click, scroll, and drag). It receives the screenshot, the description of the action to be performed, and the action type, and returns a pair of integers representing the screen coordinates.
|
||||
|
||||
We use the `UI-TARS-1.5` model, which has strong screen knowledge and grounding capabilities. However, to gain more confidence in the predicted coordinates, we employ a crop-and-refine method using an internal UI element predictor.
|
||||
|
||||
### Crop and refine
|
||||
We wrap the prediction of the grounding model with our internal UI element predictor. The goal of this step is not to guarantee that the prediction will always fall within an identified element, but to increase the likelihood of alignment and to give the model an opportunity to refine its output.
|
||||
|
||||
The UI element predictor consists of a shared feature extractor backbone and multiple prediction towers for:
|
||||
- identifying UI elements or controls such as icons, input boxes, checkboxes, buttons, radio buttons
|
||||
- tables and cells
|
||||
- a few other tasks not used in our approach, but employed in other use cases and included in training to improve the feature extractor's performance
|
||||
|
||||

|
||||
|
||||
In most interfaces, actions are expected to interact directly with UI elements: buttons, fields, icons, or menus. When a prediction lands outside any element, this often signals a potential misprediction. While there are legitimate cases where a click outside elements makes sense (e.g., dismissing a modal, dragging to select text, or changing window focus), they are exceptions rather than the rule. By treating these situations as possible errors, we can provide the model with a structured way to reconsider its output.
|
||||
|
||||
Our approach is to give the model a “second shot” when its initial prediction falls outside any identified element. We do this by cropping around the initial prediction and running the grounder again. The crop is centered on the original coordinates and includes nearby UI elements. This retry doesn’t guarantee correctness, but it gives the model a chance to adjust and potentially recover from mistakes.
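A minimal sketch of this retry logic, assuming hypothetical `locate`, `detect`, and `crop_around` helpers (the production path goes through `mm_agents/uipath/grounder_client.py`):

```
def ground_with_refine(grounder, ui_predictor, screenshot, action_description):
    # First attempt on the full screenshot.
    x, y = grounder.locate(screenshot, action_description)
    elements = ui_predictor.detect(screenshot)        # bounding boxes of UI elements
    if any(box.contains(x, y) for box in elements):
        return x, y                                   # already falls on an element
    # Second shot: crop around the first prediction, keeping nearby elements,
    # and let the grounder refine its answer on the smaller image.
    crop, offset = crop_around(screenshot, (x, y), elements)
    cx, cy = grounder.locate(crop, action_description)
    return cx + offset[0], cy + offset[1]             # map back to full-image coordinates
```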
|
||||
|
||||
This process gives the model multiple opportunities to predict within a relevant zone of the interface, reducing the overall number of mispredictions. In our experiments, the grounding model placed predictions outside any UI element about 11% of the time. After applying our refinement step, the second prediction was always different from the original, demonstrating that the model does reconsider and “changes its mind” when given this guided feedback.
|
||||
|
||||
## Conclusion
|
||||
Our method offers a clean, simple, yet competitive pipeline for Computer Use tasks. It is cost-effective, minimizing token usage during planning, avoiding parallel planning and reliance on numerous past images, and incorporating only **direct UI actions** with refined grounding to improve accuracy. With this approach, we achieve **53.6%** accuracy on OSWorld with a 50-step horizon.
|
||||
|
||||
288
mm_agents/uipath/action_planner.py
Normal file
288
mm_agents/uipath/action_planner.py
Normal file
@@ -0,0 +1,288 @@
|
||||
import datetime
|
||||
import json
|
||||
from collections import OrderedDict
|
||||
import time
|
||||
import mm_agents.uipath.llm_client as llm_client
|
||||
from mm_agents.uipath.types_utils import (
|
||||
PlanAction,
|
||||
ExecutionState,
|
||||
State,
|
||||
PlanActionType,
|
||||
)
|
||||
from mm_agents.uipath.action_planner_prompt_builder import (
|
||||
ComputerUseAgentInterface,
|
||||
PlanerCoTSections,
|
||||
user_command_template,
|
||||
user_task_info_template,
|
||||
PlannerOutput,
|
||||
)
|
||||
from mm_agents.uipath.utils import ValidationException, parse_message_json
|
||||
|
||||
|
||||
class ActionPlanner(object):
|
||||
def __init__(self):
|
||||
self.number_history_steps_with_images = 2
|
||||
self.computer_use_agent_interface = ComputerUseAgentInterface()
|
||||
|
||||
def build_message_output_format_info(self) -> str:
|
||||
output_dict = OrderedDict({})
|
||||
for _, value in PlanerCoTSections.items():
|
||||
display = value["display"]
|
||||
description = value["description"]
|
||||
output_dict[display] = description
|
||||
|
||||
output_dict["action"] = (
|
||||
"<The action to perform in JSON format as specified in the system message>"
|
||||
)
|
||||
|
||||
return json.dumps(output_dict, indent=4, ensure_ascii=False)
|
||||
|
||||
def get_step_content(
|
||||
self, step: dict, following_step: dict | None
|
||||
) -> tuple[str, str]:
|
||||
content_dict = OrderedDict({})
|
||||
observation_dict = OrderedDict({})
|
||||
|
||||
observation_dict["Performed actions"] = step["actions"]
|
||||
|
||||
if (
|
||||
"extracted_data" in step["additional_parameters"]
|
||||
): # if the step was an extraction step add the dummy extraction action
|
||||
extraction_action = {
|
||||
"type": PlanActionType.ExtractData,
|
||||
"description": step["description"],
|
||||
"status": "data extracted",
|
||||
}
|
||||
observation_dict["Performed actions"] = [extraction_action]
|
||||
|
||||
if following_step:
|
||||
observation_dict["Observation"] = following_step[
|
||||
"additional_parameters"
|
||||
].get("review", None)
|
||||
|
||||
for key, value in PlanerCoTSections.items():
|
||||
if key != "review":
|
||||
param_value = step["additional_parameters"].get(key, None)
|
||||
display_name = value["display"]
|
||||
content_dict[display_name] = param_value
|
||||
content_dict["actions"] = json.loads(
|
||||
step["additional_parameters"]["plan_action"]
|
||||
)
|
||||
|
||||
content_dict = json.dumps(content_dict, indent=4, ensure_ascii=False)
|
||||
observation_dict = json.dumps(observation_dict, indent=4, ensure_ascii=False)
|
||||
return content_dict, observation_dict
|
||||
|
||||
def build_messages_chat(self, state: State, execution_info: dict) -> list[dict]:
|
||||
messages = []
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": self.computer_use_agent_interface.get_system_prompt(),
|
||||
}
|
||||
|
||||
messages.append(system_message)
|
||||
|
||||
user_task_info_message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": user_task_info_template.format(
|
||||
task=state.task,
|
||||
current_date=datetime.datetime.now().strftime("%Y-%m-%d"),
|
||||
),
|
||||
}
|
||||
],
|
||||
}
|
||||
|
||||
messages.append(user_task_info_message)
|
||||
|
||||
start_index = max(
|
||||
0, len(state.previous_steps) - self.number_history_steps_with_images
|
||||
)
|
||||
end_index = len(state.previous_steps)
|
||||
|
||||
for index in range(0, end_index):
|
||||
step = state.previous_steps[index]
|
||||
|
||||
if index >= start_index:
|
||||
assert step["image"] is not None and len(step["image"]) > 0, (
|
||||
"Step image is empty"
|
||||
)
|
||||
user_image_message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{step['image']}"
|
||||
},
|
||||
},
|
||||
],
|
||||
}
|
||||
messages.append(user_image_message)
|
||||
|
||||
assistant_message_text, user_observation = self.get_step_content(
|
||||
step, state.previous_steps[index + 1] if index < end_index - 1 else None
|
||||
)
|
||||
|
||||
assistant_message = {
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": assistant_message_text,
|
||||
},
|
||||
],
|
||||
}
|
||||
messages.append(assistant_message)
|
||||
|
||||
user_message_reply = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": user_observation,
|
||||
},
|
||||
],
|
||||
}
|
||||
messages.append(user_message_reply)
|
||||
|
||||
last_user_message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "Current screenshot:",
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": f"data:image/jpeg;base64,{state.image_base64}"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "text",
|
||||
"text": user_command_template.format(
|
||||
task=state.task,
|
||||
execution_info_message=self.build_execution_info_message(
|
||||
execution_info
|
||||
),
|
||||
json_output_format=self.build_message_output_format_info(),
|
||||
),
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
messages.append(last_user_message)
|
||||
return messages
|
||||
|
||||
def extract_response(
|
||||
self, response_content: str
|
||||
) -> tuple[PlanAction, dict[str, str]]:
|
||||
cot_sections_lst = list(PlanerCoTSections.keys())
|
||||
|
||||
additional_sections = OrderedDict({})
|
||||
response_json = parse_message_json(response_content)
|
||||
|
||||
for section in cot_sections_lst:
|
||||
section_display = PlanerCoTSections[section]["display"]
|
||||
if section_display not in response_json:
|
||||
raise ValidationException(
|
||||
f"Invalid response format, '{section}' key not found: {response_content}"
|
||||
)
|
||||
additional_sections[section] = response_json.get(
|
||||
PlanerCoTSections[section]["display"]
|
||||
)
|
||||
|
||||
if "action" not in response_json:
|
||||
raise ValidationException(
|
||||
f"Invalid response format, 'action' key not found: {response_content}"
|
||||
)
|
||||
|
||||
action_dict = response_json["action"]
|
||||
|
||||
plan_action = PlanAction.from_dict(self.correct_action_type(action_dict))
|
||||
|
||||
if plan_action.action_type == PlanActionType.Drag:
|
||||
self.computer_use_agent_interface.validate_action(plan_action)
|
||||
|
||||
return plan_action, additional_sections
|
||||
|
||||
def build_execution_info_message(self, execution_info: dict) -> str:
|
||||
execution_info_message = ""
|
||||
if "planner_action_review" in execution_info:
|
||||
action_description = execution_info["planner_action_review"][
|
||||
"action_description"
|
||||
]
|
||||
error_message = execution_info["planner_action_review"]["error_message"]
|
||||
|
||||
execution_info_message = f"You predicted this action: '{action_description}' but it is not valid because: {error_message}. If the target element is not visible on the screenshot, scroll first to make the target element visible. If the target element is not correct, change the action description with more precise element description using nearby context."
|
||||
return execution_info_message
|
||||
|
||||
def correct_action_type(self, response_json: dict) -> dict:
|
||||
action_type = response_json.get("type", "").lower()
|
||||
if action_type in ("press", "key_press", "press_key"):
|
||||
response_json["type"] = "key_press"
|
||||
elif action_type in ("mouse_move", "move_mouse"):
|
||||
response_json["type"] = "move_mouse"
|
||||
elif action_type in ("type_text", "type_into", "type"):
|
||||
response_json["type"] = "type"
|
||||
elif "scroll" in action_type:
|
||||
response_json["type"] = "scroll"
|
||||
elif "wait" in action_type:
|
||||
response_json["type"] = "wait"
|
||||
return response_json
|
||||
|
||||
def predict(self, state: State, execution_state: ExecutionState) -> PlannerOutput:
|
||||
messages = self.build_messages_chat(state, execution_state.execution_info)
|
||||
llm_messages = [message for message in messages]
|
||||
repeat_count = 2
|
||||
plan, response_content = None, None
|
||||
while repeat_count > 0:
|
||||
try:
|
||||
payload = {
|
||||
"model": execution_state.model_name,
|
||||
"messages": llm_messages,
|
||||
"max_completion_tokens": 5000,
|
||||
"reasoning_effort": "medium",
|
||||
}
|
||||
response_content = llm_client.send_messages(payload)
|
||||
if response_content is None or len(response_content.strip()) == 0:
|
||||
raise ValidationException("Planner response is None or empty")
|
||||
plan_action, additional_sections = self.extract_response(
|
||||
str(response_content)
|
||||
)
|
||||
plan = PlannerOutput(plan_action, additional_sections)
|
||||
break
|
||||
except ValidationException as e:
|
||||
time.sleep(5)
|
||||
repeat_count -= 1
|
||||
ai_message = {
|
||||
"role": "assistant",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": response_content,
|
||||
},
|
||||
],
|
||||
}
|
||||
error_message = {
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"{e.message}. Please try again and output a valid response in the correct format.",
|
||||
},
|
||||
],
|
||||
}
|
||||
|
||||
llm_messages = messages + [ai_message, error_message]
|
||||
|
||||
if repeat_count == 0:
|
||||
raise ValueError(
|
||||
f"Invalid planner response format: {response_content}, {str(e)}"
|
||||
)
|
||||
if plan is None:
|
||||
raise ValueError("Planner response is not valid")
|
||||
return plan
|
||||
390
mm_agents/uipath/action_planner_prompt_builder.py
Normal file
390
mm_agents/uipath/action_planner_prompt_builder.py
Normal file
@@ -0,0 +1,390 @@
|
||||
from collections import OrderedDict
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Any, Dict, List, Optional
|
||||
from mm_agents.uipath.types_utils import PlanAction, key_maps
|
||||
from mm_agents.uipath.utils import ValidationException
|
||||
|
||||
system_template = """You are a computer use agent that perform computer-related tasks.
|
||||
You will be given a task, a current screenshot, and a list of previous actions. You need to predict the next action.
|
||||
|
||||
## Available Actions:
|
||||
{available_actions}
|
||||
|
||||
In addition there are some special actions that are not part of the main UI actions:
|
||||
{special_actions}
|
||||
|
||||
Each action has a description and parameters. The action description is a single sentence which mentions the action and the control element to interact with.
|
||||
This description will be used by the executor agent to locate the coordinates of the action's target element on the screen, so describe the element targeted by the action in as much detail as possible.
|
||||
Particularly for icons, you can describe their position, text on it, color, nearby elements etc...
|
||||
Example of some action descriptions with more detailed information to help the executor agent locate the element:
|
||||
- "Click on the Calendar icon with the text 'Thu 28'"
|
||||
- "Click the 'Search' button on the top right corner next to the login button."
|
||||
- "Click the 'First Name' input box from the UserInfo section to focus it before typing."
|
||||
|
||||
Your action response must be a valid JSON with the following format:
|
||||
{{
|
||||
"type": str # one of the valid action types
|
||||
"description": # action description
|
||||
"parameters": # optional, action parameters dictionary
|
||||
}}
|
||||
|
||||
## Action examples: example of valid actions:
|
||||
{examples}
|
||||
|
||||
## Important Notes:
|
||||
- Close any cookies, ads, login or registration etc pop-ups if not needed.
|
||||
- Before typing, ensure the input box is focused by clicking on it.
|
||||
"""
|
||||
|
||||
user_command_template = """Recall Task Again: {task}
|
||||
Check if the task is finished. If not provide the next action to perform.
|
||||
Remember:
|
||||
- Perform the task on provided application(s) or website(s). You are not allowed to use the browser "address bar".
|
||||
- Close any cookies, ads, login or registration etc pop-ups if not needed.
|
||||
- Only one action at a time (never "click and type", "click and drag", "type and press", "press shift and click", etc..). Think of how to combine them in two consecutive actions obtaining the intended result or use an available action that can obtain it.
|
||||
- For any opening input combobox, dropdown menu options, you must select an option or press Enter key to select default one.
|
||||
- Click on the input box to ensure it is focused before typing. Otherwise, the input box will not accept the text.
|
||||
- Once focusing on an input box, if it has a default pre-typed value (not placeholder which is usually grayed-out), remove the existing value first by clicking on "X" icon or using "Ctrl A" + "Backspace" or "Backspace" if the value is already selected.
|
||||
- For search input, if no search button or suggestions popup after typing, press 'Enter' to trigger search.
|
||||
- Retry the drag action on slider control if needed to refine the slider values closer to expected values.
|
||||
- Scroll / Pageup / Pagedown to explore or extract more content/data if needed (prefer 'key_press' action with key 'Pageup', 'Pagedown' for faster scrolling), particularly when extracting data from a table with hidden rows or columns.
|
||||
- Scroll action must have a 'direction' parameter. Finish action must have a 'status' parameter.
|
||||
- If you modify some settings remember to save/apply them. If button is not visible try to scroll for it.
|
||||
|
||||
Most importantly, never type or click on element not visible on screenshot. Use scroll or pageup/pagedown to make the element visible first.
|
||||
|
||||
{execution_info_message}
|
||||
Answer in json format:
|
||||
{json_output_format}
|
||||
"""
|
||||
|
||||
PlanerCoTSections = OrderedDict(
|
||||
{
|
||||
"review": {
|
||||
"display": "previous_action_result",
|
||||
"description": "Briefly describe the previous action result and UI change on the screenshot to see if is correctly performed.",
|
||||
},
|
||||
"thought": {
|
||||
"display": "thought",
|
||||
"description": "Reason briefly about the next action to perform if the task is not finished.",
|
||||
},
|
||||
"action_description": {
|
||||
"display": "action_description",
|
||||
"description": "Describe the action to perform in a single sentence. The description must be precise and not rely on specific information in the current screen.",
|
||||
},
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
### for chat conversation
|
||||
user_task_info_template = """## Task Information:
|
||||
The current date is (YYYY-MM-DD): {current_date}
|
||||
Task: {task}
|
||||
"""
|
||||
|
||||
|
||||
@dataclass
|
||||
class ActionDefinition:
|
||||
type: str
|
||||
description: str
|
||||
parameters: Optional[Dict[str, str]] = None
|
||||
examples: List[Dict[str, Any]] = field(default_factory=list)
|
||||
|
||||
|
||||
class PlannerOutput(object):
|
||||
def __init__(self, plan_action: PlanAction, additional_sections: dict[str, str]):
|
||||
self.plan_action = plan_action
|
||||
self.thought = additional_sections["thought"]
|
||||
self.review = additional_sections["review"]
|
||||
self.additional_sections = {
|
||||
key: value
|
||||
for key, value in additional_sections.items()
|
||||
if key not in ["review", "thought"]
|
||||
}
|
||||
|
||||
|
||||
class ComputerUseAgentInterface:
|
||||
def __init__(self):
|
||||
self.ui_actions = {}
|
||||
self.special_actions = {}
|
||||
self._setup_default_actions()
|
||||
|
||||
def _setup_default_actions(self):
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="click",
|
||||
description="Click on a UI element",
|
||||
examples=[
|
||||
{"type": "click", "description": "Click the 'Next' button."},
|
||||
{
|
||||
"type": "click",
|
||||
"description": "Click the 'X' icon in the input box",
|
||||
},
|
||||
{
|
||||
"type": "click",
|
||||
"description": "Click the first name input box to focus on it.",
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="right_click",
|
||||
description="Right click on a UI element",
|
||||
examples=[
|
||||
{
|
||||
"type": "right_click",
|
||||
"description": "Right click on the first row from the patient table to open the context menu.",
|
||||
}
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="double_click",
|
||||
description="Double click on a UI element",
|
||||
examples=[
|
||||
{
|
||||
"type": "double_click",
|
||||
"description": "Double click word app icon to open the application.",
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="type",
|
||||
description="Type text into a focused input field. Ensure the input box is focused before typing. To focus the input box, you may need to click on it first.",
|
||||
parameters={"text": "str - the text to be typed"},
|
||||
examples=[
|
||||
{
|
||||
"type": "type",
|
||||
"description": "Type 'John' in the first name input box.",
|
||||
"parameters": {"text": "John"},
|
||||
},
|
||||
{
|
||||
"type": "type",
|
||||
"description": "Type 'Doe' in the last name input box.",
|
||||
"parameters": {"text": "Doe"},
|
||||
},
|
||||
{
|
||||
"type": "type",
|
||||
"description": "Type 'Hello, world!' in the text area.",
|
||||
"parameters": {"text": "Hello, world!"},
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="scroll",
|
||||
description="Scroll an UI element in a specified direction",
|
||||
parameters={
|
||||
"direction": "str - 'up', 'down', 'left', or 'right'",
|
||||
"distance": "int - the number of scroll steps (wheel “clicks”) to send.",
|
||||
},
|
||||
examples=[
|
||||
{
|
||||
"type": "scroll",
|
||||
"description": "Scroll down to see more content.",
|
||||
"parameters": {"direction": "down"},
|
||||
},
|
||||
{
|
||||
"type": "scroll",
|
||||
"description": "Scroll up to the top of the page.",
|
||||
"parameters": {"direction": "up"},
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="drag",
|
||||
description="Drag an element or the mouse (with left click on) from one location to another. You must specify both start_description and end_description.",
|
||||
parameters={
|
||||
"start_description": "description of the location to start dragging",
|
||||
"end_description": "description of the location to drag to",
|
||||
},
|
||||
examples=[
|
||||
{
|
||||
"type": "drag",
|
||||
"description": "Drag the response.txt file to the responses folder",
|
||||
"start_description": "Click the response.txt file",
|
||||
"end_description": "Click the responses folder",
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="mouse_move",
|
||||
description="Move the mouse to a specific element",
|
||||
examples=[
|
||||
{
|
||||
"type": "mouse_move",
|
||||
"description": "Move the mouse to the 'Submit' button.",
|
||||
},
|
||||
{
|
||||
"type": "mouse_move",
|
||||
"description": "Hover over the 'Settings' icon.",
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_action(
|
||||
ActionDefinition(
|
||||
type="key_press",
|
||||
description="Press a specific key on the keyboard",
|
||||
parameters={
|
||||
"key": f'str # the key or key combination (separated by space) to be pressed. Example of key combination "Ctrl A", "Shift Tab", "Ctrl C" etc. "<Key> + Click" is not a valid combination, use two separate actions. Beside normal keys like letters, numerics, punctuations etc.. here are special key list: {key_maps.keys()}.'
|
||||
},
|
||||
examples=[
|
||||
{
|
||||
"type": "key_press",
|
||||
"description": "Press 'Ctrl A' to select all text.",
|
||||
"parameters": {"key": "Ctrl A"},
|
||||
},
|
||||
{
|
||||
"type": "key_press",
|
||||
"description": "Press Pagedown key.",
|
||||
"parameters": {"key": "Pagedown"},
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_special_action(
|
||||
ActionDefinition(
|
||||
type="extract_data",
|
||||
description="Use to extract some data from the screen for the task. This data will be stored in memory and used in the next actions or returned in the final result.",
|
||||
parameters={
|
||||
"description": "str - short description of the data to be extracted",
|
||||
"data": "str|json - the data to be extracted",
|
||||
},
|
||||
examples=[
|
||||
{
|
||||
"type": "extract_data",
|
||||
"description": "Extract the product name and price from the screen.",
|
||||
"parameters": {
|
||||
"description": "Available product name and price",
|
||||
"data": "Product Name: iPhone 14, Price: $999",
|
||||
},
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
self.add_special_action(
|
||||
ActionDefinition(
|
||||
type="finish",
|
||||
description=" Use it to finish the task with success or failure status. When you think the task was finished return success, while when you think can not be done, return failure, don't easily say failure, try your best to do the task.",
|
||||
parameters={"status": "str - 'success' or 'failure'"},
|
||||
examples=[
|
||||
{
|
||||
"type": "finish",
|
||||
"description": "Task completed successfully.",
|
||||
"parameters": {"status": "success"},
|
||||
},
|
||||
],
|
||||
)
|
||||
)
|
||||
|
||||
def add_action(self, action: ActionDefinition):
|
||||
self.ui_actions[action.type] = action
|
||||
|
||||
def add_special_action(self, action: ActionDefinition):
|
||||
self.special_actions[action.type] = action
|
||||
|
||||
def get_action_definition(self, action_type: str) -> Optional[ActionDefinition]:
|
||||
return self.ui_actions.get(action_type) or self.special_actions.get(action_type)
|
||||
|
||||
def validate_action(self, action: PlanAction):
|
||||
action_definition = self.get_action_definition(action.action_type)
|
||||
if action_definition is None:
|
||||
raise ValidationException(f"Invalid action type: {action.action_type}")
|
||||
|
||||
if action_definition.parameters:
|
||||
for parameter in action_definition.parameters:
|
||||
if parameter not in action.parameters:
|
||||
raise ValidationException(
|
||||
f"Missing parameter '{parameter}' in action: {action}"
|
||||
)
|
||||
|
||||
def get_system_prompt(self) -> str:
|
||||
indentation = " "
|
||||
|
||||
def get_action_definition(action: ActionDefinition) -> str:
|
||||
action_prompt = f"- {action.type}: {action.description}"
|
||||
if action.parameters is not None and len(action.parameters) > 0:
|
||||
params = (",\n" + 2 * indentation).join(
|
||||
f"{k}: {v}" for k, v in action.parameters.items()
|
||||
)
|
||||
parameter_def = (
|
||||
f"{indentation}parameters:\n{indentation}{indentation}{params}"
|
||||
)
|
||||
action_prompt += "\n" + parameter_def
|
||||
return action_prompt
|
||||
|
||||
def get_examples(actions: List[ActionDefinition]) -> list[str]:
|
||||
output_examples = []
|
||||
for action in actions:
|
||||
for example in action.examples:
|
||||
example_type = example["type"]
|
||||
example_description = example["description"]
|
||||
type_str = f'"type": "{example_type}"'
|
||||
description_str = f'"description": "{example_description}"'
|
||||
example_parts = [type_str, description_str]
|
||||
|
||||
if "parameters" in example:
|
||||
params = (",\n" + 2 * indentation).join(
|
||||
f'"{k}": "{v}"' for k, v in example["parameters"].items()
|
||||
)
|
||||
parameters_str = (
|
||||
'"parameters"'
|
||||
+ ": {\n"
|
||||
+ 2 * indentation
|
||||
+ params
|
||||
+ "\n"
|
||||
+ indentation
|
||||
+ "}"
|
||||
)
|
||||
example_parts.append(parameters_str)
|
||||
example_json = (
|
||||
"{\n"
|
||||
+ indentation
|
||||
+ (",\n" + indentation).join(example_parts)
|
||||
+ "\n}"
|
||||
)
|
||||
output_examples.append(example_json)
|
||||
|
||||
return output_examples
|
||||
|
||||
available_actions = "\n\n".join(
|
||||
get_action_definition(action) for action in self.ui_actions.values()
|
||||
)
|
||||
special_actions = "\n\n".join(
|
||||
get_action_definition(action) for action in self.special_actions.values()
|
||||
)
|
||||
examples = "\n\n".join(
|
||||
get_examples(
|
||||
list(self.ui_actions.values()) + list(self.special_actions.values())
|
||||
)
|
||||
)
|
||||
|
||||
return system_template.format(
|
||||
available_actions=available_actions,
|
||||
special_actions=special_actions,
|
||||
examples=examples,
|
||||
)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
agent = ComputerUseAgentInterface()
|
||||
print(agent.get_system_prompt())
|
||||
223
mm_agents/uipath/agent.py
Normal file
223
mm_agents/uipath/agent.py
Normal file
@@ -0,0 +1,223 @@
|
||||
import json
|
||||
from mm_agents.uipath.types_utils import (
|
||||
ComputerUseAction,
|
||||
ComputerUseStep,
|
||||
SupportedActions,
|
||||
PlanActionType,
|
||||
PlanAction,
|
||||
key_maps,
|
||||
ExecutionState,
|
||||
State,
|
||||
)
|
||||
import mm_agents.uipath.utils as utils
|
||||
from mm_agents.uipath.action_planner import ActionPlanner, PlannerOutput
|
||||
from mm_agents.uipath.grounder_client import GrounderClient
|
||||
|
||||
|
||||
class UiPathComputerUseV1(object):
|
||||
def __init__(self):
|
||||
self.planner = ActionPlanner()
|
||||
self.executor = GrounderClient()
|
||||
|
||||
async def predict_request(
|
||||
self, request_body: dict, model_name: str
|
||||
) -> tuple[dict, dict]:
|
||||
state = State(
|
||||
task=request_body["userTask"],
|
||||
image_base64=request_body["image"],
|
||||
previous_steps=request_body.get("previousSteps", []),
|
||||
)
|
||||
|
||||
execution_state = ExecutionState(model_name=model_name, execution_info={})
|
||||
output = await self.predict(state, execution_state)
|
||||
return output
|
||||
|
||||
def process_grounding(
|
||||
self,
|
||||
plan_action: PlanAction,
|
||||
grounding_result: utils.GroundingOutput,
|
||||
x: int,
|
||||
y: int,
|
||||
):
|
||||
match plan_action.action_type:
|
||||
case PlanActionType.Scroll:
|
||||
# guess the scroll direction if missing in the plan output
|
||||
if "direction" not in plan_action.parameters:
|
||||
if "scroll up" in plan_action.description.lower():
|
||||
scroll_direction = "up"
|
||||
else:
|
||||
scroll_direction = "down"
|
||||
else:
|
||||
scroll_direction = plan_action.parameters["direction"]
|
||||
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.Scroll,
|
||||
description=plan_action.description,
|
||||
parameters={"position": [x, y], "direction": scroll_direction},
|
||||
)
|
||||
|
||||
if "distance" in plan_action.parameters:
|
||||
match scroll_direction:
|
||||
case "up":
|
||||
action.parameters["offset"] = [
|
||||
0,
|
||||
plan_action.parameters["distance"],
|
||||
]
|
||||
case "down":
|
||||
action.parameters["offset"] = [
|
||||
0,
|
||||
-plan_action.parameters["distance"],
|
||||
]
|
||||
case "left":
|
||||
action.parameters["offset"] = [
|
||||
plan_action.parameters["distance"],
|
||||
0,
|
||||
]
|
||||
case "right":
|
||||
action.parameters["offset"] = [
|
||||
-plan_action.parameters["distance"],
|
||||
0,
|
||||
]
|
||||
case PlanActionType.Drag:
|
||||
assert grounding_result.end_position is not None, (
|
||||
"End position must be provided for drag action"
|
||||
)
|
||||
x_end, y_end = grounding_result.end_position
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.Drag,
|
||||
description=plan_action.description,
|
||||
parameters={
|
||||
"path": [
|
||||
{"x": x, "y": y},
|
||||
{"x": x_end, "y": y_end},
|
||||
]
|
||||
},
|
||||
)
|
||||
case _:
|
||||
action_name = plan_action.action_type
|
||||
parameters = {"position": [x, y]}
|
||||
|
||||
if plan_action.action_type == PlanActionType.DoubleClick:
|
||||
action_name = SupportedActions.Click
|
||||
parameters["click_type"] = "double"
|
||||
elif plan_action.action_type == PlanActionType.RightClick:
|
||||
action_name = SupportedActions.Click
|
||||
parameters["button"] = "right"
|
||||
elif plan_action.action_type == PlanActionType.MouseMove:
|
||||
action_name = SupportedActions.MouseMove # different names
|
||||
|
||||
assert action_name in [
|
||||
SupportedActions.Click,
|
||||
SupportedActions.MouseMove,
|
||||
]
|
||||
action = ComputerUseAction(
|
||||
name=action_name,
|
||||
description=plan_action.description,
|
||||
parameters=parameters,
|
||||
)
|
||||
return action
|
||||
|
||||
async def predict(
|
||||
self, state: State, execution_state: ExecutionState
|
||||
) -> dict:
|
||||
planer_output: PlannerOutput = self.planner.predict(state, execution_state)
|
||||
plan_action = planer_output.plan_action
|
||||
|
||||
action: ComputerUseAction | None = None
|
||||
step: ComputerUseStep | None = None
|
||||
|
||||
match plan_action.action_type:
|
||||
case PlanActionType.KeyPress:
|
||||
keys = plan_action.parameters["key"].split(" ")
|
||||
keys = [key.strip() for key in keys]
|
||||
keys = [key_maps.get(key, key) for key in keys]
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.KeyPress,
|
||||
description=plan_action.description,
|
||||
parameters={"keys": keys},
|
||||
)
|
||||
case PlanActionType.Wait:
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.Wait,
|
||||
description=plan_action.description,
|
||||
parameters={},
|
||||
)
|
||||
case PlanActionType.ExtractData:
|
||||
# return a step with no action, just to store the extracted data
|
||||
step = ComputerUseStep(
|
||||
description=plan_action.description,
|
||||
actions=[],
|
||||
additional_parameters={
|
||||
"extracted_data": plan_action.parameters,
|
||||
},
|
||||
thought=planer_output.thought,
|
||||
)
|
||||
case PlanActionType.Finish:
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.Finish,
|
||||
description=plan_action.description,
|
||||
parameters=plan_action.parameters,
|
||||
)
|
||||
case (
|
||||
PlanActionType.Click
|
||||
| PlanActionType.MouseMove
|
||||
| PlanActionType.Scroll
|
||||
| PlanActionType.Drag
|
||||
| PlanActionType.DoubleClick
|
||||
| PlanActionType.RightClick
|
||||
):
|
||||
if plan_action.action_type != PlanActionType.Drag:
|
||||
grounding_result = await self.executor.predict(
|
||||
state.image_base64,
|
||||
plan_action.description,
|
||||
action=plan_action.action_type,
|
||||
)
|
||||
else:
|
||||
grounding_result = await self.executor.predict(
|
||||
state.image_base64,
|
||||
plan_action.parameters["start_description"],
|
||||
action=plan_action.action_type,
|
||||
)
|
||||
grounding_result_end = await self.executor.predict(
|
||||
state.image_base64,
|
||||
plan_action.parameters["end_description"],
|
||||
action=plan_action.action_type,
|
||||
)
|
||||
grounding_result.end_position = grounding_result_end.position
|
||||
x, y = grounding_result.position
|
||||
action = self.process_grounding(plan_action, grounding_result, x, y)
|
||||
case PlanActionType.Type:
|
||||
action = ComputerUseAction(
|
||||
name=SupportedActions.TypeInto,
|
||||
description=plan_action.description,
|
||||
parameters={"value": plan_action.parameters["text"]},
|
||||
)
|
||||
|
||||
if step is None:
|
||||
assert action is not None
|
||||
step = ComputerUseStep(
|
||||
description=plan_action.description,
|
||||
actions=[action],
|
||||
additional_parameters={},
|
||||
thought=planer_output.thought,
|
||||
)
|
||||
|
||||
# save additional data for history
|
||||
assert step.additional_parameters is not None
|
||||
step.additional_parameters["thought"] = planer_output.thought
|
||||
step.additional_parameters["review"] = planer_output.review
|
||||
step.additional_parameters.update(planer_output.additional_sections)
|
||||
step.additional_parameters["plan_action"] = json.dumps(plan_action.to_dict())
|
||||
|
||||
history_image = state.image_base64
|
||||
previous_steps_parameters = {
|
||||
"max_chat_history_messages": 1000,
|
||||
"max_chat_history_images": self.planner.number_history_steps_with_images,
|
||||
"image": history_image,
|
||||
}
|
||||
agent_response = {
|
||||
"step": step.to_response_dict(),
|
||||
"previous_steps_parameters": previous_steps_parameters,
|
||||
}
|
||||
|
||||
return agent_response
|
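A hedged usage sketch (not in the diff): how predict_request() might be driven for a single step. The task text, screenshot file, and model name are placeholders, and the call only succeeds once the planner and grounder endpoints are actually configured.

```python
# Sketch only: drives one prediction step with a placeholder request body.
import asyncio
import base64

from mm_agents.uipath.agent import UiPathComputerUseV1

with open("screenshot.png", "rb") as f:           # placeholder screenshot
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

request_body = {
    "userTask": "Open the Downloads folder",       # placeholder task
    "image": image_b64,
    "previousSteps": [],                           # empty history on the first step
}

agent = UiPathComputerUseV1()
response = asyncio.run(
    agent.predict_request(request_body, model_name="gpt-5-mini-2025-08-07")
)
print(response["step"]["actions"])                 # mapped ComputerUse actions
```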
||||
43
mm_agents/uipath/grounder_client.py
Normal file
43
mm_agents/uipath/grounder_client.py
Normal file
@@ -0,0 +1,43 @@
|
||||
import httpx
|
||||
import mm_agents.uipath.utils as utils
|
||||
import os
|
||||
|
||||
class GrounderClient(object):
|
||||
def __init__(self):
|
||||
# Proxy for hosting UI-TARS + UiElementPredictor
|
||||
# Could be replaced with a vLLM server and grounder-specific (UI-TARS) processing
|
||||
# Or any other grounder
|
||||
self.url = ""
|
||||
|
||||
async def predict(
|
||||
self, image_base64: str, action_description: str, action: str | None = None
|
||||
) -> utils.GroundingOutput:
|
||||
request = utils.GroundingRequest(
|
||||
description=action_description,
|
||||
image_base64=image_base64,
|
||||
action_type=action,
|
||||
)
|
||||
api_key = os.getenv("SERVICE_KEY")
|
||||
|
||||
async with httpx.AsyncClient() as client:
|
||||
response = await client.post(
|
||||
self.url,
|
||||
json={
|
||||
"image_base64": request.image_base64,
|
||||
"action_description": request.description,
|
||||
"action": request.action_type,
|
||||
},
|
||||
headers={
|
||||
"X-API-KEY": api_key
|
||||
},
|
||||
timeout=100.0,
|
||||
)
|
||||
|
||||
if response.status_code != 200:
|
||||
raise ValueError(f"Prediction failed: {response.text}")
|
||||
|
||||
data = response.json()
|
||||
return utils.GroundingOutput(
|
||||
description=data["description"],
|
||||
position=tuple(data["position"]),
|
||||
)
|
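A hedged sketch (not in the diff) of how the client might be exercised; the endpoint URL, SERVICE_KEY value, and screenshot are placeholders, since the committed class deliberately leaves self.url empty.

```python
# Sketch only: call the grounder against your own endpoint.
import asyncio
import base64
import os

from mm_agents.uipath.grounder_client import GrounderClient

os.environ.setdefault("SERVICE_KEY", "dummy-key")         # placeholder credential

client = GrounderClient()
client.url = "https://example.com/grounding"              # placeholder endpoint

with open("screenshot.png", "rb") as f:                   # placeholder screenshot
    image_b64 = base64.b64encode(f.read()).decode("utf-8")

result = asyncio.run(
    client.predict(image_b64, "the blue Submit button", action="click")
)
print(result.position)                                    # (x, y) returned by the grounder
```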
||||
BIN
mm_agents/uipath/imgs/element_predictions.png
Normal file
BIN
mm_agents/uipath/imgs/element_predictions.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 798 KiB |
BIN
mm_agents/uipath/imgs/schema.png
Normal file
BIN
mm_agents/uipath/imgs/schema.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 386 KiB |
44
mm_agents/uipath/llm_client.py
Normal file
44
mm_agents/uipath/llm_client.py
Normal file
@@ -0,0 +1,44 @@
|
||||
import os
|
||||
import requests
|
||||
|
||||
def send_messages(payload):
|
||||
# URL to your proxy for calling LLMs
|
||||
proxy_url = ""
|
||||
api_key = os.getenv("SERVICE_KEY")
|
||||
|
||||
# Can be directly replaced with code for calling Azure endpoint as in:
|
||||
# .env config example:
|
||||
# AZURE_OPENAI_API_BASE=YOUR_API_BASE
|
||||
# AZURE_OPENAI_DEPLOYMENT=YOUR_DEPLOYMENT
|
||||
# AZURE_OPENAI_API_VERSION=YOUR_API_VERSION
|
||||
# AZURE_OPENAI_MODEL=gpt-4o-mini
|
||||
# AZURE_OPENAI_API_KEY={{YOUR_API_KEY}}
|
||||
# AZURE_OPENAI_ENDPOINT=${AZURE_OPENAI_API_BASE}/openai/deployments/${AZURE_OPENAI_DEPLOYMENT}/chat/completions?api-version=${AZURE_OPENAI_API_VERSION}
|
||||
|
||||
|
||||
# Load environment variables
|
||||
# load_dotenv()
|
||||
# api_key = os.getenv('AZURE_OPENAI_API_KEY')
|
||||
# openai_endpoint = os.getenv('AZURE_OPENAI_ENDPOINT')
|
||||
# #logger.info("Openai endpoint: %s", openai_endpoint)
|
||||
|
||||
# headers = {
|
||||
# "Content-Type": "application/json",
|
||||
# "api-key": api_key
|
||||
# }
|
||||
# response = requests.post(
|
||||
# openai_endpoint,
|
||||
# headers=headers,
|
||||
# json=payload
|
||||
# )
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"X-API-KEY": api_key
|
||||
}
|
||||
retries = 3
|
||||
for attempt in range(retries):
|
||||
response = requests.post(proxy_url, headers=headers, json=payload)
|
||||
if response.status_code == 200:
|
||||
return response.json()["choices"][0]["message"]["content"]
|
||||
return None
|
||||
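A hedged sketch (not in the diff) of the chat-completions style payload send_messages() forwards; the model name and prompts are placeholders, and proxy_url must be filled in before anything other than None comes back.

```python
# Sketch only: the payload shape expected by send_messages().
payload = {
    "model": "gpt-4o-mini",                # placeholder model
    "temperature": 0.0,
    "messages": [
        {"role": "system", "content": "You are a computer-use planning agent."},
        {"role": "user", "content": "Describe the next UI action."},
    ],
}
# content = send_messages(payload)         # returns the assistant message text, or None
```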
194
mm_agents/uipath/types_utils.py
Normal file
194
mm_agents/uipath/types_utils.py
Normal file
@@ -0,0 +1,194 @@
|
||||
from typing import Optional, Union, List
|
||||
from enum import Enum
|
||||
|
||||
key_maps = {
|
||||
"Backspace": "Back",
|
||||
"Ctrl": "Ctrl",
|
||||
"Shift": "Shift",
|
||||
"Tab": "Tab",
|
||||
"Enter": "Enter",
|
||||
"Escape": "Esc",
|
||||
"Arrowleft": "Left",
|
||||
"Arrowup": "Up",
|
||||
"Arrowright": "Right",
|
||||
"Arrowdown": "Down",
|
||||
"Delete": "Del",
|
||||
"Pageup": "PgUp",
|
||||
"Pagedown": "PgDn",
|
||||
}
|
||||
|
||||
|
||||
class PlanActionType(str, Enum):
|
||||
Click = "click"
|
||||
DoubleClick = "double_click"
|
||||
RightClick = "right_click"
|
||||
Type = "type"
|
||||
Scroll = "scroll"
|
||||
Drag = "drag"
|
||||
Wait = "wait"
|
||||
KeyPress = "key_press"
|
||||
MouseMove = "move_mouse"
|
||||
ExtractData = "extract_data"
|
||||
Finish = "finish"
|
||||
|
||||
|
||||
VALID_PLAN_ACTIONS = [action.value for action in PlanActionType]
|
||||
|
||||
|
||||
class PlanAction:
|
||||
def __init__(
|
||||
self, action_type: str, description: str, parameters: dict | None = None
|
||||
):
|
||||
self.action_type = action_type
|
||||
self.description = description
|
||||
self.parameters = parameters if parameters is not None else {}
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"type": self.action_type,
|
||||
"description": self.description,
|
||||
"parameters": self.parameters,
|
||||
}
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, data: dict | None) -> Union["PlanAction", None]:
|
||||
if data is None:
|
||||
return None
|
||||
|
||||
action_type = data.get("type", "").lower()
|
||||
|
||||
if action_type not in VALID_PLAN_ACTIONS:
|
||||
raise Exception(f"Invalid action type: {action_type}")
|
||||
|
||||
target_element = data.get("target_element", None)
|
||||
|
||||
action = PlanAction(
|
||||
action_type=action_type,
|
||||
description=data.get("description", ""),
|
||||
parameters=data.get("parameters", {}),
|
||||
)
|
||||
|
||||
if target_element is not None:
|
||||
action.parameters["target_element"] = target_element
|
||||
|
||||
return action
|
||||
|
||||
|
||||
class SupportedActions(str, Enum):
|
||||
Click = "click"
|
||||
TypeInto = "type_into"
|
||||
Scroll = "scroll"
|
||||
Drag = "drag"
|
||||
Wait = "wait_load_completed"
|
||||
KeyPress = "keypress"
|
||||
MouseMove = "mouse_move"
|
||||
Finish = "finish"
|
||||
|
||||
def __str__(self):
|
||||
return self.value
|
||||
|
||||
|
||||
class ComputerUseAction(object):
|
||||
def __init__(
|
||||
self,
|
||||
name: str,
|
||||
description: str,
|
||||
parameters: dict,
|
||||
action_id: str | None = None,
|
||||
result: Optional[dict | str] = None,
|
||||
):
|
||||
self.id = action_id
|
||||
self.name = name
|
||||
self.parameters = parameters
|
||||
self.description = description
|
||||
self.result = result
|
||||
|
||||
@staticmethod
|
||||
def from_dict(action_dict: dict):
|
||||
result = action_dict.get("result")
|
||||
if (
|
||||
result is not None
|
||||
and isinstance(result, dict)
|
||||
and "token_usage" in result
|
||||
and "data" in result
|
||||
):
|
||||
result = result["data"]
|
||||
|
||||
return ComputerUseAction(
|
||||
name=action_dict["name"],
|
||||
description=action_dict["description"],
|
||||
result=result,
|
||||
parameters=action_dict.get("parameters", {}),
|
||||
)
|
||||
|
||||
def to_response_dict(self):
|
||||
action_dict = {
|
||||
"description": self.description,
|
||||
"method_type": self.name,
|
||||
"parameters": self.parameters,
|
||||
"id": self.id,
|
||||
}
|
||||
|
||||
if self.result is not None:
|
||||
action_dict["result"] = self.result
|
||||
|
||||
return action_dict
|
||||
|
||||
|
||||
class ComputerUseStep(object):
|
||||
def __init__(
|
||||
self,
|
||||
description: str,
|
||||
actions: List[ComputerUseAction],
|
||||
thought: str | None = None,
|
||||
screen_info: dict | None = None,
|
||||
image: str | None = None,
|
||||
additional_parameters: dict | None = None,
|
||||
):
|
||||
self.description = description
|
||||
self.actions: List[ComputerUseAction] = actions
|
||||
self.thought = thought
|
||||
self.screen_info = screen_info
|
||||
self.additional_parameters = additional_parameters
|
||||
self.image = image
|
||||
|
||||
@staticmethod
|
||||
def from_dict(step_dict):
|
||||
return ComputerUseStep(
|
||||
description=step_dict["description"],
|
||||
thought=step_dict.get("thought"),
|
||||
actions=[
|
||||
ComputerUseAction.from_dict(action_dict)
|
||||
for action_dict in step_dict["actions"]
|
||||
],
|
||||
)
|
||||
|
||||
def to_response_dict(self):
|
||||
response_step = {
|
||||
"description": self.description,
|
||||
"thought": self.thought,
|
||||
"additional_parameters": self.additional_parameters,
|
||||
}
|
||||
response_actions = []
|
||||
|
||||
for action in self.actions:
|
||||
action_dict = action.to_response_dict()
|
||||
response_actions.append(action_dict)
|
||||
if self.image is not None:
|
||||
response_step["image"] = self.image
|
||||
response_step["actions"] = response_actions
|
||||
|
||||
return response_step
|
||||
|
||||
|
||||
class State(object):
|
||||
def __init__(self, task: str, image_base64: str, previous_steps: list):
|
||||
self.task = task
|
||||
self.image_base64 = image_base64
|
||||
self.previous_steps = previous_steps
|
||||
|
||||
|
||||
class ExecutionState(object):
|
||||
def __init__(self, model_name: str, execution_info: dict):
|
||||
self.model_name = model_name
|
||||
self.execution_info = execution_info
|
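A hedged sketch (not in the diff): round-tripping a planner action through PlanAction.from_dict and packaging it as a ComputerUseStep. All values are illustrative.

```python
# Sketch only: PlanAction / ComputerUseStep in isolation.
from mm_agents.uipath.types_utils import (
    ComputerUseAction,
    ComputerUseStep,
    PlanAction,
    SupportedActions,
)

plan = PlanAction.from_dict(
    {
        "type": "click",
        "description": "Click the Save button",
        "target_element": "Save button",            # folded into parameters by from_dict
    }
)
assert plan is not None and plan.parameters["target_element"] == "Save button"

step = ComputerUseStep(
    description=plan.description,
    actions=[
        ComputerUseAction(
            name=SupportedActions.Click,
            description=plan.description,
            parameters={"position": [640, 360]},     # placeholder coordinates
        )
    ],
    additional_parameters={},
)
print(step.to_response_dict()["actions"][0]["method_type"])   # click
```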
||||
57
mm_agents/uipath/utils.py
Normal file
57
mm_agents/uipath/utils.py
Normal file
@@ -0,0 +1,57 @@
|
||||
import json
|
||||
import re
|
||||
|
||||
from json_minify import json_minify
|
||||
from json_repair import repair_json
|
||||
|
||||
|
||||
class ValidationException(Exception):
|
||||
def __init__(self, message: str):
|
||||
self.message = message
|
||||
|
||||
|
||||
def parse_message_json(message: str) -> dict:
|
||||
message = message.strip()
|
||||
code_block_pattern = r"```json\s*([\s\S]+?)```"
|
||||
code_block_match = re.search(code_block_pattern, message, re.DOTALL)
|
||||
|
||||
if code_block_match:
|
||||
json_str = code_block_match.group(1).strip()
|
||||
else:
|
||||
bracket_pattern = r"\{.*\}"
|
||||
bracket_match = re.search(bracket_pattern, message, re.DOTALL)
|
||||
if not bracket_match:
|
||||
raise ValidationException("Response does not have correct json format")
|
||||
json_str = bracket_match.group(0).strip()
|
||||
|
||||
try:
|
||||
json_str = json_minify(json_str)
|
||||
data = json.loads(json_str)
|
||||
except json.JSONDecodeError:
|
||||
try:
|
||||
json_str = repair_json(json_str)
|
||||
data = json.loads(json_str)
|
||||
except json.JSONDecodeError:
|
||||
raise ValidationException("Response does not have correct json format")
|
||||
return data
|
||||
|
||||
|
||||
class GroundingOutput:
|
||||
def __init__(
|
||||
self,
|
||||
description: str,
|
||||
position: tuple[int, int],
|
||||
end_position: tuple[int, int] | None = None,
|
||||
):
|
||||
self.description = description
|
||||
self.position = position
|
||||
self.end_position = end_position
|
||||
|
||||
|
||||
class GroundingRequest:
|
||||
def __init__(
|
||||
self, description: str, image_base64: str, action_type: str | None = None
|
||||
):
|
||||
self.description = description
|
||||
self.image_base64 = image_base64
|
||||
self.action_type = action_type
|
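A hedged sketch (not in the diff) of the repair path in parse_message_json(); the model reply below is invented, and the trailing comma is there on purpose so json_repair has something to fix.

```python
# Sketch only: tolerant parsing of a slightly malformed model reply.
from mm_agents.uipath.utils import parse_message_json

raw_reply = 'Sure. {"type": "click", "description": "Click the OK button",}'
print(parse_message_json(raw_reply))
# {'type': 'click', 'description': 'Click the OK button'}
```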
||||
238
mm_agents/uipath_agent.py
Normal file
238
mm_agents/uipath_agent.py
Normal file
@@ -0,0 +1,238 @@
|
||||
import base64
|
||||
import json
|
||||
from typing import Dict, List
|
||||
import re
|
||||
import asyncio
|
||||
import logging
|
||||
from mm_agents.uipath.agent import UiPathComputerUseV1
|
||||
|
||||
|
||||
def parse_actions_from_string(input_string):
|
||||
if input_string.strip() in ["WAIT", "DONE", "FAIL"]:
|
||||
return [input_string.strip()]
|
||||
actions = []
|
||||
matches = re.findall(r"```json\s+(.*?)\s+```", input_string, re.DOTALL)
|
||||
if matches:
|
||||
try:
|
||||
for match in matches:
|
||||
action_dict = json.loads(match)
|
||||
actions.append(action_dict)
|
||||
return actions
|
||||
except json.JSONDecodeError as e:
|
||||
return f"Failed to parse JSON: {e}"
|
||||
else:
|
||||
matches = re.findall(r"```\s+(.*?)\s+```", input_string, re.DOTALL)
|
||||
if matches:
|
||||
try:
|
||||
for match in matches:
|
||||
action_dict = json.loads(match)
|
||||
actions.append(action_dict)
|
||||
return actions
|
||||
except json.JSONDecodeError as e:
|
||||
return f"Failed to parse JSON: {e}"
|
||||
else:
|
||||
try:
|
||||
action_dict = json.loads(input_string)
|
||||
return [action_dict]
|
||||
except json.JSONDecodeError:
|
||||
raise ValueError("Invalid response format: " + input_string)
|
||||
|
||||
|
||||
def map_key(key):
|
||||
key = key.lower()
|
||||
if key == "space":
|
||||
key = " "
|
||||
elif key == "back":
|
||||
key = "backspace"
|
||||
elif key == "super":
|
||||
key = "win"
|
||||
elif key == "arrowdown":
|
||||
key = "down"
|
||||
elif key == "arrowup":
|
||||
key = "up"
|
||||
elif key == "arrowright":
|
||||
key = "right"
|
||||
elif key == "arrowrleft":
|
||||
key = "left"
|
||||
return key
|
||||
|
||||
|
||||
def map_uipath_agent_actions_to_osworld(actions):
|
||||
results = []
|
||||
|
||||
def handle_click(params):
|
||||
x, y = tuple(params["position"])
|
||||
if "button" in params:
|
||||
if params["button"] == "right":
|
||||
return {"action_type": "RIGHT_CLICK", "x": x, "y": y}
|
||||
elif params["button"] == "left":
|
||||
return {"action_type": "LEFT_CLICK", "x": x, "y": y}
|
||||
else:
|
||||
raise ValueError(f"Unknown click button: {params['button']}")
|
||||
elif "click_type" in params:
|
||||
if params["click_type"] == "double":
|
||||
return {"action_type": "DOUBLE_CLICK", "x": x, "y": y}
|
||||
elif params["click_type"] == "triple":
|
||||
return {"action_type": "TRIPLE_CLICK", "x": x, "y": y}
|
||||
else:
|
||||
raise ValueError(f"Unknown click type: {params['click_type']}")
|
||||
else:
|
||||
return {"action_type": "CLICK", "x": x, "y": y}
|
||||
|
||||
def handle_keypress(params):
|
||||
keys = [map_key(k) for k in params["keys"]]
|
||||
if len(keys) == 1:
|
||||
return {"action_type": "PRESS", "key": keys[0]}
|
||||
return {"action_type": "HOTKEY", "keys": keys}
|
||||
|
||||
def handle_key_event(params, event_type):
|
||||
key = map_key(params["keys"][0])
|
||||
return {"action_type": event_type, "key": key}
|
||||
|
||||
for action in actions:
|
||||
method = action["method_type"].lower()
|
||||
params = action["parameters"]
|
||||
|
||||
match method:
|
||||
case "click":
|
||||
result = handle_click(params)
|
||||
case "type_into":
|
||||
result = {"action_type": "TYPING", "text": params["value"]}
|
||||
case "wait_load_completed":
|
||||
result = "WAIT"
|
||||
case "keypress":
|
||||
result = handle_keypress(params)
|
||||
case "keydown":
|
||||
result = handle_key_event(params, "KEY_DOWN")
|
||||
case "keypup":
|
||||
result = handle_key_event(params, "KEY_UP")
|
||||
case "finish":
|
||||
status_map = {"failure": "FAIL", "success": "DONE"}
|
||||
result = status_map.get(params.get("status"), "DONE")
|
||||
case "scroll":
|
||||
x, y = tuple(params["position"])
|
||||
if "offset" in params:
|
||||
dx, dy = tuple(params["offset"])
|
||||
else:
|
||||
dy = 5 if params["direction"] == "up" else -5
|
||||
dx = 5 if params["direction"] == "left" else -5
|
||||
result = [
|
||||
{"action_type": "MOVE_TO", "x": x, "y": y},
|
||||
{"action_type": "SCROLL", "dx": dx, "dy": dy},
|
||||
]
|
||||
case "mouse_move":
|
||||
x, y = tuple(params["position"])
|
||||
result = {"action_type": "MOVE_TO", "x": x, "y": y}
|
||||
case "drag":
|
||||
path = params["path"]
|
||||
x1, y1 = path[0]["x"], path[0]["y"]
|
||||
x2, y2 = path[1]["x"], path[1]["y"]
|
||||
result = [
|
||||
{"action_type": "MOVE_TO", "x": x1, "y": y1},
|
||||
{"action_type": "DRAG_TO", "x": x2, "y": y2},
|
||||
]
|
||||
case _:
|
||||
raise ValueError(f"Unknown method type: {method}")
|
||||
|
||||
results.append(result)
|
||||
|
||||
return json.dumps(results)
|
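A hedged sketch (not in the diff): mapping one UiPath-style action into the OSWorld computer_13 action space; the coordinates and description are placeholders.

```python
# Sketch only: one double-click action through the mapper above.
from mm_agents.uipath_agent import map_uipath_agent_actions_to_osworld

uipath_actions = [
    {
        "description": "Double click the Files icon",
        "method_type": "click",
        "parameters": {"position": [120, 480], "click_type": "double"},
        "id": None,
    }
]
print(map_uipath_agent_actions_to_osworld(uipath_actions))
# [{"action_type": "DOUBLE_CLICK", "x": 120, "y": 480}]
```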
||||
|
||||
|
||||
class UipathBaseAgent:
|
||||
def __init__(
|
||||
self,
|
||||
platform="ubuntu",
|
||||
model="gpt-5-mini-2025-08-07",
|
||||
action_space="computer_13",
|
||||
observation_type="screenshot",
|
||||
client_password="password",
|
||||
):
|
||||
self.platform = platform
|
||||
self.model = model
|
||||
self.action_space = action_space
|
||||
self.observation_type = observation_type
|
||||
self.client_password = client_password
|
||||
self.uipath_computer_use_model = UiPathComputerUseV1()
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.uipath_hist = []
|
||||
|
||||
def update_history(self, rsp, img_base64):
|
||||
self.uipath_hist.append(
|
||||
{
|
||||
"actions": rsp["step"]["actions"],
|
||||
"description": rsp["step"]["description"],
|
||||
"additional_parameters": {
|
||||
"review": rsp["step"]["additional_parameters"]["review"],
|
||||
"thought": rsp["step"]["additional_parameters"]["thought"],
|
||||
"action_description": rsp["step"]["additional_parameters"][
|
||||
"action_description"
|
||||
],
|
||||
"plan_action": rsp["step"]["additional_parameters"]["plan_action"],
|
||||
},
|
||||
"image": img_base64,
|
||||
}
|
||||
)
|
||||
|
||||
def predict(self, instruction: str, obs: Dict, args, step_idx) -> List:
|
||||
if step_idx == args.max_steps - 1:
|
||||
message = (
|
||||
instruction
|
||||
+ "The sudo password is password, if needed. This is the last step, you must return the finish actions with either success or failure, depending on the result. No further steps are allowed."
|
||||
)
|
||||
else:
|
||||
message = instruction + "The sudo password is password, if needed."
|
||||
img_base64 = base64.b64encode(obs["screenshot"]).decode("utf-8")
|
||||
payload = {
|
||||
"previousSteps": self.uipath_hist,
|
||||
"userTask": message,
|
||||
"image": img_base64,
|
||||
"model_name": args.uipath_model_name,
|
||||
}
|
||||
rsp = asyncio.run(
|
||||
self.uipath_computer_use_model.predict_request(
|
||||
payload, args.uipath_model_name
|
||||
)
|
||||
)
|
||||
self.update_history(rsp, img_base64)
|
||||
|
||||
uipath_actions = map_uipath_agent_actions_to_osworld(rsp["step"]["actions"])
|
||||
try:
|
||||
actions = self.parse_actions(uipath_actions)
|
||||
self.thoughts.append(rsp)
|
||||
except ValueError as e:
|
||||
print("Failed to parse action from response", e)
|
||||
actions = []
|
||||
self.thoughts.append("")
|
||||
|
||||
if len(actions) != 0:
|
||||
while actions and isinstance(actions[0], list):
|
||||
actions = [
|
||||
action for multi_action in actions for action in multi_action
|
||||
]
|
||||
return rsp["step"], actions
|
||||
|
||||
def parse_actions(self, response: str, masks=None):
|
||||
if self.observation_type in ["screenshot"]:
|
||||
if self.action_space == "computer_13":
|
||||
actions = parse_actions_from_string(response)
|
||||
else:
|
||||
raise ValueError("Invalid action space: " + self.action_space)
|
||||
self.actions.append(actions)
|
||||
return actions
|
||||
else:
|
||||
raise ValueError("Invalid observation type: " + self.action_space)
|
||||
|
||||
def reset(self, _logger=None):
|
||||
global logger
|
||||
logger = (
|
||||
_logger if _logger is not None else logging.getLogger("desktopenv.agent")
|
||||
)
|
||||
|
||||
self.thoughts = []
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.uipath_hist = []
|
||||
@@ -506,17 +506,18 @@ class UITARSAgent:
|
||||
if last_action_after_obs is not None and self.infer_mode == "double_image":
|
||||
self.history_images.append(last_action_after_obs["screenshot"])
|
||||
|
||||
self.history_images.append(obs["screenshot"])
|
||||
|
||||
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
|
||||
base64_image = obs["screenshot"]
|
||||
self.history_images.append(obs["screenshot"])
|
||||
|
||||
if self.observation_type in ["screenshot", "screenshot_a11y_tree", "a11y_tree"]:
|
||||
base64_image = obs["screenshot"] if self.observation_type in ["screenshot", "screenshot_a11y_tree"] else None
|
||||
try:
|
||||
linearized_accessibility_tree = (
|
||||
linearize_accessibility_tree(
|
||||
accessibility_tree=obs["accessibility_tree"],
|
||||
platform=self.platform,
|
||||
)
|
||||
if self.observation_type == "screenshot_a11y_tree"
|
||||
if self.observation_type in ["screenshot_a11y_tree", "a11y_tree"]
|
||||
else None
|
||||
)
|
||||
except:
|
||||
@@ -535,7 +536,14 @@ class UITARSAgent:
|
||||
"accessibility_tree": linearized_accessibility_tree,
|
||||
}
|
||||
)
|
||||
else:
|
||||
elif self.observation_type == "a11y_tree":
|
||||
self.observations.append(
|
||||
{
|
||||
"screenshot": None,
|
||||
"accessibility_tree": linearized_accessibility_tree,
|
||||
}
|
||||
)
|
||||
else: # screenshot
|
||||
self.observations.append(
|
||||
{"screenshot": base64_image, "accessibility_tree": None}
|
||||
)
|
||||
@@ -760,4 +768,4 @@ class UITARSAgent:
|
||||
self.actions = []
|
||||
self.observations = []
|
||||
self.history_images = []
|
||||
self.history_responses = []
|
||||
self.history_responses = []
|
||||
|
||||
@@ -68,3 +68,5 @@ anthropic
|
||||
alibabacloud_ecs20140526
|
||||
alibabacloud_tea_openapi
|
||||
alibabacloud_tea_util
|
||||
json_minify
|
||||
json_repair
|
||||
608
run_autoglm_v.py
Normal file
608
run_autoglm_v.py
Normal file
@@ -0,0 +1,608 @@
|
||||
"""Script to run end-to-end evaluation on the benchmark.
|
||||
Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import math
|
||||
import ast
|
||||
import time
|
||||
import backoff
|
||||
import httpx
|
||||
import requests
|
||||
from openai import APIConnectionError, APIError, RateLimitError
|
||||
from requests.exceptions import SSLError
|
||||
from tqdm import tqdm
|
||||
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import MAX_RETRIES, DesktopEnv as DesktopEnvBase
|
||||
from mm_agents.autoglm_v import AutoGLMAgent
|
||||
from typing import Optional, Dict, Any
|
||||
from openai import OpenAI
|
||||
|
||||
# Almost deprecated since it's not multi-env, use run_multienv_*.py instead
|
||||
|
||||
# Logger Configs {{{ #
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(logging.INFO)
|
||||
sdebug_handler.setLevel(logging.DEBUG)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
sdebug_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
sdebug_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
logger.addHandler(sdebug_handler)
|
||||
# }}} Logger Configs #
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--path_to_vm", type=str)
|
||||
parser.add_argument(
|
||||
"--provider_name",
|
||||
type=str,
|
||||
default="docker",
|
||||
help="Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)",
|
||||
)
|
||||
parser.add_argument("--headless", action="store_true", default=True, help="Run in headless machine")
|
||||
parser.add_argument("--action_space", type=str, default="autoglm_computer_use", help="Action type")
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="a11y_tree",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--screen_width", type=int, default=1920)
|
||||
parser.add_argument("--screen_height", type=int, default=1080)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=1.0)
|
||||
parser.add_argument("--max_steps", type=int, default=50)
|
||||
|
||||
# agent config
|
||||
parser.add_argument("--max_trajectory_length", type=int, default=3)
|
||||
parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples/examples")
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="autoglm-os")
|
||||
parser.add_argument("--temperature", type=float, default=0.4)
|
||||
parser.add_argument("--top_p", type=float, default=0.5)
|
||||
parser.add_argument("--max_tokens", type=int, default=4096)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
parser.add_argument("--image_width", type=int, default=1280)
|
||||
parser.add_argument("--image_height", type=int, default=720)
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_nogdrive.json")
|
||||
|
||||
# aws config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--client_password", type=str, default="", help="Client password"
|
||||
)
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
|
||||
class DesktopEnv(DesktopEnvBase):
|
||||
def step(self, action, pause=2):
|
||||
self._step_no += 1
|
||||
self.action_history.append(action)
|
||||
|
||||
# Mark environment as used when step is called
|
||||
self.is_environment_used = True
|
||||
|
||||
reward = 0 # todo: Define reward calculation for each example
|
||||
done = False # todo: Define episode termination condition for each example
|
||||
info = {}
|
||||
logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
|
||||
|
||||
# handle the special actions
|
||||
if action in ['WAIT', 'FAIL', 'DONE']:
|
||||
if action == 'WAIT':
|
||||
time.sleep(pause)
|
||||
exe_result = 'Wait ' + str(pause) + ' seconds'
|
||||
elif action == 'FAIL':
|
||||
done = True
|
||||
info = {"fail": True}
|
||||
exe_result = 'Finish: fail'
|
||||
elif action == 'DONE':
|
||||
done = True
|
||||
info = {"done": True}
|
||||
exe_result = 'Finish: success'
|
||||
elif type(action) == dict:
|
||||
if action['action_type'] == 'OPEN_APP':
|
||||
self.setup_controller._launch_setup(action['parameters']['launch_app_command'], shell=True)
|
||||
exe_result = 'Open ' + action['parameters']['app_name']
|
||||
elif action['action_type'] == 'OPEN_CHROME_TAB':
|
||||
self.setup_controller._chrome_open_tabs_setup(action['parameters']['urls_to_open'])
|
||||
exe_result = 'Open ' + str(action['parameters']['urls_to_open']) + ' in Chrome successfully'
|
||||
else:
|
||||
# the set of all possible python commands inside `pyautogui`
|
||||
result = self.controller.execute_python_command(action)
|
||||
try:
|
||||
if result['error']:
|
||||
exe_result = result['error'].strip()
|
||||
else:
|
||||
exe_result = result['output'].strip()
|
||||
except Exception as e:
|
||||
exe_result = 'Error Action: ' + action
|
||||
logger.error(f"Error executing action: {e}")
|
||||
|
||||
time.sleep(pause)
|
||||
observation = self._get_obs()
|
||||
observation['exe_result'] = exe_result
|
||||
|
||||
return observation, reward, done, info
|
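A hedged sketch (not in the diff) of the special dict-shaped action this step() override accepts, next to the plain "WAIT"/"DONE"/"FAIL" strings; the launch command and app name are placeholders.

```python
# Sketch only: an OPEN_APP action as handled by the branch above.
open_app_action = {
    "action_type": "OPEN_APP",
    "parameters": {
        "launch_app_command": "gnome-calculator &",   # placeholder command
        "app_name": "Calculator",                     # placeholder app name
    },
}
# obs, reward, done, info = env.step(open_app_action)
```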
||||
|
||||
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
|
||||
# Reset to certain task in OSWorld
|
||||
logger.info("Resetting environment...")
|
||||
logger.info("Switching task...")
|
||||
logger.info("Setting counters...")
|
||||
self._traj_no += 1
|
||||
self._step_no = 0
|
||||
self.action_history.clear()
|
||||
|
||||
for attempt in range(MAX_RETRIES):
|
||||
# Only revert to snapshot if environment has been used (step/setup)
|
||||
# This optimization is especially important for cloud providers like AWS
|
||||
# where unnecessary snapshot operations are costly and time-consuming
|
||||
|
||||
if task_config is not None:
|
||||
# Only consider task proxy requirement if proxy is enabled at system level
|
||||
task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
|
||||
if not self.enable_proxy and task_config.get("proxy", False):
|
||||
logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
|
||||
|
||||
if task_use_proxy != self.current_use_proxy:
|
||||
# keep because get_info_from_website depends on this
|
||||
self.current_use_proxy = task_use_proxy
|
||||
|
||||
if self.is_environment_used:
|
||||
logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
|
||||
self._revert_to_snapshot()
|
||||
logger.info("Starting emulator...")
|
||||
self._start_emulator()
|
||||
logger.info("Emulator started.")
|
||||
# Reset the usage flag after reverting
|
||||
self.is_environment_used = False
|
||||
else:
|
||||
logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))
|
||||
|
||||
if task_config is not None:
|
||||
if task_config.get("proxy", False) and self.enable_proxy:
|
||||
# If using proxy and proxy is enabled, set up the proxy configuration
|
||||
self.setup_controller._proxy_setup(self.client_password)
|
||||
self._set_task_info(task_config)
|
||||
self.setup_controller.reset_cache_dir(self.cache_dir)
|
||||
logger.info("Setting up environment...")
|
||||
success = self.setup_controller.setup(self.config, task_config.get("proxy", False) and self.enable_proxy)
|
||||
if success:
|
||||
# Mark environment as used when setup is successfully executed
|
||||
if self.config: # Only mark as used if there were actual setup operations
|
||||
self.is_environment_used = True
|
||||
break
|
||||
else:
|
||||
logger.error(
|
||||
"Environment setup failed, retrying (%d/%d)...",
|
||||
attempt + 1,
|
||||
MAX_RETRIES,
|
||||
)
|
||||
time.sleep(5)
|
||||
else:
|
||||
break
|
||||
|
||||
logger.info("Environment setup complete.")
|
||||
|
||||
# Upload tools from autoglm package
|
||||
import mm_agents.autoglm_v
|
||||
tool_dir = os.path.join(os.path.dirname(mm_agents.autoglm_v.__file__), 'tools', 'package')
|
||||
for file in os.listdir(tool_dir):
|
||||
if os.path.isdir(os.path.join(tool_dir, file)):
|
||||
continue
|
||||
self.setup_controller._upload_file_setup([{
|
||||
"local_path": os.path.join(tool_dir, file),
|
||||
"path": os.path.join('~', file)
|
||||
}])
|
||||
|
||||
# start soffice service for office tools
|
||||
self.setup_controller._launch_setup('soffice --accept="socket,host=localhost,port=2002;urp;" --norestore --nologo --nodefault', shell=True)
|
||||
time.sleep(5)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
|
||||
|
||||
def get_current_apps(self):
|
||||
apps_code = r"""import subprocess;
|
||||
command = "wmctrl -xl";
|
||||
apps = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip().split('\n');
|
||||
print(apps);"""
|
||||
window_code = r"""import subprocess;
|
||||
command = "wmctrl -a :ACTIVE: -v 2>&1 | grep 'Using window' | awk '{print $3}'";
|
||||
window_id = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip();
|
||||
print(window_id);"""
|
||||
|
||||
apps = self.controller.execute_python_command(apps_code)['output'].strip()
|
||||
apps = ast.literal_eval(apps)
|
||||
app_list = {}
|
||||
|
||||
for app in apps:
|
||||
parts = app.split(maxsplit=4)
|
||||
if len(parts) < 4:
|
||||
continue
|
||||
if parts[1] != '0':
|
||||
continue
|
||||
window_id = parts[0]
|
||||
app_name = '.'.join(parts[2].split('.')[-(math.ceil(parts[2].count('.') / 2)):])
|
||||
title = parts[3]
|
||||
app_list[window_id] = {
|
||||
'app_name': app_name,
|
||||
'title': title
|
||||
}
|
||||
|
||||
cur_id = self.controller.execute_python_command(window_code)['output'].strip()
|
||||
|
||||
return app_list, cur_id
|
||||
|
||||
def maximize_window(self):
|
||||
window_state = r"""import subprocess;
|
||||
command = "xprop -id $(xprop -root _NET_ACTIVE_WINDOW | awk -F' ' '{print $5}') _NET_WM_STATE"
|
||||
output = subprocess.run(command, shell=True, capture_output=True, text=True).stdout.strip();
|
||||
print(output);"""
|
||||
for _ in range(5):
|
||||
try:
|
||||
self.setup_controller._launch_setup('wmctrl -r :ACTIVE: -b add,maximized_vert,maximized_horz', shell=True)
|
||||
time.sleep(2)
|
||||
output = self.controller.execute_python_command(window_state)['output'].strip()
|
||||
if '_NET_WM_STATE_FOCUSED' not in output or '_NET_WM_STATE_SKIP_TASKBAR' in output or '_NET_WM_STATE_MODAL' in output or '_NET_WM_STATE_MAXIMIZED' in output: # no active window, a popup, a modal window, or the window is already maximized
|
||||
return
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to maximize window: {e}")
|
||||
time.sleep(1)
|
||||
|
||||
def _get_obs(self):
|
||||
tool_list = {
|
||||
"libreoffice_calc": "CalcTools",
|
||||
"libreoffice_impress": "ImpressTools",
|
||||
"libreoffice_writer": "WriterTools",
|
||||
"code": "CodeTools",
|
||||
"vlc": "VLCTools",
|
||||
"google_chrome": "BrowserTools"
|
||||
}
|
||||
|
||||
self.maximize_window()
|
||||
|
||||
for i in range(3):
|
||||
try:
|
||||
app_list, cur_id = self.get_current_apps()
break
|
||||
except Exception as e:
|
||||
if i == 2:
|
||||
raise e
|
||||
logger.error(f"Failed to get current apps: {e}")
|
||||
time.sleep(1)
|
||||
|
||||
if cur_id in app_list:
|
||||
cur_app = app_list[cur_id]['app_name']
|
||||
|
||||
tool_name = cur_app.strip().lower().replace('-', '_')
|
||||
if tool_name in tool_list:
|
||||
class_name = tool_list[tool_name]
|
||||
command = f"from {tool_name} import *; "
|
||||
command += f"{class_name}.env_info(); "
|
||||
command += f"{class_name}.print_result();"
|
||||
app_info = self.controller.execute_python_command(command)['output'].strip()
|
||||
else:
|
||||
app_info = None
|
||||
else:
|
||||
cur_app = None
|
||||
app_info = None
|
||||
|
||||
tree = self.controller.get_accessibility_tree()
|
||||
screenshot = self.controller.get_screenshot()
|
||||
if screenshot is None:
|
||||
logger.error("Failed to get screenshot.")
|
||||
screenshot = b''
|
||||
|
||||
return {
|
||||
"screenshot": screenshot,
|
||||
"accessibility_tree": tree,
|
||||
"instruction": self.instruction,
|
||||
"apps": app_list,
|
||||
"cur_window_id": cur_id,
|
||||
"cur_app": cur_app,
|
||||
"app_info": app_info,
|
||||
}
|
||||
|
||||
|
||||
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
|
||||
scores = []
|
||||
max_steps = args.max_steps
|
||||
|
||||
# log args
|
||||
logger.info("Args: %s", args)
|
||||
# set wandb project
|
||||
cfg_args = {
|
||||
"path_to_vm": args.path_to_vm,
|
||||
"provider_name": args.provider_name,
|
||||
"headless": args.headless,
|
||||
"action_space": args.action_space,
|
||||
"observation_type": args.observation_type,
|
||||
"screen_width": args.screen_width,
|
||||
"screen_height": args.screen_height,
|
||||
"sleep_after_execution": args.sleep_after_execution,
|
||||
"max_steps": args.max_steps,
|
||||
"max_trajectory_length": args.max_trajectory_length,
|
||||
"model": args.model,
|
||||
"temperature": args.temperature,
|
||||
"top_p": args.top_p,
|
||||
"max_tokens": args.max_tokens,
|
||||
"stop_token": args.stop_token,
|
||||
"result_dir": args.result_dir,
|
||||
}
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.constant,
|
||||
(RateLimitError, APIConnectionError),
|
||||
interval=0.1,
|
||||
)
|
||||
def call_llm(messages):
|
||||
logger.info("Calling LLM...")
|
||||
|
||||
# Prepare the request data
|
||||
data = {
|
||||
"model": args.model,
|
||||
"messages": messages,
|
||||
"max_tokens": args.max_tokens,
|
||||
"temperature": args.temperature,
|
||||
"top_p": args.top_p,
|
||||
"skip_special_tokens": False,
|
||||
"stream": False,
|
||||
"include_stop_str_in_output": True,
|
||||
"stop": ["<|user|>", "<|observation|>", "</answer>"]
|
||||
}
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}"
|
||||
}
|
||||
|
||||
# Get API base URL from environment or use default
|
||||
base_url = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
|
||||
url = f"{base_url}/chat/completions"
|
||||
|
||||
response = requests.post(
|
||||
url,
|
||||
json=data,
|
||||
headers=headers,
|
||||
timeout=60.0
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
logger.info("LLM called successfully.")
|
||||
return result['choices'][0]['message']['content']
|
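A hedged sketch (not in the diff) of the environment call_llm() reads; the base URL points at a hypothetical local OpenAI-compatible (e.g. vLLM) server and the key is a placeholder.

```python
# Sketch only: environment needed before test() builds its requests.
import os

os.environ["OPENAI_BASE_URL"] = "http://localhost:8000/v1"   # placeholder endpoint
os.environ["OPENAI_API_KEY"] = "EMPTY"                        # placeholder key

messages = [{"role": "user", "content": "Return the word pong."}]
# reply = call_llm(messages)   # only callable inside test(), once env is set
```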
||||
|
||||
env = DesktopEnv(
|
||||
provider_name=args.provider_name,
|
||||
region=args.region,
|
||||
client_password=args.client_password,
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
screen_size=(args.screen_width, args.screen_height),
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
)
|
||||
agent = AutoGLMAgent(
|
||||
action_space=args.action_space,
|
||||
observation_type=args.observation_type,
|
||||
screen_size=(args.screen_width, args.screen_height),
|
||||
image_size=(args.image_width, args.image_height),
|
||||
max_trajectory_length=args.max_trajectory_length,
|
||||
client_password=args.client_password,
|
||||
gen_func=call_llm,
|
||||
)
|
||||
|
||||
for domain in tqdm(test_all_meta, desc="Domain"):
|
||||
for example_id in tqdm(test_all_meta[domain], desc="Example", leave=False):
|
||||
config_file = os.path.join(args.test_config_base_dir, f"{domain}/{example_id}.json")
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
|
||||
logger.info(f"[Domain]: {domain}")
|
||||
logger.info(f"[Example ID]: {example_id}")
|
||||
|
||||
instruction = example["instruction"]
|
||||
|
||||
logger.info(f"[Instruction]: {instruction}")
|
||||
# wandb each example config settings
|
||||
cfg_args["instruction"] = instruction
|
||||
cfg_args["start_time"] = datetime.datetime.now().strftime("%Y:%m:%d-%H:%M:%S")
|
||||
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
# example start running
|
||||
try:
|
||||
lib_run_single.run_single_example_autoglm(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
max_steps,
|
||||
instruction,
|
||||
args,
|
||||
example_result_dir,
|
||||
scores,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"Exception in {domain}/{example_id}: {e}")
|
||||
# Only attempt to end recording if controller exists (not Docker provider)
|
||||
if hasattr(env, "controller") and env.controller is not None:
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({"Error": f"Time limit exceeded in {domain}/{example_id}"}))
|
||||
f.write("\n")
|
||||
|
||||
env.close()
|
||||
logger.info(f"Average score: {sum(scores) / len(scores)}")
|
||||
|
||||
|
||||
def get_unfinished(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
|
||||
if not os.path.exists(target_dir):
|
||||
return total_file_json
|
||||
|
||||
finished = {}
|
||||
for domain in os.listdir(target_dir):
|
||||
finished[domain] = []
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
if example_id == "onboard":
|
||||
continue
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" not in os.listdir(example_path):
|
||||
# empty all files under example_id
|
||||
for file in os.listdir(example_path):
|
||||
os.remove(os.path.join(example_path, file))
|
||||
else:
|
||||
finished[domain].append(example_id)
|
||||
|
||||
if not finished:
|
||||
return total_file_json
|
||||
|
||||
for domain, examples in finished.items():
|
||||
if domain in total_file_json:
|
||||
total_file_json[domain] = [x for x in total_file_json[domain] if x not in examples]
|
||||
|
||||
return total_file_json
|
||||
|
||||
|
||||
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
if not os.path.exists(target_dir):
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
|
||||
all_result = []
|
||||
|
||||
for domain in os.listdir(target_dir):
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" in os.listdir(example_path):
|
||||
result_path = os.path.join(example_path, "result.txt")
|
||||
try:
|
||||
with open(result_path, "r") as rf:
|
||||
res = rf.read().strip()
|
||||
if res.lower() == "true":
|
||||
score = 1.0
|
||||
else:
|
||||
score = float(res)
|
||||
except Exception:
|
||||
score = 0.0
|
||||
all_result.append(score)
|
||||
|
||||
if not all_result:
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
else:
|
||||
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
|
||||
return all_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
args = config()
|
||||
if args.client_password == "":
|
||||
if args.provider_name == "aws":
|
||||
args.client_password = "osworld-public-evaluation"
|
||||
else:
|
||||
args.client_password = "password"
|
||||
else:
|
||||
args.client_password = args.client_password
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test(args, test_file_list)
|
||||
@@ -31,7 +31,7 @@ if MAESTRO_ENV_PATH.exists():
|
||||
load_dotenv(dotenv_path=MAESTRO_ENV_PATH)
|
||||
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
logger.setLevel(logging.INFO)
|
||||
|
||||
vm_datetime_str: str = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
|
||||
@@ -51,9 +51,9 @@ sdebug_handler = logging.FileHandler(
|
||||
)
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
debug_handler.setLevel(logging.INFO)
|
||||
stdout_handler.setLevel(logging.INFO)
|
||||
sdebug_handler.setLevel(logging.DEBUG)
|
||||
sdebug_handler.setLevel(logging.INFO)
|
||||
|
||||
# Safe logging filter
|
||||
safe_filter = SafeLoggingFilter()
|
||||
|
||||
294
run_multienv_autoglm_v.py
Normal file
294
run_multienv_autoglm_v.py
Normal file
@@ -0,0 +1,294 @@
|
||||
"""Script to run end-to-end evaluation on the benchmark.
|
||||
Utils and basic architecture credit to https://github.com/web-arena-x/webarena/blob/main/run.py.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import math
|
||||
import ast
|
||||
import time
|
||||
import backoff
|
||||
import httpx
|
||||
import requests
|
||||
from tqdm import tqdm
|
||||
from typing import Optional, Dict, Any
|
||||
from multiprocessing import Pool
|
||||
from openai import APIConnectionError, APIError, RateLimitError
|
||||
from types import SimpleNamespace
|
||||
|
||||
import lib_run_single
|
||||
from run_autoglm_v import DesktopEnv, get_unfinished, get_result
|
||||
from desktop_env.desktop_env import MAX_RETRIES, DesktopEnv as DesktopEnvBase
|
||||
from mm_agents.autoglm_v import AutoGLMAgent
|
||||
from openai import OpenAI
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(description="Run end-to-end evaluation on the benchmark")
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--path_to_vm", type=str)
|
||||
parser.add_argument(
|
||||
"--provider_name",
|
||||
type=str,
|
||||
default="docker",
|
||||
help="Virtualization provider (vmware, docker, aws, azure, gcp, virtualbox)",
|
||||
)
|
||||
parser.add_argument("--headless", action="store_true", default=True, help="Run in headless machine")
|
||||
parser.add_argument("--action_space", type=str, default="autoglm_computer_use", help="Action type")
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="a11y_tree",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--screen_width", type=int, default=1920)
|
||||
parser.add_argument("--screen_height", type=int, default=1080)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=1.0)
|
||||
parser.add_argument("--max_steps", type=int, default=30)
|
||||
|
||||
# agent config
|
||||
parser.add_argument("--max_trajectory_length", type=int, default=3)
|
||||
parser.add_argument("--test_config_base_dir", type=str, default="evaluation_examples/examples")
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="autoglm-os")
|
||||
parser.add_argument("--temperature", type=float, default=0.4)
|
||||
parser.add_argument("--top_p", type=float, default=0.5)
|
||||
parser.add_argument("--max_tokens", type=int, default=2048)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
parser.add_argument("--image_width", type=int, default=1280)
|
||||
parser.add_argument("--image_height", type=int, default=720)
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument("--test_all_meta_path", type=str, default="evaluation_examples/test_nogdrive.json")
|
||||
|
||||
# aws config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument("--client_password", type=str, default="", help="Client password")
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
|
||||
# parallel number
|
||||
parser.add_argument("--num_workers", type=int, default=20, help="Number of parallel workers")
|
||||
args = parser.parse_args()
|
||||
|
||||
return args
|
||||
|
||||
def _worker_run(task):
|
||||
domain, example_id, args = task  # args is an argparse.Namespace
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
try:
|
||||
config_file = os.path.join(args.test_config_base_dir, f"{domain}/{example_id}.json")
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
instruction = example["instruction"]
|
||||
|
||||
@backoff.on_exception(backoff.constant, (RateLimitError, APIConnectionError), interval=0.1)
|
||||
def call_llm(messages):
|
||||
logger.info("Calling LLM...")
|
||||
|
||||
# Prepare the request data
|
||||
data = {
|
||||
"model": args.model,
|
||||
"messages": messages,
|
||||
"max_tokens": args.max_tokens,
|
||||
"temperature": args.temperature,
|
||||
"top_p": args.top_p,
|
||||
"skip_special_tokens": False,
|
||||
"stream": False,
|
||||
"include_stop_str_in_output": True,
|
||||
"stop": ["<|user|>", "<|observation|>", "</answer>"]
|
||||
}
|
||||
|
||||
# Set up proxy
|
||||
# if os.environ.get('LAN_PROXY', None):
|
||||
# proxies = {
|
||||
# "http": os.environ.get('LAN_PROXY'),
|
||||
# "https": os.environ.get('LAN_PROXY')
|
||||
# }
|
||||
# else:
|
||||
# proxies = None
|
||||
|
||||
headers = {
|
||||
"Content-Type": "application/json",
|
||||
"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY', '')}"
|
||||
}
|
||||
|
||||
# Get API base URL from environment or use default
|
||||
base_url = os.environ.get('OPENAI_BASE_URL', 'https://api.openai.com/v1')
|
||||
url = f"{base_url}/chat/completions"
|
||||
|
||||
response = requests.post(
|
||||
url,
|
||||
json=data,
|
||||
headers=headers,
|
||||
# proxies=proxies,
|
||||
timeout=60.0
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
result = response.json()
|
||||
logger.info("LLM called successfully.")
|
||||
return result['choices'][0]['message']['content']
|
||||
|
||||
env = DesktopEnv(
|
||||
provider_name=args.provider_name,
|
||||
region=args.region,
|
||||
client_password=args.client_password,
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
screen_size=(args.screen_width, args.screen_height),
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
)
|
||||
agent = AutoGLMAgent(
|
||||
action_space=args.action_space,
|
||||
observation_type=args.observation_type,
|
||||
screen_size=(args.screen_width, args.screen_height),
|
||||
image_size=(args.image_width, args.image_height),
|
||||
max_trajectory_length=args.max_trajectory_length,
|
||||
client_password=args.client_password,
|
||||
gen_func=call_llm,
|
||||
)
|
||||
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
|
||||
local_scores = []
|
||||
try:
|
||||
lib_run_single.run_single_example_autoglm(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
args.max_steps,
|
||||
instruction,
|
||||
args,
|
||||
example_result_dir,
|
||||
local_scores,
|
||||
)
|
||||
except Exception as e:
|
||||
logger.error(f"[并发任务异常] {domain}/{example_id}: {e}")
|
||||
if hasattr(env, "controller") and env.controller is not None:
|
||||
try:
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
except Exception:
|
||||
pass
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({"Error": f"Exception in {domain}/{example_id}: {str(e)}"}) + "\n")
|
||||
finally:
|
||||
try:
|
||||
env.close()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
score = None
|
||||
result_path = os.path.join(example_result_dir, "result.txt")
|
||||
if os.path.exists(result_path):
|
||||
try:
|
||||
with open(result_path, "r") as rf:
|
||||
res = rf.read().strip()
|
||||
if res.lower() == "true":
|
||||
score = 1.0
|
||||
else:
|
||||
score = float(res)
|
||||
except Exception:
|
||||
score = 0.0
|
||||
else:
|
||||
score = 0.0
|
||||
logger.info(f"[Finish] {domain}/{example_id} score={score}")
|
||||
return (domain, example_id, score)
|
||||
except Exception as e:
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
logger.error(f"[Initializing Fail] {domain}/{example_id}: {e}")
|
||||
return (domain, example_id, 0.0)
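# Each worker returns a (domain, example_id, score) tuple, e.g. a hypothetical
# ("chrome", "<example_id>", 1.0) on success or ("chrome", "<example_id>", 0.0)
# when result.txt is missing or environment setup fails.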
|
||||
|
||||
def test_parallel(args: argparse.Namespace, test_all_meta: dict):
|
||||
tasks = []
|
||||
for domain in test_all_meta:
|
||||
for example_id in test_all_meta[domain]:
|
||||
tasks.append((domain, example_id, args))
|
||||
if not tasks:
|
||||
logger.info("No pending tasks")
|
||||
return
|
||||
logger.info(f"Starting parallel execution: {args.num_workers} processes, {len(tasks)} tasks total")
|
||||
|
||||
results = []
|
||||
with Pool(processes=args.num_workers) as pool:
|
||||
for res in tqdm(pool.imap_unordered(_worker_run, tasks), total=len(tasks), desc="Parallel execution"):
|
||||
results.append(res)
|
||||
|
||||
scores = [s for (_, _, s) in results if s is not None]
|
||||
if scores:
|
||||
avg = sum(scores) / len(scores)
|
||||
logger.info(f"Parallel execution completed. Average score: {avg}")
|
||||
else:
|
||||
logger.info("No scores obtained.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
args = config()
|
||||
if args.client_password == "":
    if args.provider_name == "aws":
        args.client_password = "osworld-public-evaluation"
    else:
        args.client_password = "password"
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test_parallel(args, test_file_list)
|
||||
740
run_multienv_aworldguiagent.py
Normal file
@@ -0,0 +1,740 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import time
|
||||
from typing import List, Dict, Any, Optional
|
||||
import math
|
||||
from tqdm import tqdm
|
||||
from multiprocessing import Process, Manager, Queue
|
||||
from multiprocessing import current_process
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import DesktopEnv, _fix_pyautogui_less_than_bug
|
||||
from mm_agents.aworldguiagent.agent import AworldGUIAgent
|
||||
from mm_agents.aworldguiagent.grounding import OSWorldACI
|
||||
|
||||
MAX_RETRIES = 5 # Maximum retries for environment setup
|
||||
|
||||
# Global variables for signal handling
|
||||
active_environments = []
|
||||
processes = []
|
||||
is_terminating = False
|
||||
|
||||
# import wandb
|
||||
|
||||
# load the environment variables from .env file
|
||||
if os.path.exists(".env"):
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Logger Configs {{{ #
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run end-to-end evaluation on the benchmark"
|
||||
)
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--path_to_vm", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--headless", action="store_true", help="Run in headless machine"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--action_space", type=str, default="pyautogui", help="Action type"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="screenshot",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=0.0)
|
||||
parser.add_argument("--max_steps", type=int, default=15)
|
||||
|
||||
# agent config
|
||||
parser.add_argument(
|
||||
"--test_config_base_dir", type=str, default="evaluation_examples"
|
||||
)
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="o3")
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument(
|
||||
"--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
|
||||
)
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
|
||||
parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
||||
default='INFO', help="Set the logging level")
|
||||
# aws config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--provider_name", type=str, default="aws", choices=["aws", "virtualbox", "vmware", "docker", "azure"],
|
||||
help="Provider name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--client_password", type=str, default="", help="Client password"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_width", type=int, default=1920, help="Screen width"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_height", type=int, default=1080, help="Screen height"
|
||||
)
|
||||
|
||||
# agent S2 config
|
||||
|
||||
parser.add_argument("--model_provider", type=str, default="openai")
|
||||
parser.add_argument(
|
||||
"--model_url",
|
||||
type=str,
|
||||
default="",
|
||||
help="The URL of the main generation model API.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model_api_key",
|
||||
type=str,
|
||||
default="",
|
||||
help="The API key of the main generation model.",
|
||||
)
|
||||
parser.add_argument("--model_temperature", type=float, default=None,
|
||||
help="Temperature to fix the generation model at (e.g. o3 can only be run with 1.0)")
|
||||
|
||||
parser.add_argument("--ground_provider", type=str, required=True, help="The provider for the grounding model")
|
||||
parser.add_argument("--ground_url", type=str, required=True, help="The URL of the grounding model")
|
||||
parser.add_argument(
|
||||
"--ground_api_key",
|
||||
type=str,
|
||||
default="",
|
||||
help="The API key of the grounding model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--ground_model", type=str, required=True, help="The model name for the grounding model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--grounding_width",
|
||||
type=int,
|
||||
required=True,
|
||||
help="Width of screenshot image after processor rescaling",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--grounding_height",
|
||||
type=int,
|
||||
required=True,
|
||||
help="Height of screenshot image after processor rescaling",
|
||||
)
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
args = config() # Get command line arguments first
|
||||
|
||||
logger = logging.getLogger()
|
||||
log_level = getattr(logging, args.log_level.upper())
|
||||
logger.setLevel(log_level)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(
|
||||
os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
debug_handler = logging.FileHandler(
|
||||
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(log_level)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
# }}} Logger Configs #
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
class CustomDesktopEnv(DesktopEnv):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
logger.info("CustomDesktopEnv class initialized.")
|
||||
|
||||
def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
|
||||
|
||||
# Reset to certain task in OSWorld
|
||||
logger.info("Resetting environment...")
|
||||
logger.info("Switching task...")
|
||||
logger.info("Setting counters...")
|
||||
self._traj_no += 1
|
||||
self._step_no = 0
|
||||
self.action_history.clear()
|
||||
|
||||
for attempt in range(MAX_RETRIES):
|
||||
# Only revert to snapshot if environment has been used (step/setup)
|
||||
# This optimization is especially important for cloud providers like AWS
|
||||
# where unnecessary snapshot operations are costly and time-consuming
|
||||
|
||||
if task_config is not None:
|
||||
# Only consider task proxy requirement if proxy is enabled at system level
|
||||
task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
|
||||
if not self.enable_proxy and task_config.get("proxy", False):
|
||||
logger.info(
|
||||
"Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
|
||||
|
||||
if task_use_proxy != self.current_use_proxy:
|
||||
# keep because get_info_from_website depend on this
|
||||
self.current_use_proxy = task_use_proxy
|
||||
|
||||
if self.is_environment_used:
|
||||
logger.info("Environment has been used, reverting to snapshot {}...".format(self.snapshot_name))
|
||||
self._revert_to_snapshot()
|
||||
logger.info("Starting emulator...")
|
||||
self._start_emulator()
|
||||
logger.info("Emulator started.")
|
||||
# Reset the usage flag after reverting
|
||||
self.is_environment_used = False
|
||||
else:
|
||||
logger.info("Environment is clean, skipping snapshot revert (provider: {}).".format(self.provider_name))
|
||||
|
||||
if task_config is not None:
|
||||
if task_config.get("proxy", False) and self.enable_proxy:
|
||||
# If using proxy and proxy is enabled, set up the proxy configuration
|
||||
self.setup_controller._proxy_setup(self.client_password)
|
||||
self._set_task_info(task_config)
|
||||
self.setup_controller.reset_cache_dir(self.cache_dir)
|
||||
logger.info("Clearing browser cache and browsing data...")
|
||||
try:
|
||||
self.setup_controller._delete_all_browsing_data_chromium_setup()
|
||||
logger.info("Browser cache cleared successfully")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to clear browser cache: {e}")
|
||||
logger.info("Setting up environment...")
|
||||
success = self.setup_controller.setup(self.config,
|
||||
task_config.get("proxy", False) and self.enable_proxy)
|
||||
if success:
|
||||
# Mark environment as used when setup is successfully executed
|
||||
if self.config: # Only mark as used if there were actual setup operations
|
||||
self.is_environment_used = True
|
||||
break
|
||||
else:
|
||||
logger.error(
|
||||
"Environment setup failed, retrying (%d/%d)...",
|
||||
attempt + 1,
|
||||
MAX_RETRIES,
|
||||
)
|
||||
time.sleep(5)
|
||||
else:
|
||||
break
|
||||
|
||||
logger.info("Environment setup complete.")
|
||||
|
||||
# start soffice service for office tools
|
||||
self.setup_controller._launch_setup(
|
||||
'soffice --headless --accept="socket,host=localhost,port=2002;urp;" --norestore --nologo --nodefault', shell=True)
|
||||
time.sleep(5)
|
||||
|
||||
observation = self._get_obs()
|
||||
return observation
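# The setup loop above relies on Python's for/else: when task_config is None the
# explicit `break` in the else-branch exits after a single pass; otherwise setup is
# retried up to MAX_RETRIES times, sleeping 5 seconds between attempts.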
|
||||
|
||||
def step(self, action, pause=2):
|
||||
self._step_no += 1
|
||||
self.action_history.append(action)
|
||||
|
||||
# Mark environment as used when step is called
|
||||
self.is_environment_used = True
|
||||
|
||||
reward = 0 # todo: Define reward calculation for each example
|
||||
done = False # todo: Define episode termination condition for each example
|
||||
response = None
|
||||
info = {}
|
||||
logger.info(f"Step {self._step_no} in trajectory {self._traj_no} with action: {action}")
|
||||
# handle the special actions
|
||||
if action in ['WAIT', 'FAIL', 'DONE'] or (
|
||||
type(action) == dict and action['action_type'] in ['WAIT', 'FAIL', 'DONE']):
|
||||
if action == 'WAIT':
|
||||
time.sleep(pause)
|
||||
elif action == 'FAIL':
|
||||
done = True
|
||||
info = {"fail": True}
|
||||
elif action == 'DONE':
|
||||
done = True
|
||||
info = {"done": True}
|
||||
|
||||
if self.action_space == "computer_13":
|
||||
# the set of all possible actions defined in the action representation
|
||||
self.controller.execute_action(action)
|
||||
elif self.action_space == "pyautogui" or self.action_space == "claude_computer_use":
|
||||
if action in ['WAIT', 'FAIL', 'DONE']:
|
||||
self.controller.execute_action(action)
|
||||
else:
|
||||
# the set of all possible python commands insides `pyautogui`
|
||||
if type(action) == str:
|
||||
# Fix PyAutoGUI '<' character bug before execution
|
||||
fixed_command = _fix_pyautogui_less_than_bug(action)
|
||||
response = self.controller.execute_python_command(fixed_command)
|
||||
|
||||
elif type(action) == dict:
|
||||
# Fix PyAutoGUI '<' character bug before execution
|
||||
fixed_command = _fix_pyautogui_less_than_bug(action['command'])
|
||||
response = self.controller.execute_python_command(fixed_command)
|
||||
|
||||
time.sleep(pause)
|
||||
observation = self._get_obs()
|
||||
observation["action_response"] = response
|
||||
return observation, reward, done, info
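# Minimal usage sketch (assumes an already-reset env and a pyautogui command string):
#   obs, reward, done, info = env.step("import pyautogui; pyautogui.click(100, 200)")
#   obs["screenshot"] holds the new observation, obs["action_response"] the controller reply.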
|
||||
|
||||
|
||||
def distribute_tasks(test_all_meta: dict) -> List[tuple]:
|
||||
all_tasks = []
|
||||
for domain, examples in test_all_meta.items():
|
||||
for example_id in examples:
|
||||
all_tasks.append((domain, example_id))
|
||||
return all_tasks
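# Example (illustrative input/output, not from a real meta file):
#   distribute_tasks({"chrome": ["a", "b"], "gimp": ["c"]})
#   -> [("chrome", "a"), ("chrome", "b"), ("gimp", "c")]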
|
||||
|
||||
|
||||
def process_signal_handler(signum, frame, env_idx):
|
||||
"""Signal handler for child processes to gracefully shut down their environments."""
|
||||
logger.info(f"Process {env_idx + 1} received signal {signum}. Shutting down...")
|
||||
|
||||
# Get the active_environments from the caller's frame
|
||||
local_vars = frame.f_locals
|
||||
active_environments = local_vars.get('active_environments', [])
|
||||
|
||||
# Close environment in the current process context
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info(f"Process {env_idx + 1} closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Process {env_idx + 1} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Process {env_idx + 1} error closing environment: {e}")
|
||||
|
||||
logger.info(f"Process {env_idx + 1} shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: list):
|
||||
active_environments = []
|
||||
env = None
|
||||
try:
|
||||
from desktop_env.providers.aws.manager import IMAGE_ID_MAP
|
||||
REGION = args.region
|
||||
screen_size = (args.screen_width, args.screen_height)
|
||||
ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)])
|
||||
env = CustomDesktopEnv(
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
provider_name=args.provider_name,
|
||||
region=REGION,
|
||||
# snapshot_name=ami_id,
|
||||
screen_size=screen_size,
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
enable_proxy=False,
|
||||
client_password=args.client_password
|
||||
)
|
||||
active_environments.append(env)
|
||||
|
||||
# AgentS2 configuration
|
||||
engine_params = {
|
||||
"engine_type": args.model_provider,
|
||||
"model": args.model,
|
||||
"base_url": getattr(args, 'model_url', ''),
|
||||
"api_key": getattr(args, 'model_api_key', ''),
|
||||
"temperature": getattr(args, 'model_temperature', None),
|
||||
}
|
||||
|
||||
|
||||
engine_params_for_grounding = {
|
||||
"engine_type": args.ground_provider,
|
||||
"model": args.ground_model,
|
||||
"base_url": getattr(args, 'ground_url', ''),
|
||||
"api_key": getattr(args, 'ground_api_key', ''),
|
||||
"grounding_width": args.grounding_width,
|
||||
"grounding_height": args.grounding_height,
|
||||
}
|
||||
|
||||
# Create grounding agent
|
||||
grounding_agent = OSWorldACI(
|
||||
platform="linux",
|
||||
engine_params_for_generation=engine_params,
|
||||
engine_params_for_grounding=engine_params_for_grounding,
|
||||
width=args.screen_width,
|
||||
height=args.screen_height,
|
||||
)
|
||||
|
||||
# Create AgentS2 worker
|
||||
agent = AworldGUIAgent(
|
||||
engine_params,
|
||||
grounding_agent,
|
||||
platform="linux",
|
||||
)
|
||||
|
||||
logger.info(f"Process {current_process().name} started.")
|
||||
while True:
|
||||
try:
|
||||
item = task_queue.get(timeout=5)
|
||||
except Exception:
|
||||
break
|
||||
domain, example_id = item
|
||||
try:
|
||||
config_file = os.path.join(
|
||||
args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
|
||||
)
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
logger.info(f"[{current_process().name}][Domain]: {domain}")
|
||||
logger.info(f"[{current_process().name}][Example ID]: {example_id}")
|
||||
logger.info(f"[{current_process().name}][Instruction]: {example['instruction']}")
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
try:
|
||||
lib_run_single.run_single_example(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
args.max_steps,
|
||||
example["instruction"],
|
||||
args,
|
||||
example_result_dir,
|
||||
shared_scores,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
env.controller.end_recording(
|
||||
os.path.join(example_result_dir, "recording.mp4")
|
||||
)
|
||||
except Exception as rec_e:
|
||||
logger.error(f"Failed to end recording: {rec_e}")
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(
|
||||
json.dumps(
|
||||
{"Error": f"{domain}/{example_id} - {e}"}
|
||||
)
|
||||
)
|
||||
f.write("\n")
|
||||
except Exception as e:
|
||||
logger.error(f"Task-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
except Exception as e:
|
||||
logger.error(f"Process-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
logger.info(f"{current_process().name} cleaning up environment...")
|
||||
try:
|
||||
if env:
|
||||
env.close()
|
||||
logger.info(f"{current_process().name} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"{current_process().name} error during environment cleanup: {e}")
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""Handle termination signals (SIGINT, SIGTERM) to gracefully shutdown environments."""
|
||||
global is_terminating, active_environments, processes
|
||||
|
||||
# Avoid duplicate handling
|
||||
if is_terminating:
|
||||
return
|
||||
|
||||
is_terminating = True
|
||||
logger.info(f"Received signal {signum}. Gracefully shutting down...")
|
||||
|
||||
# Close all registered environments in the main process
|
||||
for env in active_environments:
|
||||
try:
|
||||
logger.info(f"Closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing environment: {e}")
|
||||
|
||||
# Send termination signal to all child processes first
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Sending termination signal to process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending termination signal to process: {e}")
|
||||
|
||||
# Allow a short time for processes to handle their own cleanup
|
||||
time.sleep(1)
|
||||
|
||||
# Forcefully terminate any processes that didn't exit
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Forcefully terminating process {p.name}...")
|
||||
import signal as sig
|
||||
os.kill(p.pid, sig.SIGKILL)
|
||||
except Exception as e:
|
||||
logger.error(f"Error forcefully terminating process: {e}")
|
||||
|
||||
logger.info("Shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
|
||||
global processes
|
||||
logger.info("Args: %s", args)
|
||||
all_tasks = distribute_tasks(test_all_meta)
|
||||
logger.info(f"Total tasks: {len(all_tasks)}")
|
||||
with Manager() as manager:
|
||||
shared_scores = manager.list()
|
||||
task_queue = manager.Queue()
|
||||
for item in all_tasks:
|
||||
task_queue.put(item)
|
||||
num_envs = args.num_envs
|
||||
processes = []
|
||||
for i in range(num_envs):
|
||||
p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-{i + 1}"
|
||||
)
|
||||
p.daemon = True
|
||||
p.start()
|
||||
processes.append(p)
|
||||
logger.info(f"Started process {p.name} with PID {p.pid}")
|
||||
try:
|
||||
while True:
|
||||
alive_count = 0
|
||||
for idx, p in enumerate(processes):
|
||||
if not p.is_alive():
|
||||
logger.warning(f"Process {p.name} died, restarting...")
|
||||
new_p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-Restart-{idx + 1}"
|
||||
)
|
||||
new_p.daemon = True
|
||||
new_p.start()
|
||||
processes[idx] = new_p
|
||||
logger.info(f"Restarted process {new_p.name} with PID {new_p.pid}")
|
||||
else:
|
||||
alive_count += 1
|
||||
if task_queue.empty():
|
||||
logger.info("All tasks finished.")
|
||||
break
|
||||
if alive_count == 0:
|
||||
logger.error("All processes died, exiting.")
|
||||
break
|
||||
time.sleep(5)
|
||||
for p in processes:
|
||||
p.join()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt. Initiating graceful shutdown...")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True)
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name} due to error...")
|
||||
p.terminate()
|
||||
except Exception as term_e:
|
||||
logger.error(f"Error terminating process {p.name}: {term_e}")
|
||||
raise
|
||||
scores = list(shared_scores)
|
||||
logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
|
||||
|
||||
|
||||
def get_unfinished(
|
||||
action_space, use_model, observation_type, result_dir, total_file_json
|
||||
):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
|
||||
if not os.path.exists(target_dir):
|
||||
return total_file_json
|
||||
|
||||
finished = {}
|
||||
for domain in os.listdir(target_dir):
|
||||
finished[domain] = []
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
if example_id == "onboard":
|
||||
continue
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" not in os.listdir(example_path):
|
||||
# empty all files under example_id
|
||||
for file in os.listdir(example_path):
|
||||
os.remove(os.path.join(example_path, file))
|
||||
else:
|
||||
finished[domain].append(example_id)
|
||||
|
||||
if not finished:
|
||||
return total_file_json
|
||||
|
||||
for domain, examples in finished.items():
|
||||
if domain in total_file_json:
|
||||
total_file_json[domain] = [
|
||||
x for x in total_file_json[domain] if x not in examples
|
||||
]
|
||||
|
||||
return total_file_json
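# Note: get_unfinished mutates and returns total_file_json, dropping every example
# that already has a result.txt; example directories without a result.txt are wiped
# so those runs restart from scratch.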
|
||||
|
||||
|
||||
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
if not os.path.exists(target_dir):
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
|
||||
all_result = []
|
||||
|
||||
for domain in os.listdir(target_dir):
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" in os.listdir(example_path):
|
||||
# read the recorded score for this finished example
|
||||
try:
|
||||
all_result.append(
|
||||
float(
|
||||
open(
|
||||
os.path.join(example_path, "result.txt"), "r"
|
||||
).read()
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
all_result.append(0.0)
|
||||
|
||||
if not all_result:
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
else:
|
||||
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
|
||||
return all_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
# Register signal handlers for graceful termination
|
||||
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal
|
||||
|
||||
try:
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test(args, test_file_list)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt.")
|
||||
# Signal handler will take care of cleanup
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main process: {e}", exc_info=True)
|
||||
# Also trigger cleanup for unhandled exceptions
|
||||
signal_handler(signal.SIGTERM, None)
|
||||
finally:
|
||||
# Final cleanup in case any environments or processes remain
|
||||
logger.info("Main process final cleanup...")
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info(f"Closing environment in final cleanup...")
|
||||
env.close()
|
||||
logger.info(f"Environment closed successfully in final cleanup")
|
||||
except Exception as e:
|
||||
logger.error(f"Error during final environment cleanup: {e}")
|
||||
|
||||
# First try gentle termination
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error terminating process: {e}")
|
||||
|
||||
# Wait a moment for processes to terminate
|
||||
time.sleep(1)
|
||||
|
||||
# Then force kill if needed
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Force killing process {p.name}...")
|
||||
os.kill(p.pid, signal.SIGKILL)
|
||||
logger.info(f"Process {p.name} force killed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error force killing process: {e}")
|
||||
568
run_multienv_mano.py
Normal file
@@ -0,0 +1,568 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import time
|
||||
from typing import List
|
||||
from multiprocessing import Process, Manager, Queue
|
||||
from multiprocessing import current_process
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import DesktopEnv
|
||||
from mm_agents.mano_agent import ManoAgent
|
||||
|
||||
|
||||
# Global variables for signal handling
|
||||
active_environments = []
|
||||
processes = []
|
||||
is_terminating = False
|
||||
|
||||
# load the environment variables from .env file
|
||||
if os.path.exists(".env"):
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
# Logger Configs {{{ #
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run end-to-end evaluation on the benchmark"
|
||||
)
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--path_to_vm", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--headless", action="store_true", help="Run in headless machine"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--action_space", type=str, default="pyautogui", help="Action type"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="screenshot",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=3.0)
|
||||
parser.add_argument("--max_steps", type=int, default=15)
|
||||
|
||||
# evaluation config
|
||||
parser.add_argument(
|
||||
"--test_config_base_dir", type=str, default="evaluation_examples"
|
||||
)
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="mano")
|
||||
parser.add_argument("--model_type", type=str, default="qwen25vl")
|
||||
parser.add_argument("--infer_mode", type=str, default="qwen25vl_normal")
|
||||
parser.add_argument("--prompt_style", type=str, default="qwen25vl_normal")
|
||||
parser.add_argument("--input_swap", action="store_true", help="Use copy and paste to type content")
|
||||
parser.add_argument("--language", type=str, default="Chinese")
|
||||
parser.add_argument("--max_pixels", type=float, default=16384*28*28)
|
||||
parser.add_argument("--min_pixels", type=float, default=100*28*28)
|
||||
parser.add_argument("--temperature", type=float, default=1.0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--top_k", type=int, default=-1)
|
||||
parser.add_argument("--history_n", type=int, default=5)
|
||||
parser.add_argument("--callusr_tolerance", type=int, default=3)
|
||||
parser.add_argument("--max_tokens", type=int, default=1000)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
|
||||
parser.add_argument("--max_trajectory_length", type=int, default=None, help="The max number of trajectory steps.")
|
||||
parser.add_argument("--max_image_history_length", type=int, default=5, help="The max number of images in the history.")
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument(
|
||||
"--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
|
||||
)
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
parser.add_argument("--num_envs", type=int, default=1, help="Number of environments to run in parallel")
|
||||
parser.add_argument("--log_level", type=str, choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'],
|
||||
default='INFO', help="Set the logging level")
|
||||
# aws config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--provider_name", type=str, default="aws", choices=["aws", "virtualbox", "vmware", "docker", "azure"], help="Provider name"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--client_password", type=str, default="", help="Client password"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_width", type=int, default=1920, help="Screen width"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_height", type=int, default=1080, help="Screen height"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
args = config() # Get command line arguments first
|
||||
|
||||
logger = logging.getLogger()
|
||||
log_level = getattr(logging, args.log_level.upper())
|
||||
logger.setLevel(log_level)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(
|
||||
os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
debug_handler = logging.FileHandler(
|
||||
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(log_level)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
# }}} Logger Configs #
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
def distribute_tasks(test_all_meta: dict) -> List[tuple]:
|
||||
all_tasks = []
|
||||
for domain, examples in test_all_meta.items():
|
||||
for example_id in examples:
|
||||
all_tasks.append((domain, example_id))
|
||||
return all_tasks
|
||||
|
||||
|
||||
def process_signal_handler(signum, frame, env_idx):
|
||||
"""Signal handler for child processes to gracefully shut down their environments."""
|
||||
logger.info(f"Process {env_idx + 1} received signal {signum}. Shutting down...")
|
||||
|
||||
# Get the active_environments from the caller's frame
|
||||
local_vars = frame.f_locals
|
||||
active_environments = local_vars.get('active_environments', [])
|
||||
|
||||
# Close environment in the current process context
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info(f"Process {env_idx + 1} closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Process {env_idx + 1} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Process {env_idx + 1} error closing environment: {e}")
|
||||
|
||||
logger.info(f"Process {env_idx + 1} shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: list):
|
||||
active_environments = []
|
||||
env = None
|
||||
try:
|
||||
#from desktop_env.providers.aws.manager import IMAGE_ID_MAP
|
||||
#REGION = args.region
|
||||
screen_size = (args.screen_width, args.screen_height)
|
||||
#ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)])
|
||||
env = DesktopEnv(
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
provider_name=args.provider_name,
|
||||
#region=REGION,
|
||||
#snapshot_name=ami_id,
|
||||
screen_size=screen_size,
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type in ["a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
#enable_proxy=True,
|
||||
#client_password=args.client_password
|
||||
)
|
||||
|
||||
|
||||
active_environments.append(env)
|
||||
args.max_trajectory_length = args.max_steps
|
||||
if args.infer_mode == "qwen25vl_normal":
|
||||
runtime_conf: dict = {
|
||||
"infer_mode": "qwen25vl_normal",
|
||||
"prompt_style": "qwen25vl_normal",
|
||||
"input_swap": False, #True,
|
||||
"language": "Chinese",
|
||||
"history_n": 3,
|
||||
"max_pixels": 16384*28*28,
|
||||
"min_pixels": 100*28*28,
|
||||
"callusr_tolerance": 3,
|
||||
"temperature": 0.0,
|
||||
"top_k": -1,
|
||||
"top_p": 0.9,
|
||||
"max_tokens": 1000
|
||||
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Unknown infer_mode: {args.infer_mode}")
|
||||
|
||||
agent = ManoAgent(
|
||||
model=args.model,
|
||||
action_space=args.action_space,
|
||||
observation_type=args.observation_type,
|
||||
max_trajectory_length=args.max_trajectory_length,
|
||||
model_type=args.model_type,
|
||||
runtime_conf=runtime_conf,
|
||||
)
|
||||
|
||||
logger.info(f"Process {current_process().name} started.")
|
||||
while True:
|
||||
try:
|
||||
item = task_queue.get(timeout=5)
|
||||
except Exception:
|
||||
break
|
||||
domain, example_id = item
|
||||
try:
|
||||
config_file = os.path.join(
|
||||
args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
|
||||
)
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
logger.info(f"[{current_process().name}][Domain]: {domain}")
|
||||
logger.info(f"[{current_process().name}][Example ID]: {example_id}")
|
||||
logger.info(f"[{current_process().name}][Instruction]: {example['instruction']}")
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
try:
|
||||
lib_run_single.run_single_example_mano(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
args.max_steps,
|
||||
example["instruction"],
|
||||
args,
|
||||
example_result_dir,
|
||||
shared_scores,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
env.controller.end_recording(
|
||||
os.path.join(example_result_dir, "recording.mp4")
|
||||
)
|
||||
except Exception as rec_e:
|
||||
logger.error(f"Failed to end recording: {rec_e}")
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(
|
||||
json.dumps(
|
||||
{"Error": f"{domain}/{example_id} - {e}"}
|
||||
)
|
||||
)
|
||||
f.write("\n")
|
||||
except Exception as e:
|
||||
logger.error(f"Task-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
except Exception as e:
|
||||
logger.error(f"Process-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
logger.info(f"{current_process().name} cleaning up environment...")
|
||||
try:
|
||||
if env:
|
||||
env.close()
|
||||
logger.info(f"{current_process().name} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"{current_process().name} error during environment cleanup: {e}")
|
||||
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""Handle termination signals (SIGINT, SIGTERM) to gracefully shutdown environments."""
|
||||
global is_terminating, active_environments, processes
|
||||
|
||||
# Avoid duplicate handling
|
||||
if is_terminating:
|
||||
return
|
||||
|
||||
is_terminating = True
|
||||
logger.info(f"Received signal {signum}. Gracefully shutting down...")
|
||||
|
||||
# Close all registered environments in the main process
|
||||
for env in active_environments:
|
||||
try:
|
||||
logger.info(f"Closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing environment: {e}")
|
||||
|
||||
# Send termination signal to all child processes first
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Sending termination signal to process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending termination signal to process: {e}")
|
||||
|
||||
# Allow a short time for processes to handle their own cleanup
|
||||
time.sleep(1)
|
||||
|
||||
# Forcefully terminate any processes that didn't exit
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Forcefully terminating process {p.name}...")
|
||||
import signal as sig
|
||||
os.kill(p.pid, sig.SIGKILL)
|
||||
except Exception as e:
|
||||
logger.error(f"Error forcefully terminating process: {e}")
|
||||
|
||||
logger.info("Shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
|
||||
global processes
|
||||
logger.info("Args: %s", args)
|
||||
all_tasks = distribute_tasks(test_all_meta)
|
||||
logger.info(f"Total tasks: {len(all_tasks)}")
|
||||
with Manager() as manager:
|
||||
shared_scores = manager.list()
|
||||
task_queue = manager.Queue()
|
||||
for item in all_tasks:
|
||||
task_queue.put(item)
|
||||
num_envs = args.num_envs
|
||||
processes = []
|
||||
for i in range(num_envs):
|
||||
p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-{i+1}"
|
||||
)
|
||||
p.daemon = True
|
||||
p.start()
|
||||
processes.append(p)
|
||||
logger.info(f"Started process {p.name} with PID {p.pid}")
|
||||
try:
|
||||
while True:
|
||||
alive_count = 0
|
||||
for idx, p in enumerate(processes):
|
||||
if not p.is_alive():
|
||||
logger.warning(f"Process {p.name} died, restarting...")
|
||||
new_p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-Restart-{idx+1}"
|
||||
)
|
||||
new_p.daemon = True
|
||||
new_p.start()
|
||||
processes[idx] = new_p
|
||||
logger.info(f"Restarted process {new_p.name} with PID {new_p.pid}")
|
||||
else:
|
||||
alive_count += 1
|
||||
if task_queue.empty():
|
||||
logger.info("All tasks finished.")
|
||||
break
|
||||
if alive_count == 0:
|
||||
logger.error("All processes died, exiting.")
|
||||
break
|
||||
time.sleep(5)
|
||||
for p in processes:
|
||||
p.join()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt. Initiating graceful shutdown...")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True)
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name} due to error...")
|
||||
p.terminate()
|
||||
except Exception as term_e:
|
||||
logger.error(f"Error terminating process {p.name}: {term_e}")
|
||||
raise
|
||||
scores = list(shared_scores)
|
||||
logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
|
||||
|
||||
|
||||
def get_unfinished(
|
||||
action_space, use_model, observation_type, result_dir, total_file_json
|
||||
):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
|
||||
if not os.path.exists(target_dir):
|
||||
return total_file_json
|
||||
|
||||
finished = {}
|
||||
for domain in os.listdir(target_dir):
|
||||
finished[domain] = []
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
if example_id == "onboard":
|
||||
continue
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" not in os.listdir(example_path):
|
||||
# empty all files under example_id
|
||||
for file in os.listdir(example_path):
|
||||
os.remove(os.path.join(example_path, file))
|
||||
else:
|
||||
finished[domain].append(example_id)
|
||||
|
||||
if not finished:
|
||||
return total_file_json
|
||||
|
||||
for domain, examples in finished.items():
|
||||
if domain in total_file_json:
|
||||
total_file_json[domain] = [
|
||||
x for x in total_file_json[domain] if x not in examples
|
||||
]
|
||||
|
||||
return total_file_json
|
||||
|
||||
|
||||
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
if not os.path.exists(target_dir):
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
|
||||
all_result = []
|
||||
|
||||
for domain in os.listdir(target_dir):
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" in os.listdir(example_path):
|
||||
# read the recorded score for this finished example
|
||||
try:
|
||||
all_result.append(
|
||||
float(
|
||||
open(
|
||||
os.path.join(example_path, "result.txt"), "r"
|
||||
).read()
|
||||
)
|
||||
)
|
||||
except Exception:
|
||||
all_result.append(0.0)
|
||||
|
||||
if not all_result:
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
else:
|
||||
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
|
||||
return all_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
# Register signal handlers for graceful termination
|
||||
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal
|
||||
|
||||
try:
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test(args, test_file_list)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt.")
|
||||
# Signal handler will take care of cleanup
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main process: {e}", exc_info=True)
|
||||
# Also trigger cleanup for unhandled exceptions
|
||||
signal_handler(signal.SIGTERM, None)
|
||||
finally:
|
||||
# Final cleanup in case any environments or processes remain
|
||||
logger.info("Main process final cleanup...")
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info(f"Closing environment in final cleanup...")
|
||||
env.close()
|
||||
logger.info(f"Environment closed successfully in final cleanup")
|
||||
except Exception as e:
|
||||
logger.error(f"Error during final environment cleanup: {e}")
|
||||
|
||||
# First try gentle termination
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error terminating process: {e}")
|
||||
|
||||
# Wait a moment for processes to terminate
|
||||
time.sleep(1)
|
||||
|
||||
# Then force kill if needed
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Force killing process {p.name}...")
|
||||
os.kill(p.pid, signal.SIGKILL)
|
||||
logger.info(f"Process {p.name} force killed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error force killing process: {e}")
|
||||
511
run_multienv_qwen3vl.py
Normal file
@@ -0,0 +1,511 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import time
|
||||
from typing import List
|
||||
from multiprocessing import Process, Manager
|
||||
from multiprocessing import current_process
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import DesktopEnv
|
||||
from mm_agents.qwen3vl_agent import Qwen3VLAgent
|
||||
|
||||
# Global variables for signal handling
|
||||
active_environments = []
|
||||
processes = []
|
||||
is_terminating = False
|
||||
|
||||
# load the environment variables from .env file
|
||||
if os.path.exists(".env"):
|
||||
from dotenv import load_dotenv
|
||||
load_dotenv()
|
||||
|
||||
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run end-to-end evaluation on the benchmark (Qwen3VL)"
|
||||
)
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--path_to_vm", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--headless", action="store_true", help="Run in headless machine"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--action_space", type=str, default="pyautogui", help="Action type"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="screenshot",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=0.0)
|
||||
parser.add_argument("--max_steps", type=int, default=15)
|
||||
|
||||
# agent config
|
||||
parser.add_argument("--max_trajectory_length", type=int, default=3)
|
||||
parser.add_argument(
|
||||
"--test_config_base_dir", type=str, default="evaluation_examples"
|
||||
)
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="qwen3-vl")
|
||||
parser.add_argument("--temperature", type=float, default=0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--max_tokens", type=int, default=1500)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--coord",
|
||||
type=str,
|
||||
choices=["absolute", "relative"],
|
||||
default="absolute",
|
||||
help="Coordinate system for agent outputs (absolute or relative)",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--add_thought_prefix",
|
||||
action="store_true",
|
||||
help="Add thought prefix to the response",
|
||||
)
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument(
|
||||
"--test_all_meta_path", type=str, default="evaluation_examples/test_nogdrive.json"
|
||||
)
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
parser.add_argument(
|
||||
"--num_envs", type=int, default=1, help="Number of environments to run in parallel"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log_level",
|
||||
type=str,
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
|
||||
default="INFO",
|
||||
help="Set the logging level",
|
||||
)
|
||||
|
||||
# provider config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--provider_name",
|
||||
type=str,
|
||||
default="docker",
|
||||
choices=["aws", "virtualbox", "vmware", "docker", "azure"],
|
||||
help="Provider name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--client_password", type=str, default="", help="Client password"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_width", type=int, default=1920, help="Screen width"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--screen_height", type=int, default=1080, help="Screen height"
|
||||
)
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
args = config() # Get command line arguments first
|
||||
|
||||
logger = logging.getLogger()
|
||||
log_level = getattr(logging, args.log_level.upper())
|
||||
logger.setLevel(log_level)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(
|
||||
os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
debug_handler = logging.FileHandler(
|
||||
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(log_level)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt=(
|
||||
"\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s "
|
||||
"\x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] "
|
||||
"\x1b[0m%(message)s"
|
||||
)
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
def distribute_tasks(test_all_meta: dict) -> List[tuple]:
|
||||
all_tasks = []
|
||||
for domain, examples in test_all_meta.items():
|
||||
for example_id in examples:
|
||||
all_tasks.append((domain, example_id))
|
||||
return all_tasks
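    # Example of the flattening this performs:
    #   {"chrome": ["id-1", "id-2"], "gimp": ["id-3"]}
    #   -> [("chrome", "id-1"), ("chrome", "id-2"), ("gimp", "id-3")]
    # Each (domain, example_id) tuple becomes one unit of work on the shared queue.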
|
||||
|
||||
|
||||
def run_env_tasks(task_queue, args: argparse.Namespace, shared_scores: list):
|
||||
active_environments = []
|
||||
env = None
|
||||
try:
|
||||
REGION = args.region
|
||||
screen_size = (args.screen_width, args.screen_height)
|
||||
snapshot_name = "init_state"
|
||||
if args.provider_name == "aws":
|
||||
from desktop_env.providers.aws.manager import IMAGE_ID_MAP
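        # IMAGE_ID_MAP maps region -> screen size -> AMI id; if no image exists for
        # the requested resolution, fall back to the region's 1920x1080 AMI.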
|
||||
ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)])
|
||||
snapshot_name = ami_id
|
||||
env = DesktopEnv(
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
provider_name=args.provider_name,
|
||||
region=REGION,
|
||||
snapshot_name=snapshot_name,
|
||||
screen_size=screen_size,
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type in [
|
||||
"a11y_tree",
|
||||
"screenshot_a11y_tree",
|
||||
"som",
|
||||
],
|
||||
enable_proxy=True,
|
||||
client_password=args.client_password,
|
||||
)
|
||||
active_environments.append(env)
|
||||
agent = Qwen3VLAgent(
|
||||
model=args.model,
|
||||
max_tokens=args.max_tokens,
|
||||
top_p=args.top_p,
|
||||
temperature=args.temperature,
|
||||
action_space=args.action_space,
|
||||
coordinate_type=args.coord,
|
||||
add_thought_prefix=args.add_thought_prefix,
|
||||
)
|
||||
logger.info(f"Process {current_process().name} started.")
|
||||
while True:
|
||||
try:
|
||||
item = task_queue.get(timeout=5)
|
||||
except Exception:
|
||||
break
|
||||
domain, example_id = item
|
||||
try:
|
||||
config_file = os.path.join(
|
||||
args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
|
||||
)
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
logger.info(f"[{current_process().name}][Domain]: {domain}")
|
||||
logger.info(f"[{current_process().name}][Example ID]: {example_id}")
|
||||
logger.info(f"[{current_process().name}][Instruction]: {example['instruction']}")
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
try:
|
||||
lib_run_single.run_single_example(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
args.max_steps,
|
||||
example["instruction"],
|
||||
args,
|
||||
example_result_dir,
|
||||
shared_scores,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}")
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
env.controller.end_recording(
|
||||
os.path.join(example_result_dir, "recording.mp4")
|
||||
)
|
||||
except Exception as rec_e:
|
||||
logger.error(f"Failed to end recording: {rec_e}")
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"}))
|
||||
f.write("\n")
|
||||
except Exception as e:
|
||||
logger.error(f"Task-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
except Exception as e:
|
||||
logger.error(f"Process-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
logger.info(f"{current_process().name} cleaning up environment...")
|
||||
try:
|
||||
if env:
|
||||
env.close()
|
||||
logger.info(f"{current_process().name} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"{current_process().name} error during environment cleanup: {e}")
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
global is_terminating, active_environments, processes
|
||||
if is_terminating:
|
||||
return
|
||||
is_terminating = True
|
||||
logger.info(f"Received signal {signum}. Gracefully shutting down...")
|
||||
for env in active_environments:
|
||||
try:
|
||||
logger.info(f"Closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing environment: {e}")
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Sending termination signal to process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending termination signal to process: {e}")
|
||||
time.sleep(1)
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Forcefully terminating process {p.name}...")
|
||||
import signal as sig
|
||||
os.kill(p.pid, sig.SIGKILL)
|
||||
except Exception as e:
|
||||
logger.error(f"Error forcefully terminating process: {e}")
|
||||
logger.info("Shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
|
||||
global processes
|
||||
logger.info("Args: %s", args)
|
||||
all_tasks = distribute_tasks(test_all_meta)
|
||||
logger.info(f"Total tasks: {len(all_tasks)}")
|
||||
with Manager() as manager:
|
||||
shared_scores = manager.list()
|
||||
task_queue = manager.Queue()
|
||||
for item in all_tasks:
|
||||
task_queue.put(item)
|
||||
num_envs = args.num_envs
|
||||
processes = []
|
||||
for i in range(num_envs):
|
||||
p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-{i+1}"
|
||||
)
|
||||
p.daemon = True
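            # daemon=True: if the main process dies, workers are terminated with it
            # instead of being left orphaned; normal completion still joins them below.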
|
||||
p.start()
|
||||
processes.append(p)
|
||||
logger.info(f"Started process {p.name} with PID {p.pid}")
|
||||
try:
|
||||
while True:
|
||||
alive_count = 0
|
||||
for idx, p in enumerate(processes):
|
||||
if not p.is_alive():
|
||||
logger.warning(f"Process {p.name} died, restarting...")
|
||||
new_p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-Restart-{idx+1}"
|
||||
)
|
||||
new_p.daemon = True
|
||||
new_p.start()
|
||||
processes[idx] = new_p
|
||||
logger.info(f"Restarted process {new_p.name} with PID {new_p.pid}")
|
||||
else:
|
||||
alive_count += 1
|
||||
if task_queue.empty():
|
||||
logger.info("All tasks finished.")
|
||||
break
|
||||
if alive_count == 0:
|
||||
logger.error("All processes died, exiting.")
|
||||
break
|
||||
time.sleep(5)
|
||||
for p in processes:
|
||||
p.join()
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt. Initiating graceful shutdown...")
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error while waiting for processes: {e}", exc_info=True)
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name} due to error...")
|
||||
p.terminate()
|
||||
except Exception as term_e:
|
||||
logger.error(f"Error terminating process {p.name}: {term_e}")
|
||||
raise
|
||||
scores = list(shared_scores)
|
||||
logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
|
||||
|
||||
|
||||
def get_unfinished(
|
||||
action_space, use_model, observation_type, result_dir, total_file_json
|
||||
):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
|
||||
if not os.path.exists(target_dir):
|
||||
return total_file_json
|
||||
|
||||
finished = {}
|
||||
for domain in os.listdir(target_dir):
|
||||
finished[domain] = []
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
if example_id == "onboard":
|
||||
continue
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" not in os.listdir(example_path):
|
||||
for file in os.listdir(example_path):
|
||||
os.remove(os.path.join(example_path, file))
|
||||
else:
|
||||
finished[domain].append(example_id)
|
||||
|
||||
if not finished:
|
||||
return total_file_json
|
||||
|
||||
for domain, examples in finished.items():
|
||||
if domain in total_file_json:
|
||||
total_file_json[domain] = [
|
||||
x for x in total_file_json[domain] if x not in examples
|
||||
]
|
||||
|
||||
return total_file_json
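    # Resume semantics: an example directory that already contains result.txt is
    # treated as finished and dropped from the to-do list; one without result.txt
    # is wiped so the example can be re-run cleanly. For instance, if
    # results/pyautogui/screenshot/<model>/chrome/<id>/result.txt exists, that
    # ("chrome", "<id>") pair is not scheduled again.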
|
||||
|
||||
|
||||
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
if not os.path.exists(target_dir):
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
|
||||
all_result = []
|
||||
|
||||
for domain in os.listdir(target_dir):
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" in os.listdir(example_path):
|
||||
try:
|
||||
                            with open(
                                os.path.join(example_path, "result.txt"), "r"
                            ) as f:
                                all_result.append(float(f.read()))
|
||||
except Exception:
|
||||
all_result.append(0.0)
|
||||
|
||||
if not all_result:
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
else:
|
||||
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
|
||||
return all_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
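    # Setting TOKENIZERS_PARALLELISM=false silences the HuggingFace tokenizers
    # fork warning and avoids its parallelism deadlocks once worker processes
    # are spawned.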
|
||||
signal.signal(signal.SIGINT, signal_handler)
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
try:
|
||||
args = config()
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test(args, test_file_list)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt.")
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main process: {e}", exc_info=True)
|
||||
signal_handler(signal.SIGTERM, None)
|
||||
finally:
|
||||
logger.info("Main process final cleanup...")
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info("Closing environment in final cleanup...")
|
||||
env.close()
|
||||
logger.info("Environment closed successfully in final cleanup")
|
||||
except Exception as e:
|
||||
logger.error(f"Error during final environment cleanup: {e}")
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error terminating process: {e}")
|
||||
time.sleep(1)
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Force killing process {p.name}...")
|
||||
os.kill(p.pid, signal.SIGKILL)
|
||||
logger.info(f"Process {p.name} force killed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error force killing process: {e}")
|
||||
|
||||
|
||||
560
run_multienv_uipath.py
Normal file
@@ -0,0 +1,560 @@
|
||||
from __future__ import annotations
|
||||
import argparse
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import signal
|
||||
import time
|
||||
from typing import List
|
||||
from multiprocessing import Process, Manager
|
||||
from multiprocessing import current_process
|
||||
import lib_run_single
|
||||
from desktop_env.desktop_env import DesktopEnv
|
||||
from mm_agents.uipath_agent import UipathBaseAgent
|
||||
from queue import Queue
|
||||
|
||||
# Global variables for signal handling
|
||||
active_environments = []
|
||||
processes = []
|
||||
is_terminating = False
|
||||
|
||||
# import wandb
|
||||
|
||||
# load the environment variables from .env file
|
||||
if os.path.exists(".env"):
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
|
||||
# Logger Configs {{{ #
|
||||
def config() -> argparse.Namespace:
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Run end-to-end evaluation on the benchmark"
|
||||
)
|
||||
|
||||
# environment config
|
||||
parser.add_argument("--uipath_model_name", type=str, default="gpt-5-2025-08-07")
|
||||
|
||||
parser.add_argument("--path_to_vm", type=str, default=None)
|
||||
parser.add_argument(
|
||||
"--headless", action="store_true", help="Run in headless machine"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--action_space", type=str, default="pyautogui", help="Action type"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--observation_type",
|
||||
choices=["screenshot", "a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
default="screenshot",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--sleep_after_execution", type=float, default=0.0)
|
||||
parser.add_argument("--max_steps", type=int, default=15)
|
||||
|
||||
# agent config
|
||||
parser.add_argument("--max_trajectory_length", type=int, default=3)
|
||||
parser.add_argument(
|
||||
"--test_config_base_dir", type=str, default="evaluation_examples"
|
||||
)
|
||||
|
||||
# lm config
|
||||
parser.add_argument("--model", type=str, default="gpt-4o")
|
||||
parser.add_argument("--temperature", type=float, default=1.0)
|
||||
parser.add_argument("--top_p", type=float, default=0.9)
|
||||
parser.add_argument("--max_tokens", type=int, default=1500)
|
||||
parser.add_argument("--stop_token", type=str, default=None)
|
||||
|
||||
# example config
|
||||
parser.add_argument("--domain", type=str, default="all")
|
||||
parser.add_argument(
|
||||
"--test_all_meta_path", type=str, default="evaluation_examples/test_all.json"
|
||||
)
|
||||
|
||||
# logging related
|
||||
parser.add_argument("--result_dir", type=str, default="./results")
|
||||
parser.add_argument(
|
||||
"--num_envs",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of environments to run in parallel",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--log_level",
|
||||
type=str,
|
||||
choices=["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"],
|
||||
default="INFO",
|
||||
help="Set the logging level",
|
||||
)
|
||||
|
||||
# aws config
|
||||
parser.add_argument(
|
||||
"--region", type=str, default="us-east-1", help="AWS region for the VM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--provider_name",
|
||||
type=str,
|
||||
default="docker",
|
||||
choices=["aws", "virtualbox", "vmware", "docker", "azure"],
|
||||
help="Provider name",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--client_password", type=str, default="", help="Client password"
|
||||
)
|
||||
parser.add_argument("--screen_width", type=int, default=1920, help="Screen width")
|
||||
parser.add_argument("--screen_height", type=int, default=1080, help="Screen height")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
args = config() # Get command line arguments first
|
||||
|
||||
logger = logging.getLogger()
|
||||
log_level = getattr(logging, args.log_level.upper())
|
||||
logger.setLevel(log_level)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(
|
||||
os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
debug_handler = logging.FileHandler(
|
||||
os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8"
|
||||
)
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(log_level)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s"
|
||||
)
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
# }}} Logger Configs #
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
|
||||
def distribute_tasks(test_all_meta: dict) -> List[tuple]:
|
||||
all_tasks = []
|
||||
for domain, examples in test_all_meta.items():
|
||||
for example_id in examples:
|
||||
all_tasks.append((domain, example_id))
|
||||
return all_tasks
|
||||
|
||||
|
||||
def process_signal_handler(signum, frame, env_idx):
|
||||
"""Signal handler for child processes to gracefully shut down their environments."""
|
||||
logger.info(f"Process {env_idx + 1} received signal {signum}. Shutting down...")
|
||||
|
||||
# Get the active_environments from the caller's frame
|
||||
local_vars = frame.f_locals
|
||||
active_environments = local_vars.get("active_environments", [])
|
||||
|
||||
# Close environment in the current process context
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info(f"Process {env_idx + 1} closing environment...")
|
||||
env.close()
|
||||
logger.info(f"Process {env_idx + 1} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Process {env_idx + 1} error closing environment: {e}")
|
||||
|
||||
logger.info(f"Process {env_idx + 1} shutdown complete. Exiting.")
|
||||
sys.exit(0)
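    # Note: this handler digs active_environments out of the interrupted frame's
    # locals, so it only does useful work if it is registered from a frame (such as
    # run_env_tasks) where that list is a local variable; it does not appear to be
    # registered anywhere in this file.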
|
||||
|
||||
|
||||
def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: list):
|
||||
active_environments = []
|
||||
env = None
|
||||
try:
|
||||
# from desktop_env.providers.aws.manager import IMAGE_ID_MAP
|
||||
# REGION = args.region
|
||||
screen_size = (args.screen_width, args.screen_height)
|
||||
# ami_id = IMAGE_ID_MAP[REGION].get(screen_size, IMAGE_ID_MAP[REGION][(1920, 1080)])
|
||||
env = DesktopEnv(
|
||||
path_to_vm=args.path_to_vm,
|
||||
action_space=args.action_space,
|
||||
provider_name=args.provider_name,
|
||||
# region=REGION,
|
||||
# snapshot_name=ami_id,
|
||||
screen_size=screen_size,
|
||||
headless=args.headless,
|
||||
os_type="Ubuntu",
|
||||
require_a11y_tree=args.observation_type
|
||||
in ["a11y_tree", "screenshot_a11y_tree", "som"],
|
||||
enable_proxy=False,
|
||||
client_password=args.client_password,
|
||||
)
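        # The AWS-specific region/snapshot arguments are left commented out,
        # presumably because the default provider here is docker; they would likely
        # need to be restored when running with provider_name="aws".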
|
||||
active_environments.append(env)
|
||||
agent = UipathBaseAgent(
|
||||
model=args.model,
|
||||
action_space=args.action_space,
|
||||
observation_type=args.observation_type,
|
||||
client_password=args.client_password,
|
||||
)
|
||||
|
||||
logger.info(f"Process {current_process().name} started.")
|
||||
while True:
|
||||
try:
|
||||
item = task_queue.get(timeout=5)
|
||||
except Exception:
|
||||
break
|
||||
domain, example_id = item
|
||||
try:
|
||||
config_file = os.path.join(
|
||||
args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
|
||||
)
|
||||
with open(config_file, "r", encoding="utf-8") as f:
|
||||
example = json.load(f)
|
||||
logger.info(f"[{current_process().name}][Domain]: {domain}")
|
||||
logger.info(f"[{current_process().name}][Example ID]: {example_id}")
|
||||
logger.info(
|
||||
f"[{current_process().name}][Instruction]: {example['instruction']}"
|
||||
)
|
||||
example_result_dir = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
domain,
|
||||
example_id,
|
||||
)
|
||||
os.makedirs(example_result_dir, exist_ok=True)
|
||||
try:
|
||||
lib_run_single.run_single_example_uipath(
|
||||
agent,
|
||||
env,
|
||||
example,
|
||||
args.max_steps,
|
||||
example["instruction"],
|
||||
args,
|
||||
example_result_dir,
|
||||
shared_scores,
|
||||
)
|
||||
except Exception as e:
|
||||
import traceback
|
||||
|
||||
logger.error(
|
||||
f"Exception in {current_process().name} {domain}/{example_id}: {e}"
|
||||
)
|
||||
logger.error(traceback.format_exc())
|
||||
try:
|
||||
env.controller.end_recording(
|
||||
os.path.join(example_result_dir, "recording.mp4")
|
||||
)
|
||||
except Exception as rec_e:
|
||||
logger.error(f"Failed to end recording: {rec_e}")
|
||||
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
|
||||
f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"}))
|
||||
f.write("\n")
|
||||
except Exception as e:
|
||||
logger.error(f"Task-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
|
||||
logger.error(traceback.format_exc())
|
||||
except Exception as e:
|
||||
logger.error(f"Process-level error in {current_process().name}: {e}")
|
||||
import traceback
|
||||
|
||||
logger.error(traceback.format_exc())
|
||||
finally:
|
||||
logger.info(f"{current_process().name} cleaning up environment...")
|
||||
try:
|
||||
if env:
|
||||
env.close()
|
||||
logger.info(f"{current_process().name} environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"{current_process().name} error during environment cleanup: {e}"
|
||||
)
|
||||
|
||||
|
||||
def signal_handler(signum, frame):
|
||||
"""Handle termination signals (SIGINT, SIGTERM) to gracefully shutdown environments."""
|
||||
global is_terminating, active_environments, processes
|
||||
|
||||
# Avoid duplicate handling
|
||||
if is_terminating:
|
||||
return
|
||||
|
||||
is_terminating = True
|
||||
logger.info(f"Received signal {signum}. Gracefully shutting down...")
|
||||
|
||||
# Close all registered environments in the main process
|
||||
for env in active_environments:
|
||||
try:
|
||||
logger.info("Closing environment...")
|
||||
env.close()
|
||||
logger.info("Environment closed successfully")
|
||||
except Exception as e:
|
||||
logger.error(f"Error closing environment: {e}")
|
||||
|
||||
# Send termination signal to all child processes first
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Sending termination signal to process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error sending termination signal to process: {e}")
|
||||
|
||||
# Allow a short time for processes to handle their own cleanup
|
||||
time.sleep(1)
|
||||
|
||||
# Forcefully terminate any processes that didn't exit
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Forcefully terminating process {p.name}...")
|
||||
import signal as sig
|
||||
|
||||
os.kill(p.pid, sig.SIGKILL)
|
||||
except Exception as e:
|
||||
logger.error(f"Error forcefully terminating process: {e}")
|
||||
|
||||
logger.info("Shutdown complete. Exiting.")
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
def test(args: argparse.Namespace, test_all_meta: dict) -> None:
|
||||
global processes
|
||||
logger.info("Args: %s", args)
|
||||
all_tasks = distribute_tasks(test_all_meta)
|
||||
logger.info(f"Total tasks: {len(all_tasks)}")
|
||||
with Manager() as manager:
|
||||
shared_scores = manager.list()
|
||||
task_queue = manager.Queue()
|
||||
for item in all_tasks:
|
||||
task_queue.put(item)
|
||||
num_envs = args.num_envs
|
||||
processes = []
|
||||
for i in range(num_envs):
|
||||
p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-{i + 1}",
|
||||
)
|
||||
p.daemon = True
|
||||
p.start()
|
||||
processes.append(p)
|
||||
logger.info(f"Started process {p.name} with PID {p.pid}")
|
||||
try:
|
||||
while True:
|
||||
alive_count = 0
|
||||
for idx, p in enumerate(processes):
|
||||
if not p.is_alive():
|
||||
logger.warning(f"Process {p.name} died, restarting...")
|
||||
new_p = Process(
|
||||
target=run_env_tasks,
|
||||
args=(task_queue, args, shared_scores),
|
||||
name=f"EnvProcess-Restart-{idx + 1}",
|
||||
)
|
||||
new_p.daemon = True
|
||||
new_p.start()
|
||||
processes[idx] = new_p
|
||||
logger.info(
|
||||
f"Restarted process {new_p.name} with PID {new_p.pid}"
|
||||
)
|
||||
else:
|
||||
alive_count += 1
|
||||
if task_queue.empty():
|
||||
logger.info("All tasks finished.")
|
||||
break
|
||||
if alive_count == 0:
|
||||
logger.error("All processes died, exiting.")
|
||||
break
|
||||
time.sleep(5)
|
||||
for p in processes:
|
||||
p.join()
|
||||
except KeyboardInterrupt:
|
||||
logger.info(
|
||||
"Main process received KeyboardInterrupt. Initiating graceful shutdown..."
|
||||
)
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.error(
|
||||
f"Unexpected error while waiting for processes: {e}", exc_info=True
|
||||
)
|
||||
for p in processes:
|
||||
if p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name} due to error...")
|
||||
p.terminate()
|
||||
except Exception as term_e:
|
||||
logger.error(f"Error terminating process {p.name}: {term_e}")
|
||||
raise
|
||||
scores = list(shared_scores)
|
||||
logger.info(f"Average score: {sum(scores) / len(scores) if scores else 0}")
|
||||
|
||||
|
||||
def get_unfinished(
|
||||
action_space, use_model, observation_type, result_dir, total_file_json
|
||||
):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
|
||||
if not os.path.exists(target_dir):
|
||||
return total_file_json
|
||||
|
||||
finished = {}
|
||||
for domain in os.listdir(target_dir):
|
||||
finished[domain] = []
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
if example_id == "onboard":
|
||||
continue
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" not in os.listdir(example_path):
|
||||
# empty all files under example_id
|
||||
for file in os.listdir(example_path):
|
||||
os.remove(os.path.join(example_path, file))
|
||||
else:
|
||||
finished[domain].append(example_id)
|
||||
|
||||
if not finished:
|
||||
return total_file_json
|
||||
|
||||
for domain, examples in finished.items():
|
||||
if domain in total_file_json:
|
||||
total_file_json[domain] = [
|
||||
x for x in total_file_json[domain] if x not in examples
|
||||
]
|
||||
|
||||
return total_file_json
|
||||
|
||||
|
||||
def get_result(action_space, use_model, observation_type, result_dir, total_file_json):
|
||||
target_dir = os.path.join(result_dir, action_space, observation_type, use_model)
|
||||
if not os.path.exists(target_dir):
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
|
||||
all_result = []
|
||||
|
||||
for domain in os.listdir(target_dir):
|
||||
domain_path = os.path.join(target_dir, domain)
|
||||
if os.path.isdir(domain_path):
|
||||
for example_id in os.listdir(domain_path):
|
||||
example_path = os.path.join(domain_path, example_id)
|
||||
if os.path.isdir(example_path):
|
||||
if "result.txt" in os.listdir(example_path):
|
||||
# read the recorded score; unreadable or malformed files count as 0.0
|
||||
try:
|
||||
with open(
|
||||
os.path.join(example_path, "result.txt"), "r"
|
||||
) as f:
|
||||
all_result.append(float(f.read()))
|
||||
except (FileNotFoundError, ValueError, OSError):
|
||||
all_result.append(0.0)
|
||||
|
||||
if not all_result:
|
||||
print("New experiment, no result yet.")
|
||||
return None
|
||||
else:
|
||||
print("Current Success Rate:", sum(all_result) / len(all_result) * 100, "%")
|
||||
return all_result
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
####### The complete version of the list of examples #######
|
||||
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
||||
|
||||
# Register signal handlers for graceful termination
|
||||
signal.signal(signal.SIGINT, signal_handler) # Handle Ctrl+C
|
||||
signal.signal(signal.SIGTERM, signal_handler) # Handle termination signal
|
||||
|
||||
try:
|
||||
args = config()
|
||||
|
||||
# save args to json in result_dir/action_space/observation_type/model/args.json
|
||||
path_to_args = os.path.join(
|
||||
args.result_dir,
|
||||
args.action_space,
|
||||
args.observation_type,
|
||||
args.model,
|
||||
"args.json",
|
||||
)
|
||||
os.makedirs(os.path.dirname(path_to_args), exist_ok=True)
|
||||
with open(path_to_args, "w", encoding="utf-8") as f:
|
||||
json.dump(vars(args), f, indent=4)
|
||||
|
||||
with open(args.test_all_meta_path, "r", encoding="utf-8") as f:
|
||||
test_all_meta = json.load(f)
|
||||
|
||||
if args.domain != "all":
|
||||
test_all_meta = {args.domain: test_all_meta[args.domain]}
|
||||
|
||||
test_file_list = get_unfinished(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
left_info = ""
|
||||
for domain in test_file_list:
|
||||
left_info += f"{domain}: {len(test_file_list[domain])}\n"
|
||||
logger.info(f"Left tasks:\n{left_info}")
|
||||
|
||||
get_result(
|
||||
args.action_space,
|
||||
args.model,
|
||||
args.observation_type,
|
||||
args.result_dir,
|
||||
test_all_meta,
|
||||
)
|
||||
test(args, test_file_list)
|
||||
except KeyboardInterrupt:
|
||||
logger.info("Main process received KeyboardInterrupt.")
|
||||
# Signal handler will take care of cleanup
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error in main process: {e}", exc_info=True)
|
||||
# Also trigger cleanup for unhandled exceptions
|
||||
signal_handler(signal.SIGTERM, None)
|
||||
finally:
|
||||
# Final cleanup in case any environments or processes remain
|
||||
logger.info("Main process final cleanup...")
|
||||
for env in active_environments:
|
||||
if env is not None:
|
||||
try:
|
||||
logger.info("Closing environment in final cleanup...")
|
||||
env.close()
|
||||
logger.info("Environment closed successfully in final cleanup")
|
||||
except Exception as e:
|
||||
logger.error(f"Error during final environment cleanup: {e}")
|
||||
|
||||
# First try gentle termination
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Terminating process {p.name}...")
|
||||
p.terminate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error terminating process: {e}")
|
||||
|
||||
# Wait a moment for processes to terminate
|
||||
time.sleep(1)
|
||||
|
||||
# Then force kill if needed
|
||||
for p in processes:
|
||||
if p is not None and p.is_alive():
|
||||
try:
|
||||
logger.info(f"Force killing process {p.name}...")
|
||||
os.kill(p.pid, signal.SIGKILL)
|
||||
logger.info(f"Process {p.name} force killed")
|
||||
except Exception as e:
|
||||
logger.error(f"Error force killing process: {e}")
|
||||