Add AutoGLM-OS agent (#309)
* autoglm-os initialize * clean code * chore: use proxy for download setup * feat(autoglm-os): add parameter to toggle images * fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel * update * add client_password * update multienv * fix * fix prompt * fix prompt * fix prompt * fix sys prompt * feat: use proxy in file evaluator * fix client_password * fix note_prompt * fix autoglm agent cmd type * fix * revert: fix: use temporary directory for files pulled from the vm to prevent potential collision when running multiple instances of the same task in parallel reverts commit bab5473eea1de0e61b0e1d68b23ce324a5b0ee57 * feat(autoglm): setup tools * fix(autoglm): remove second time of get a11y tree * add osworld server restart * Revert "add osworld server restart" This reverts commit 7bd9d84122e246ce2a26de0e49c25494244c2b3d. * fix _launch_setup * fix autoglm agent tools & xml tree * fix desktop_env * fix bug for tool name capitalization * fix: always use proxy for setup download * add fail after exceeding max turns * fix(autoglm): avoid adding image to message when screenshot is empty * fix maximize_window * fix maximize_window * fix maximize_window * fix import browsertools module bug * fix task proxy config bug * restore setup * refactor desktop env * restore image in provider * restore file.py * refactor desktop_env * quick fix * refactor desktop_env.step * fix our env reset * add max truns constraint * clean run script * clean lib_run_single.py --------- Co-authored-by: hanyullai <hanyullai@outlook.com> Co-authored-by: JingBh <jingbohao@yeah.net>
This commit is contained in:
committed by
GitHub
parent
c833d03a4b
commit
aa05f6cc26
259
mm_agents/autoglm/prompt/grounding_agent.py
Normal file
259
mm_agents/autoglm/prompt/grounding_agent.py
Normal file
@@ -0,0 +1,259 @@
|
||||
import base64
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import xml.etree.ElementTree as ET
|
||||
from typing import Dict, List, Optional, Tuple
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
def agent_action(func):
    """Tag ``func`` as an agent action.

    Sets the attribute ``is_agent_action`` to ``True`` on the given function
    object and returns that same object, so it can be used as a decorator
    without wrapping or otherwise altering the callable.
    """
    setattr(func, "is_agent_action", True)
    return func
|
||||
|
||||
|
||||
# Template of the Python snippet used to switch to a window: it presses
# Escape, waits briefly, then uses ``wmctrl`` to activate the window and to
# add the maximized (vertical + horizontal) state to it.
# Every occurrence of the placeholder WINDOW_ID is meant to be substituted
# with the real window id before the snippet is executed.
# The snippet imports everything it uses (including ``time``, needed for the
# ``time.sleep`` call) so it does not depend on what the executor pre-imports.
switch_window_code = """import subprocess;
import time;
import pyautogui;
pyautogui.press('escape');
time.sleep(0.5);
subprocess.run(['wmctrl', '-ia', 'WINDOW_ID'])
subprocess.run(['wmctrl', '-ir', 'WINDOW_ID', '-b', 'add,maximized_vert,maximized_horz'])
print('Switch to WINDOW_ID')"""
|
||||
|
||||
# Shell prefix shared by the GNOME applications below: it exports the D-Bus
# session bus address before the application is started.
_DBUS_SESSION_EXPORT = 'export DBUS_SESSION_BUS_ADDRESS="unix:path=/run/user/1000/bus"'

# Lower-case application name -> shell command used to launch it.
launch_app_commands = {
    # --- web browser ---
    "chrome": "google-chrome --remote-debugging-port=1337",
    # --- file manager ---
    "files": "nautilus",
    # --- terminal ---
    "terminal": _DBUS_SESSION_EXPORT + " && gnome-terminal",
    # --- utilities ---
    "gedit": "gedit",
    # --- office suite ---
    "libreoffice writer": "libreoffice --writer",
    "libreoffice calc": "libreoffice --calc",
    "libreoffice impress": "libreoffice --impress",
    # --- system ---
    "settings": _DBUS_SESSION_EXPORT + " && gnome-control-center",
    # --- multimedia ---
    "vlc": "vlc",
    "gimp": "gimp",
    # --- IDE ---
    "vs code": "code",
    # --- e-mail ---
    "thunderbird": "thunderbird",
}
|
||||
|
||||
|
||||
class GroundingAgent:
    # Translates high-level agent actions into executable payloads.
    #
    # Every public action is a classmethod tagged with ``@agent_action``. An
    # action returns one of:
    #   * a string of Python source (pyautogui / subprocess calls followed by
    #     a ``print`` that reports the outcome),
    #   * a sentinel string: "WAIT", "DONE" or "FAIL",
    #   * for ``open_app`` with a known application, a dict with the keys
    #     "action_type" and "parameters".
    #
    # NOTE(review): the signatures and docstrings of the @agent_action methods
    # look like they are surfaced to the model as tool descriptions -- confirm
    # before rewording them. They are kept verbatim here; maintainer notes are
    # written as ``#`` comments instead.
    #
    # Most generated snippets do not import ``pyautogui`` themselves (only
    # ``scroll`` and ``hotkey`` do); presumably the executor pre-imports it --
    # TODO confirm.

    # Tool module name -> name of the class inside that module whose
    # ``print_result()`` is appended to tool commands (see ``tool_commands``).
    tool_list = {
        "libreoffice_calc": "CalcTools",
        "libreoffice_impress": "ImpressTools",
        "libreoffice_writer": "WriterTools",
        "code": "CodeTools",
        "vlc": "VLCTools",
        "google_chrome": "BrowserTools",
    }

    @classmethod
    def tool_commands(cls, code: str, tool_name: str):
        # Wraps ``code`` so that it runs with everything from the tool module
        # ``tool_name`` star-imported, followed by a call to the matching tool
        # class' ``print_result()``.
        #
        # Returns a single-element list holding the command string.
        # Raises KeyError when ``tool_name`` is not a key of ``tool_list``.
        command = f"from {tool_name} import *; "
        command += code

        tool_class = cls.tool_list[tool_name]
        command += f"; {tool_class}.print_result()"

        return [
            command,
        ]

    @classmethod
    @agent_action
    def click(
        cls,
        coordinates: List,
        num_clicks: int = 1,
        button_type: str = "left",
    ):
        """
        Click on the element.

        Args:
            coordinates (List): [x, y], Coordinates of the element to click on
            num_clicks (int): number of times to click the element
            button_type (str): which mouse button to press can be "left", "middle", or "right"
        """
        # Returns a pyautogui.click(...) snippet; ``button_type`` is embedded
        # through repr() so it arrives as a proper string literal.
        command = ""
        x, y = coordinates
        # TODO: maximizing a window needs a single call
        command += f"""pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); print("Click Success")"""
        return command

    @classmethod
    @agent_action
    def type(
        cls,
        coordinates: Optional[List] = None,
        text: str = "",
        overwrite: bool = False,
        enter: bool = False,
    ):
        """
        Type text into the element.

        Args:
            coordinates (List): [x, y] Coordinates of the element to type into. If not provided, typing will start at the current cursor location.
            text (str): the text to type
            overwrite (bool): Assign it to True if the text should overwrite the existing text, otherwise assign it to False. Using this argument clears all text in an element.
            enter (bool): Assign it to True if the enter key should be pressed after typing the text, otherwise assign it to False.
        """
        # The snippet is assembled step by step: optional click to focus,
        # optional select-all + backspace, the write itself, optional Enter,
        # and finally the success report.
        command = ""

        if coordinates is not None:
            # Click the target first so the typed text lands in it.
            x, y = coordinates
            command += f"pyautogui.click({x}, {y}); "

        if overwrite:
            command += "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace'); "

        # repr() turns the text into a valid Python string literal.
        command += f"pyautogui.write({repr(text)}); "

        if enter:
            command += "pyautogui.press('enter'); "

        command += "print('Type Success')"

        return command

    @classmethod
    @agent_action
    def drag_and_drop(cls, drag_from_coordinates: List, drop_on_coordinates: List):
        """
        Drag element1 and drop it on element2.

        Args:
            drag_from_coordinates (List): [x, y] Coordinates of element to drag
            drop_on_coordinates (List): [x, y] Coordinates of element to drop on
        """
        # Returns a snippet that moves to the start point, drags to the end
        # point over one second, releases the mouse button and reports success.
        x1, y1 = drag_from_coordinates
        x2, y2 = drop_on_coordinates

        command = f"pyautogui.moveTo({x1}, {y1}); "
        # TODO: specified duration?
        command += f"pyautogui.dragTo({x2}, {y2}, duration=1.); pyautogui.mouseUp(); "

        command += "print('Drag and Drop Success')"

        return command

    @classmethod
    @agent_action
    def scroll(cls, coordinates: List, direction: str):
        """
        Scroll the element in the specified direction.

        Args:
            coordinates (List): [x, y] Coordinates of the element to scroll in
            direction (str): the direction to scroll can be "up" or "down".
        """
        x, y = coordinates
        # Any direction other than exactly "up" is treated as "down".
        amount = 100 if direction == "up" else -100
        return f"import pyautogui; pyautogui.moveTo({x}, {y}); pyautogui.scroll({amount}); print('Scroll Success')"

    @classmethod
    @agent_action
    def open_app(cls, app_name: str):
        """
        Open a specified application.

        App List:
        - chrome
        - files
        - terminal
        - gedit
        - libreoffice writer
        - libreoffice calc
        - libreoffice impress
        - vs code
        - vlc
        - gimp
        - settings
        - thunderbird

        Args:
            app_name (str): Name of the application to open
        """
        # Returns an OPEN_APP action dict for a known application, otherwise a
        # snippet that prints a "not supported" message.
        app_name = app_name.lower().strip()

        if app_name not in launch_app_commands:
            # Embed the message through repr() so that quotes, braces or
            # backslashes in the requested name cannot break (or inject into)
            # the generated snippet.
            message = f"{app_name} is not supported or recognized"
            command = f"print({message!r})"
        else:
            command = {
                "action_type": "OPEN_APP",
                "parameters": {"launch_app_command": launch_app_commands[app_name], "app_name": app_name},
            }

        return command

    @classmethod
    @agent_action
    def switch_window(cls, window_id: str):
        """
        Switch to the window with the given window id.

        Args:
            window_id (str): the window id to switch to from the provided list of open windows
        """
        # str() guards against ids that were parsed as numbers; str.replace
        # would raise TypeError on a non-string replacement.
        return switch_window_code.replace("WINDOW_ID", str(window_id))

    @classmethod
    @agent_action
    def hotkey(cls, keys: List):
        """
        Press a hotkey combination.

        Args:
            keys (List): the keys to press in combination in a list format (e.g. ['ctrl', 'c'] for copy, ['prtsc'] for screenshot)
        """
        # Turn every key into a Python string literal; repr() handles keys
        # that themselves contain quotes or backslashes.
        key_args = ", ".join(repr(str(key)) for key in keys)
        # The report is embedded as a plain (non-f) string literal so braces
        # in a key are not treated as f-string fields when the snippet runs.
        message = f"Press Hotkey: {key_args}"
        return f"import pyautogui; pyautogui.hotkey({key_args}); print({message!r})"

    @classmethod
    @agent_action
    def quote(cls, content: str):
        """
        Quoting information from the current page for memory. Only you can see the quoted content.

        Args:
            content (str): text summarized or copied from the page for later operation.
        """
        # repr() yields a valid literal for any text, including content that
        # ends with a double quote, contains triple quotes, or has backslashes
        # -- all of which would corrupt a hand-built triple-quoted literal.
        return f"print({str(content)!r})"

    @classmethod
    @agent_action
    def wait(cls):
        """
        Wait for a while.

        """
        # Sentinel string, not executable code.
        return "WAIT"

    @classmethod
    @agent_action
    def exit(cls, success: bool):
        """
        End the current task.

        Args:
            success (bool): True if successfully finish a task, otherwise set it False
        """
        # Sentinel strings, not executable code.
        if success:
            return "DONE"
        else:
            return "FAIL"
|
||||
Reference in New Issue
Block a user