# File viewer header: 575 lines, 27 KiB, Python, executable file
import re
|
|
from typing import Any, Dict, List, Optional, Tuple
|
|
|
|
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
|
from mm_agents.os_symphony.core.mllm import LMMAgent
|
|
from mm_agents.os_symphony.utils.common_utils import call_llm_safe
|
|
from mm_agents.os_symphony.agents.coder_agent import CoderAgent
|
|
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
|
|
from mm_agents.os_symphony.agents.searcher_agent import SearcherAgent
|
|
import logging
|
|
from mm_agents.os_symphony.agents.ocr import OCRProcessor
|
|
|
|
|
|
# Module-level logger, looked up under the fixed name "desktopenv.agent".
logger = logging.getLogger("desktopenv.agent")
|
|
|
|
# Agent action decorator
|
|
def agent_action(func):
    """Flag ``func`` as an agent action and return the very same callable.

    The function is not wrapped: the only effect is that the attribute
    ``is_agent_action`` is set to ``True`` on the function object.
    """
    setattr(func, "is_agent_action", True)
    return func
|
|
|
|
# GrounderAgent primitives are parameterized by description, and coordinate generation uses a pretrained grounding model
|
|
class OSACI:
|
|
def __init__(
    self,
    env,
    search_env,
    platform: str,
    client_password: str,
    engine_params_for_ocr: Dict,
    engine_params_for_grounder: Dict,
    engine_params_for_coder: Dict,
    engine_params_for_searcher: Dict,
    screen_width: int = 1920,
    screen_height: int = 1080
):
    """Wire up the environment handle and the helper agents used by the actions.

    Args:
        env: environment handle; ``call_code_agent`` passes ``env.controller``
            to the coder agent.
        search_env: forwarded unchanged to ``SearcherAgent.create``.
        platform: platform name; the actions compare it against "linux",
            "macos" and "windows".
        client_password: password forwarded to the coder and searcher agents
            and kept on the instance.
        engine_params_for_ocr: engine parameters for the text-span LMM agent.
        engine_params_for_grounder: engine parameters for the grounder agent.
        engine_params_for_coder: engine parameters for the coder agent.
        engine_params_for_searcher: engine parameters for the searcher agent.
        screen_width: forwarded to the grounder agent.
        screen_height: forwarded to the grounder agent.
    """
    # Plain state first; none of it depends on the helper agents.
    self.env = env
    self.platform = platform
    self.client_password = client_password
    self.result_dir = ""

    # Results of the most recent helper-agent calls plus free-form bookkeeping.
    self.current_task_instruction = None
    self.last_code_agent_result = None
    self.last_search_agent_result = None
    self.notes: List[str] = []
    # Open question kept from the original author: tutorials are meant to be
    # global information rather than local context; how to surface them
    # globally is still undecided.
    self.tutorials = []

    # Visual grounding: element description -> screen coordinates.
    self.grounder_agent = GrounderAgent(
        engine_params=engine_params_for_grounder,
        screen_width=screen_width,
        screen_height=screen_height,
    )

    # Text grounding: OCR table plus an LMM that picks a word id from it.
    self.ocr_processor = OCRProcessor()
    self.text_span_agent = LMMAgent(
        engine_params=engine_params_for_ocr,
        system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,
    )

    # Code agent.
    self.coder_agent = CoderAgent(
        engine_params=engine_params_for_coder,
        platform=platform,
        client_password=client_password,
    )

    # Search agent; it shares the grounder built above.
    self.searcher_agent = SearcherAgent.create(
        engine_params=engine_params_for_searcher,
        search_env=search_env,
        grounder_agent=self.grounder_agent,
        platform=platform,
        client_password=client_password,
    )
|
|
|
|
|
|
def assign_screenshot(self, obs):
    """Remember ``obs`` as the current observation.

    The value is stored on ``self.obs``, which the action methods of this
    class pass to the grounding helpers.
    """
    self.obs = obs
|
|
|
|
# Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
|
|
def generate_text_coords(
    self, phrase: str, obs: Dict, alignment: str = ""
) -> List[int]:
    """Return screen coordinates for one word of ``phrase`` found via OCR.

    The screenshot in ``obs["screenshot"]`` is run through the OCR processor
    (backend name "easyocr"); the resulting word table and the screenshot are
    sent to the text-span agent, and the LAST integer in its reply is taken
    as the index of the target word (index 0 when the reply has no digits).

    Args:
        phrase: the phrase to look for.
        obs: observation dict; only the "screenshot" entry is read.
        alignment: "start" returns the left edge of the word, "end" returns
            a point slightly past its right edge, anything else (including
            the default "") returns the horizontal centre of the word.

    Returns:
        ``[x, y]`` as ints; ``y`` is always the vertical middle of the word.

    Raises:
        IndexError: OCR returned no words, or the chosen word id is outside
            the OCR table.
    """
    screenshot = obs["screenshot"]
    # No cropping happens here, so the offsets are zero; kept so the
    # arithmetic below stays valid if a crop is introduced.
    global_offset_x, global_offset_y = 0, 0

    ocr_table, ocr_elements = self.ocr_processor.get_ocr_elements(screenshot, "easyocr")
    if len(ocr_elements) == 0:
        # Same exception type the bare index used to raise, with a usable message.
        raise IndexError(f"OCR found no words on screen; cannot locate phrase {phrase!r}")

    alignment_prompt = ""
    if alignment == "start":
        alignment_prompt = "**Important**: Output the word id of the FIRST word in the provided phrase.\n"
    elif alignment == "end":
        alignment_prompt = "**Important**: Output the word id of the LAST word in the provided phrase.\n"

    # Load LLM prompt
    self.text_span_agent.reset()
    self.text_span_agent.add_message(
        alignment_prompt + "Phrase: " + phrase + "\n" + ocr_table, role="user"
    )
    self.text_span_agent.add_message(
        "Screenshot:\n", image_content=screenshot, role="user"
    )

    # Obtain the target element
    response = call_llm_safe(self.text_span_agent)
    print("TEXT SPAN AGENT RESPONSE:", response)
    # NOTE(review): guard in case the helper can hand back None on failure — confirm.
    numericals = re.findall(r"\d+", response or "")
    text_id = int(numericals[-1]) if numericals else 0
    if text_id >= len(ocr_elements):
        raise IndexError(
            f"Text span agent chose word id {text_id}, but OCR only produced "
            f"{len(ocr_elements)} words"
        )
    elem = ocr_elements[text_id]

    # Compute the element coordinates
    middle_y = elem["top"] + (elem["height"] // 2)
    if alignment == "start":
        coords = [elem["left"], middle_y]
    elif alignment == "end":
        # 0.15 * height nudges the point just past the last character so the
        # caret lands after it rather than on it.
        coords = [elem["left"] + elem["width"] + 0.15 * elem["height"], middle_y]
    else:
        # Previously `coords` was never assigned on this path (UnboundLocalError
        # for the default alignment); fall back to the centre of the word.
        coords = [elem["left"] + (elem["width"] // 2), middle_y]

    print(f'[OCR] output coordinates: {[coords[0] + global_offset_x, coords[1] + global_offset_y]}')
    return [int(coords[0] + global_offset_x), int(coords[1] + global_offset_y)]
|
|
|
|
def set_task_instruction(self, task_instruction: str):
    """Record ``task_instruction`` on ``self.current_task_instruction``."""
    self.current_task_instruction = task_instruction
|
|
|
|
@agent_action
|
|
def click(
    self,
    element_description: str,
    num_clicks: int = 1,
    button_type: str = "left",
    hold_keys: List = []
):
    """Click on the element
    Args:
        element_description:str, a detailed descriptions of which element to click on. This description needs to be VERY unambiguous. If the page contains many similar elements, ensure the description uniquely identifies the target element.
        num_clicks:int, number of times to click the element
        button_type:str, which mouse button to press can be "left", "middle", or "right"
        hold_keys:List, list of keys to hold while clicking
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording and the signature are left exactly as they were.
    # `hold_keys` is only iterated, never mutated, so the shared default list
    # is harmless here.
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)

    # Generated pyautogui code: press modifiers, click, release modifiers.
    press_modifiers = "".join(f"pyautogui.keyDown({key!r}); " for key in hold_keys)
    do_click = (
        f"import pyautogui; pyautogui.click({x}, {y}, "
        f"clicks={num_clicks}, button={button_type!r}); "
    )
    release_modifiers = "".join(f"pyautogui.keyUp({key!r}); " for key in hold_keys)
    command = "import pyautogui; " + press_modifiers + do_click + release_modifiers

    action = {
        "function": "click",
        "args": {"x": x, "y": y, "button": button_type, "clicks": num_clicks},
    }
    return (command, action)
|
|
|
|
@agent_action
|
|
def open(self, app_or_filename: str):
    """Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.
    Args:
        app_or_filename:str, the name of the application or filename to open

    **Important**:
    Provide only the name of the application or file. Do not include the full path (e.g., "/home/user/Desktop/my_report.docx"). The function works by searching for the name, not by accessing a file path directly.
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    action = {"function": "open", "args": {"name": app_or_filename}}
    name_literal = repr(app_or_filename)

    if self.platform == "linux":
        # `import time` was missing from this branch although the snippet
        # calls time.sleep; the other two branches already import it.
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(1.0); pyautogui.write({name_literal}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(1.0)", action)
    elif self.platform == "macos":
        return (f"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({name_literal}); pyautogui.press('enter'); time.sleep(1.0)", action)
    elif self.platform == "windows":
        return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({name_literal}); time.sleep(1.0); pyautogui.press('enter'); time.sleep(0.5)", action)

    # Raised explicitly (same exception type as the former `assert False`) so
    # the check survives `python -O`; the message now names the platform
    # strings this method actually compares against.
    raise AssertionError(
        f"Unsupported platform: {self.platform}. Supported platforms are: macos, linux, windows."
    )
|
|
|
|
def _paste(self, is_terminal):
    """Return the pyautogui statement that pastes the clipboard.

    macOS uses command+v; Linux uses ctrl+shift+v inside a terminal and
    ctrl+v elsewhere; Windows uses ctrl+v. Any other platform yields an
    empty string.
    """
    if self.platform == 'linux' and is_terminal:
        return "pyautogui.hotkey('ctrl', 'shift', 'v');"

    paste_by_platform = {
        'macos': "pyautogui.hotkey('command', 'v');",
        'linux': "pyautogui.hotkey('ctrl', 'v');",
        'windows': "pyautogui.hotkey('ctrl', 'v');",
    }
    return paste_by_platform.get(self.platform, "")
|
|
|
|
def _clear_all(self, is_terminal):
    """Return the pyautogui statements that wipe the current input.

    In a terminal: Esc on Windows, otherwise ctrl+e followed by ctrl+u.
    Outside a terminal: select-all (command+a on macOS, ctrl+a elsewhere)
    followed by backspace.
    """
    if is_terminal:
        if self.platform == 'windows':
            return "pyautogui.press('esc');"
        return "pyautogui.hotkey('ctrl', 'e'); pyautogui.hotkey('ctrl', 'u');"

    # Ordinary GUI field: only the select-all modifier differs per platform.
    select_all_modifier = 'command' if self.platform == 'macos' else 'ctrl'
    return f"pyautogui.hotkey('{select_all_modifier}', 'a'); pyautogui.press('backspace');"
|
|
|
|
def _type(
    self,
    text: str,
    is_terminal: bool
):
    """Return the pyautogui/pyperclip statements that enter ``text``.

    Text containing any non-ASCII character is entered through the clipboard
    (copy, paste, restore the previous clipboard) on every platform except
    macOS; everything else is typed key by key with ``pyautogui.write``.
    """
    contains_non_ascii = not text.isascii()
    if not (contains_non_ascii and self.platform != "macos"):
        return f"pyautogui.write({text!r}, interval=0.1);"

    clipboard_steps = [
        "original_clipboard = pyperclip.paste();",
        f"pyperclip.copy({text!r});",
        "time.sleep(0.1);",
        self._paste(is_terminal=is_terminal),
        "pyperclip.copy(original_clipboard);",
    ]
    return "".join(clipboard_steps)
|
|
|
|
@agent_action
|
|
def type(
    self,
    element_description: str,
    text: str = "",
    overwrite: bool = False,
    enter: bool = False,
    is_terminal = False
):
    """Type text/unicode into a specific element
    Args:
        element_description: str, a detailed description of which element to enter text in. If provided, the agent will click on this element before typing.
        text:str, the text to type
        overwrite:bool, Default is False, assign it to True if the text should overwrite the whole existing text. Using this argument clears all text in an element.
        enter:bool, Assign it to True if the enter key should be pressed after typing all the text, otherwise assign it to False.
        is_terminal:bool, (MANDATORY) You MUST set this to True whenever the target you will type into is a terminal.
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording and the signature are left exactly as they were.
    commands = (
        "import os;"
        "import pyautogui;"
        "import pyperclip;"
        "import subprocess;"
        "import time;"
    )

    if self.platform == "linux":
        # Install the clipboard back-ends before typing. The sudo password is
        # fed to `sudo -S` on stdin through `input=` as a Python literal
        # instead of being spliced into `echo "..." |` inside a nested
        # f-string: a password containing quotes, braces, `$` or backticks
        # used to yield invalid Python or an altered shell command.
        sudo_stdin = repr(f"{self.client_password}\n")
        commands += (
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            "subprocess.run(f'sudo -S {proxy_prefix}apt-get install -y xclip xsel', shell=True, check=True, "
            f"input={sudo_stdin}, text=True);"
        )

    x, y = None, None
    if element_description is not None:
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        # Double-click, pause, then single-click the target before typing.
        commands += (
            f"pyautogui.click({x}, {y}, clicks=2);"
            f"time.sleep(1.0);"
            f"pyautogui.click({x}, {y});"
        )

    if overwrite:
        commands += self._clear_all(is_terminal=is_terminal)

    commands += self._type(text=text, is_terminal=is_terminal)

    if enter:
        commands += "pyautogui.press('enter');"

    if element_description is not None:
        action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
    else:
        action = {"function": "type", "args": {"text": text}}
    return (commands, action)
|
|
|
|
@agent_action
|
|
def drag_and_drop(
    self, starting_description: str, ending_description: str, hold_keys: List = []
):
    """Drag from the starting description to the ending description
    Args:
        starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
        ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
        hold_keys:List list of keys to hold while dragging
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording and the signature are left exactly as they were.
    x1, y1 = self.grounder_agent.generate_coords(starting_description, self.obs)
    x2, y2 = self.grounder_agent.generate_coords(ending_description, self.obs)

    # Generated pyautogui code: move to the start, press modifiers, drag with
    # a fixed 3 second duration, release the mouse, release modifiers.
    # TODO: specified duration?
    pieces = ["import pyautogui; ", f"pyautogui.moveTo({x1}, {y1}); "]
    pieces.extend(f"pyautogui.keyDown({key!r}); " for key in hold_keys)
    pieces.append(
        f"pyautogui.dragTo({x2}, {y2}, duration=3., button='left'); pyautogui.mouseUp(); "
    )
    pieces.extend(f"pyautogui.keyUp({key!r}); " for key in hold_keys)
    command = "".join(pieces)

    action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
    return (command, action)
|
|
|
|
@agent_action
|
|
def highlight_text_span(
|
|
self,
|
|
starting_phrase: str,
|
|
ending_phrase: str,
|
|
button: str = "left",
|
|
text: Optional[str|None] = None
|
|
):
|
|
"""Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.
|
|
Args:
|
|
starting_phrase: str, the sequence of words that marks the beginning of the text span. Provide a unique sequence of 5 to 10 words.
|
|
ending_phrase: str, the sequence of words that marks the end of the text span. Provide a unique sequence of 5 to 10 words.
|
|
button:str, the button to use to highlight the text span. Defaults to "left". Can be "left", "right", or "middle".
|
|
text: str | None, The text to overwrite the highlighted span with. Providing text here ensures the replacement happens immediately after selection, preventing focus loss.
|
|
"""
|
|
x1, y1 = self.generate_text_coords(
|
|
starting_phrase, self.obs, alignment="start"
|
|
)
|
|
x2, y2 = self.generate_text_coords(
|
|
ending_phrase, self.obs, alignment="end"
|
|
)
|
|
|
|
command = "import pyautogui; import time;"
|
|
command += f"pyautogui.moveTo({x1}, {y1}); "
|
|
# Click in advance to simulate selecting the text box.
|
|
command += (
|
|
f"pyautogui.click({x1}, {y1}, clicks=2);"
|
|
f"time.sleep(1.0); pyautogui.click({x1}, {y1}); time.sleep(1.0);"
|
|
)
|
|
command += f"pyautogui.dragTo({x2}, {y2}, duration=5., button='{button}'); time.sleep(0.5); pyautogui.mouseUp(); "
|
|
|
|
if text:
|
|
if self.platform == "linux":
|
|
command += "subprocess.run('echo \"password\" | sudo -S apt-get install -y xclip xsel', shell=True, check=True, env={\"http_proxy\": \"http://10.1.8.5:23128\", \"https_proxy\": \"http://10.1.8.5:23128\"});"
|
|
|
|
command += (
|
|
"original_clipboard = pyperclip.paste();"
|
|
f"pyperclip.copy({repr(text)});"
|
|
)
|
|
command += self._paste(is_terminal=False)
|
|
command += "pyperclip.copy(original_clipboard);"
|
|
|
|
# Return pyautoguicode to drag and drop the elements
|
|
action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
|
|
return (command, action)
|
|
|
|
@agent_action
|
|
def locate_cursor(
|
|
self,
|
|
phrase: str,
|
|
start_or_end: str="start",
|
|
text: Optional[str|None] = None
|
|
):
|
|
"""Click at the beginning or end of a specific text phrase to precisely control cursor positioning. Please prefer using the "click" action in general situations, and use this action only in text-intensive software such as libreoffice_writer, impress, etc.
|
|
|
|
Args:
|
|
phrase: str, The text phrase where you want to position the cursor. Provide a unique sequence of 5 to 10 words. Do NOT use single words unless the total text is extremely short.
|
|
start_or_end: str, Whether to click at the "start" (beginning) or "end" (trailing edge) of the identified text phrase. Use "start" to position before the text, "end" to position after it.
|
|
text: str | None, The text to enter immediately after positioning the cursor. Use this parameter instead of a separate 'type' action to ensure precise input.
|
|
"""
|
|
x, y = self.generate_text_coords(
|
|
phrase, self.obs, alignment=start_or_end
|
|
)
|
|
command = (
|
|
"import pyautogui;"
|
|
"import time;"
|
|
"import subprocess;"
|
|
"import pyperclip;"
|
|
f"pyautogui.click({x}, {y}, button='left', clicks=2);"
|
|
"time.sleep(1.0);"
|
|
f"pyautogui.click({x}, {y}, button='left');"
|
|
)
|
|
if text:
|
|
if self.platform == "linux":
|
|
command += "subprocess.run('echo \"password\" | sudo -S apt-get install -y xclip xsel', shell=True, check=True, env={\"http_proxy\": \"http://10.1.8.5:23128\", \"https_proxy\": \"http://10.1.8.5:23128\"});"
|
|
|
|
command += self._type(text=text, is_terminal=False)
|
|
|
|
if text:
|
|
action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
|
|
else:
|
|
action = {"function": "click", "args": {"x": x, "y": y, "clicks": 1, "button": "left"}}
|
|
return (command, action)
|
|
|
|
|
|
@agent_action
|
|
def call_code_agent(self, task: str):
    """Calls the code agent to execute a well-defined, self-contained goal that can be completed with code.

    Args:
        task: str, A specific, self-contained goal that the code agent can work on until completion.

    **🚨 CRITICAL GUIDELINES:**

    **Decompose the Main Objective into Logical Goals:**
    - You **MUST** break down the overall mission into distinct, logical goals or stages.
    - Your role is to define *what* needs to be done for a specific stage. The code agent's role is to figure out *how* to do it with code.
    - Pass only one logical goal at a time. The `task` parameter is **REQUIRED**.

    **Define a Self-Contained, Continuous Goal:**
    - The `task` you provide should be a single, continuous goal. The code agent is capable of handling a multi-step process internally (e.g., opening a file, processing its data, and then saving it) to achieve this one goal.
    - **Crucially, do not pass a task that combines multiple distinct objectives.** For example, instead of passing "Analyze the sales data, AND email the result," you should first pass the self-contained goal: "Analyze the sales data." After that goal is complete, you can proceed with the next logical goal (e.g., emailing the result) in a subsequent step.
    - **If unsure, err on the side of caution.** If a task feels like it has two separate parts, break it down and pass only the first part.
    - Your instruction must describe the desired end-state, NOT the recipe to get there. Do not specify any solution!

    **Goal Purity is Essential:**
    - **NEVER** rephrase, paraphrase, or modify the subtask instruction you have decided on. Pass the exact, original wording of the subtask to prevent instruction drift and hallucination.

    Use this for tasks that can be fully accomplished through code execution, particularly for:
    - Spreadsheet applications: data processing, filtering, sorting, calculations, formulas, data analysis
    - Document editors: text processing, content editing, formatting, document manipulation
    - Code editors: code editing, file processing, text manipulation, configuration
    - Data analysis tools: statistical analysis, data transformation, reporting
    - File management: bulk operations, file processing, content extraction
    - System utilities: configuration, setup, automation
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    logger.info("=" * 50)
    logger.info("ACI: Calling Code Agent")
    logger.info("=" * 50)
    task_to_execute = task
    logger.info(f"Executing SUBTASK: {task_to_execute}")

    # The observation may be missing (it is only set by assign_screenshot)
    # or empty. The old code guarded the screenshot lookup but dereferenced
    # `self.obs.keys()` one line earlier, so that guard could never help.
    obs = getattr(self, "obs", None) or {}
    print("obs keys: ", obs.keys())
    screenshot = obs.get("screenshot", "")
    logger.info(f"Screenshot available: {'Yes' if screenshot else 'No'}")

    logger.info("Executing code agent...")

    result = self.coder_agent.execute(
        task_to_execute, screenshot, self.env.controller
    )

    # Store the result for the worker to access
    self.last_code_agent_result = result

    logger.info("Code agent execution completed")
    logger.info(f"Result - Completion reason: {result['completion_reason']}")
    logger.info(f"Steps executed: {result['steps_executed']}")
    logger.info(f"Summary: {result['summary']}")

    logger.info("=" * 50)
    logger.info("GROUNDING AGENT: Code Agent Call Finished")
    logger.info("=" * 50)

    action = {
        "function": "call_code_agent",
        "args": {"query": task, "result": result["completion_reason"] == "DONE"},
    }
    # The work already happened inside the code agent; the environment only
    # receives a short sleep.
    return ("import time; time.sleep(2.222)", action)
|
|
|
|
@agent_action
|
|
def scroll(self, element_description: str, clicks: int, shift: bool = False):
    """Scroll the element in the specified direction
    Args:
        element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
        clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
        shift:bool, whether to use shift+scroll for horizontal scrolling
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    x, y = self.grounder_agent.generate_coords(element_description, self.obs)
    action = {"function": "scroll", "args": {"x": x, "y": y, "clicks": clicks, "shift": shift}}

    # `shift` only selects which pyautogui call is generated: hscroll when
    # truthy, vscroll otherwise.
    wheel_call = "hscroll" if shift else "vscroll"
    command = (
        f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); "
        f"time.sleep(0.5); pyautogui.{wheel_call}({clicks})"
    )
    return (command, action)
|
|
|
|
@agent_action
|
|
def hotkey(self, keys: List):
    """Press a hotkey combination (can press a single key as well)
    Args:
        keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    key_names = [str(key) for key in keys]

    # The action record keeps its historical format: each key wrapped in
    # single quotes, separated by spaces.
    action = {
        "function": "key",
        "args": {"keys": " ".join(f"'{name}'" for name in key_names)},
    }

    # repr() yields a valid Python literal for every key. Hand-wrapping in
    # single quotes produced a syntax error for the apostrophe or backslash
    # key.
    key_literals = ", ".join(repr(name) for name in key_names)
    return (f"import pyautogui; pyautogui.hotkey({key_literals});", action)
|
|
|
|
@agent_action
|
|
def hold_and_press(self, hold_keys: List, press_keys: List):
    """Hold a list of keys and press a list of keys
    Args:
        hold_keys:List, list of keys to hold
        press_keys:List, list of keys to press in a sequence
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.

    # repr() yields a valid Python literal for every key. Hand-wrapping in
    # single quotes produced a syntax error for the apostrophe or backslash
    # key. The held keys were already emitted with repr().
    press_keys_str = "[" + ", ".join(repr(str(key)) for key in press_keys) + "]"

    command = "import pyautogui; "
    for k in hold_keys:
        command += f"pyautogui.keyDown({repr(k)}); "
    command += f"pyautogui.press({press_keys_str}); "
    for k in hold_keys:
        command += f"pyautogui.keyUp({repr(k)}); "

    # Action record format: "<held keys>;<pressed keys>", space separated.
    hold_keys_string = " ".join(hold_keys)
    press_keys_string = " ".join(press_keys)
    action = {"function": "key", "args": {"keys": hold_keys_string + ";" + press_keys_string}}
    return (command, action)
|
|
|
|
@agent_action
|
|
def wait(self, time: float):
    """Wait for a specified amount of time
    Args:
        time:float, the amount of time to wait in seconds
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was. The parameter name
    # shadows the `time` module only inside this method; the generated
    # snippet does its own import.
    sleep_snippet = f"import time; time.sleep({time});"
    action = {"function": "wait", "args": {}}
    return (sleep_snippet, action)
|
|
|
|
@agent_action
|
|
def done(
    self,
):
    """
    End the current task with a success. Use this when you believe the entire task has been fully completed. You must ensure all visual information aligns with the user's true intent.
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    # "DONE" is a marker string rather than executable code.
    marker = "DONE"
    action = {"function": "done", "args": {}}
    return (marker, action)
|
|
|
|
@agent_action
|
|
def fail(self):
    """End the current task with a failure. Use this when you believe the entire task is impossible to complete."""
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    # "FAIL" is a marker string rather than executable code.
    marker = "FAIL"
    action = {"function": "fail", "args": {}}
    return (marker, action)
|
|
|
|
@agent_action
|
|
def call_search_agent(
    self,
    query: str,
):
    """
    Calls a specialized 'Searcher Agent' to find a detailed, step-by-step tutorial on the internet for a specific GUI action.
    Args:
        query:str, the search phrase or question for the tutorial. The formulation of this query is critical for success and must follow the guidelines below.

    **Query Formulation Guidelines:**

    Your query must be a well-defined question targeting a **single, specific action** within a **specific application**. To get the best results, adhere to these rules:

    1. **Start with "How to":** Your query must begin with the phrase "How to" to frame it as a request for instructions.
    2. **Include the Application Name:** Always specify the name of the software you are working in (e.g., "GIMP", "Google Chrome", "Libreoffice Writer").
    3. **Focus on a Single Intent:** The query should represent one clear goal. Do not combine multiple steps or tasks into one query.
    4. **Be Specific, Not Abstract:** Ask a concrete question. Avoid repeating the user's high-level or abstract instructions.
    5. **Decompose Complex Tasks:** If the user's overall instruction involves multiple actions (e.g., "download a file and then email it"), and you are stuck on one part, search *only for that specific part*.

    **Examples:**

    * **User's Overall Instruction:** "Please help me download my latest bank statement and then send it to my accountant."
    * **Correct Query (if stuck on downloading):** "How to download a bank statement from the Bank of America website?"
    * **Correct Query (if stuck on attaching a file):** "How to attach a file to an email in Gmail?"
    * **Incorrect Query:** "Download my bank statement and email it to my accountant" *(This query is too broad, contains multiple sub-tasks, and does not start with "How to".)*
    """
    # NOTE(review): the docstring above reads as instructions to the planning
    # model, so its wording is left exactly as it was.
    banner = "=" * 50
    logger.info(banner)
    logger.info(f"ACI: Calling Search Agent(query={query})")
    logger.info(banner)

    # Hand the searcher the current result directory before it runs.
    self.searcher_agent.result_dir = self.result_dir
    result = self.searcher_agent.search(query=query, main_obs=self.obs)
    self.last_search_agent_result = result

    # Only a finished search contributes its answer to the tutorial list.
    succeeded = result["completion_reason"] == "DONE"
    if succeeded:
        self.tutorials.append(result["final_answer"])

    action = {
        "function": "call_search_agent",
        "args": {"query": query, "result": succeeded},
    }
    # The search already ran above; the environment only receives a short sleep.
    return ("import time; time.sleep(2.222)", action)
|
|
|