add_os_symphony (#399)
This commit is contained in:
575
mm_agents/os_symphony/agents/os_aci.py
Executable file
575
mm_agents/os_symphony/agents/os_aci.py
Executable file
@@ -0,0 +1,575 @@
|
||||
import re
|
||||
from typing import Any, Dict, List, Optional, Tuple
|
||||
|
||||
from mm_agents.os_symphony.memory.procedural_memory import PROCEDURAL_MEMORY
|
||||
from mm_agents.os_symphony.core.mllm import LMMAgent
|
||||
from mm_agents.os_symphony.utils.common_utils import call_llm_safe
|
||||
from mm_agents.os_symphony.agents.coder_agent import CoderAgent
|
||||
from mm_agents.os_symphony.agents.grounder_agent import GrounderAgent
|
||||
from mm_agents.os_symphony.agents.searcher_agent import SearcherAgent
|
||||
import logging
|
||||
from mm_agents.os_symphony.agents.ocr import OCRProcessor
|
||||
|
||||
|
||||
# Module-level logger. It uses the fixed logger name "desktopenv.agent"
# rather than this module's __name__.
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
# Agent action decorator
|
||||
def agent_action(func):
    """Mark ``func`` as an agent action and hand the same object back.

    The decorator does not wrap the callable; it only sets the
    ``is_agent_action`` attribute to ``True`` on it.
    """
    setattr(func, "is_agent_action", True)
    return func
|
||||
|
||||
# GrounderAgent primitives are parameterized by description, and coordinate generation uses a pretrained grounding model
|
||||
class OSACI:
    """Agent-computer interface that turns high-level actions into executable code.

    Every method decorated with ``@agent_action`` returns a ``(command, action)``
    tuple: ``command`` is a string of Python source built around pyautogui (or
    the sentinel ``"DONE"`` / ``"FAIL"``) and ``action`` is a dict of the form
    ``{"function": ..., "args": {...}}`` describing the same step.

    NOTE(review): the docstrings of the ``@agent_action`` methods look like they
    are surfaced to the LLM as action documentation -- their wording is kept
    unchanged here; confirm before rewording them.
    """

    def __init__(
        self,
        env,
        search_env,
        platform: str,
        client_password: str,
        engine_params_for_ocr: Dict,
        engine_params_for_grounder: Dict,
        engine_params_for_coder: Dict,
        engine_params_for_searcher: Dict,
        screen_width: int = 1920,
        screen_height: int = 1080
    ):
        """Create the grounder, OCR/text-span, coder and searcher sub-agents.

        Args:
            env: environment object; ``env.controller`` is handed to the code agent.
            search_env: environment forwarded to ``SearcherAgent.create``.
            platform: compared against "linux", "macos" and "windows".
            client_password: password piped to ``sudo -S`` in generated commands.
            engine_params_for_ocr: engine params for the text-span LMM agent.
            engine_params_for_grounder: engine params for the grounder agent.
            engine_params_for_coder: engine params for the code agent.
            engine_params_for_searcher: engine params for the search agent.
            screen_width: forwarded to the grounder agent.
            screen_height: forwarded to the grounder agent.
        """
        self.env = env
        self.platform = platform
        self.client_password = client_password

        self.result_dir = ""

        # Latest observation; replaced by assign_screenshot(). Initialised here so
        # that actions invoked before the first observation do not fail with an
        # AttributeError on a missing attribute.
        self.obs = None

        self.grounder_agent = GrounderAgent(engine_params=engine_params_for_grounder, screen_width=screen_width, screen_height=screen_height)

        # Configure text grounding agent
        self.ocr_processor = OCRProcessor()
        self.text_span_agent = LMMAgent(
            engine_params=engine_params_for_ocr,
            system_prompt=PROCEDURAL_MEMORY.PHRASE_TO_WORD_COORDS_PROMPT,
        )

        # Configure code agent
        self.coder_agent = CoderAgent(
            engine_params=engine_params_for_coder,
            platform=self.platform,
            client_password=client_password
        )

        # Configure search agent
        self.searcher_agent = SearcherAgent.create(
            engine_params=engine_params_for_searcher,
            search_env=search_env,
            grounder_agent=self.grounder_agent,
            platform=self.platform,
            client_password=self.client_password
        )

        # Store task instruction for code agent
        self.current_task_instruction = None
        self.last_code_agent_result = None
        self.last_search_agent_result = None
        self.notes: List[str] = []
        # Tutorial should be a global info, not a local context, so how to add it to the global info
        self.tutorials = []

    def assign_screenshot(self, obs):
        """Store ``obs`` as the observation used by subsequent actions."""
        self.obs = obs

    # Given the state and worker's text phrase, generate the coords of the first/last word in the phrase
    def generate_text_coords(
        self, phrase: str, obs: Dict, alignment: str = ""
    ) -> List[int]:
        """Return ``[x, y]`` for a phrase located through OCR plus the text-span agent.

        Args:
            phrase: text phrase to look for in the screenshot.
            obs: observation dict; ``obs["screenshot"]`` is read.
            alignment: "start" targets the left edge of the chosen word, "end"
                targets just past its right edge; any other value (including the
                default "") targets the centre of the word.

        Returns:
            Integer ``[x, y]`` coordinates.

        Raises:
            IndexError or KeyError: if the OCR result has no element 0 to fall
                back to.
        """
        screenshot, global_offset_x, global_offset_y = obs["screenshot"], 0, 0

        ocr_table, ocr_elements = self.ocr_processor.get_ocr_elements(screenshot, "easyocr")

        alignment_prompt = ""
        if alignment == "start":
            alignment_prompt = "**Important**: Output the word id of the FIRST word in the provided phrase.\n"
        elif alignment == "end":
            alignment_prompt = "**Important**: Output the word id of the LAST word in the provided phrase.\n"

        # Load LLM prompt
        self.text_span_agent.reset()
        self.text_span_agent.add_message(
            alignment_prompt + "Phrase: " + phrase + "\n" + ocr_table, role="user"
        )
        self.text_span_agent.add_message(
            "Screenshot:\n", image_content=screenshot, role="user"
        )

        # Obtain the target element
        response = call_llm_safe(self.text_span_agent)
        print("TEXT SPAN AGENT RESPONSE:", response)
        # The last number in the reply is taken as the word id; no number -> id 0.
        numericals = re.findall(r"\d+", response or "")
        text_id = int(numericals[-1]) if numericals else 0

        try:
            elem = ocr_elements[text_id]
        except (IndexError, KeyError):
            # The model answered with an id that is not in the OCR result; use the
            # same fallback as an unparseable reply instead of crashing the action.
            logger.warning(
                "Text span agent returned unknown word id %s; falling back to id 0",
                text_id,
            )
            elem = ocr_elements[0]

        # Compute the element coordinates
        mid_y = elem["top"] + (elem["height"] // 2)
        if alignment == "start":
            coords = [elem["left"], mid_y]
        elif alignment == "end":
            # Note: 0.15 * elem["height"] is added to select the last character more precisely.
            coords = [elem["left"] + elem["width"] + 0.15 * elem["height"], mid_y]
        else:
            # No (or unknown) alignment: aim at the centre of the word rather than
            # leaving `coords` unbound.
            coords = [elem["left"] + (elem["width"] // 2), mid_y]

        print(f'[OCR] output coordinates: {[coords[0] + global_offset_x, coords[1] + global_offset_y]}')
        return [int(coords[0] + global_offset_x), int(coords[1] + global_offset_y)]

    def set_task_instruction(self, task_instruction: str):
        """Set the current task instruction for the code agent."""
        self.current_task_instruction = task_instruction

    def _install_clipboard_tools_cmd(self) -> str:
        """Return the command snippet that installs xclip/xsel through sudo apt-get.

        The snippet pipes ``self.client_password`` to ``sudo -S`` and forwards the
        http(s) proxy variables found in the executing environment. The generated
        code relies on ``os`` and ``subprocess`` already being imported by the
        surrounding command.
        """
        return (
            "p_http = os.environ.get('http_proxy') or os.environ.get('HTTP_PROXY');"
            "p_https = os.environ.get('https_proxy') or os.environ.get('HTTPS_PROXY');"
            "proxy_prefix = (f'http_proxy={p_http} ' if p_http else '') + (f'https_proxy={p_https} ' if p_https else '');"
            f"subprocess.run(f'echo \"{self.client_password}\" | sudo -S {{proxy_prefix}}apt-get install -y xclip xsel', shell=True, check=True);"
        )

    @agent_action
    def click(
        self,
        element_description: str,
        num_clicks: int = 1,
        button_type: str = "left",
        hold_keys: List = []
    ):
        """Click on the element
        Args:
            element_description:str, a detailed descriptions of which element to click on. This description needs to be VERY unambiguous. If the page contains many similar elements, ensure the description uniquely identifies the target element.
            num_clicks:int, number of times to click the element
            button_type:str, which mouse button to press can be "left", "middle", or "right"
            hold_keys:List, list of keys to hold while clicking
        """
        # The default list is only iterated, never mutated, so the signature is
        # left as-is; `or []` additionally tolerates an explicit None.
        hold_keys = hold_keys or []
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)

        parts = ["import pyautogui; "]
        parts.extend(f"pyautogui.keyDown({repr(k)}); " for k in hold_keys)
        parts.append(f"""import pyautogui; pyautogui.click({x}, {y}, clicks={num_clicks}, button={repr(button_type)}); """)
        parts.extend(f"pyautogui.keyUp({repr(k)}); " for k in hold_keys)
        command = "".join(parts)

        # Return pyautoguicode to click on the element
        action = {"function": "click", "args": {"x": x, "y": y, "button": button_type, "clicks": num_clicks}}
        return (command, action)

    @agent_action
    def open(self, app_or_filename: str):
        """Open any application or file with name app_or_filename. Use this action to open applications or files on the desktop, do not open manually.
        Args:
            app_or_filename:str, the name of the application or filename to open

        **Important**:
        Provide only the name of the application or file. Do not include the full path (e.g., "/home/user/Desktop/my_report.docx"). The function works by searching for the name, not by accessing a file path directly.
        """
        action = {"function": "open", "args": {"name": app_or_filename}}
        if self.platform == "linux":
            # `import time` is included so the snippet is self-contained, like the
            # macOS and Windows variants below.
            return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(1.0); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.hotkey('enter'); time.sleep(1.0)", action)
        elif self.platform == "macos":
            return (f"import pyautogui; import time; pyautogui.hotkey('command', 'space', interval=0.5); pyautogui.typewrite({repr(app_or_filename)}); pyautogui.press('enter'); time.sleep(1.0)", action)
        elif self.platform == "windows":
            return (f"import pyautogui; import time; pyautogui.hotkey('win'); time.sleep(0.5); pyautogui.write({repr(app_or_filename)}); time.sleep(1.0); pyautogui.press('enter'); time.sleep(0.5)", action)
        # Raised explicitly (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(
            f"Unsupported platform: {self.platform}. Supported platforms are: macos, linux, windows."
        )

    def _paste(self, is_terminal):
        """Return the paste-hotkey snippet for the platform ("" if unknown)."""
        if self.platform == 'macos':
            return "pyautogui.hotkey('command', 'v');"

        elif self.platform == 'linux':
            if is_terminal:
                return "pyautogui.hotkey('ctrl', 'shift', 'v');"
            else:
                return "pyautogui.hotkey('ctrl', 'v');"

        elif self.platform == 'windows':
            return "pyautogui.hotkey('ctrl', 'v');"

        return ""

    def _clear_all(self, is_terminal):
        """
        Clean the content of current line
        """
        # common apps in GUI
        if not is_terminal:
            if self.platform == 'macos':
                # macOS GUI: Command + A -> Backspace
                return "pyautogui.hotkey('command', 'a'); pyautogui.press('backspace');"
            else:
                # Windows/Linux GUI: Ctrl + A -> Backspace
                return "pyautogui.hotkey('ctrl', 'a'); pyautogui.press('backspace');"

        # terminal
        else:
            if self.platform == 'windows':
                return "pyautogui.press('esc');"
            else:
                return "pyautogui.hotkey('ctrl', 'e'); pyautogui.hotkey('ctrl', 'u');"

    def _type(
        self,
        text: str,
        is_terminal: bool
    ):
        """
        Return the snippet that enters ``text``.

        Text containing non-ASCII characters is entered through the clipboard
        (copy, paste, restore) on every platform except macOS; everything else
        is typed with ``pyautogui.write``. The snippet expects ``pyautogui``,
        ``pyperclip`` and ``time`` to be imported by the surrounding command.
        """
        commands = ""
        has_unicode = any(ord(char) > 127 for char in text)
        if has_unicode and self.platform != "macos":
            commands += (
                "original_clipboard = pyperclip.paste();"
                f"pyperclip.copy({repr(text)});"
                "time.sleep(0.1);"
            )
            commands += self._paste(is_terminal=is_terminal)
            commands += "pyperclip.copy(original_clipboard);"
        else:
            commands += f"pyautogui.write({repr(text)}, interval=0.1);"

        return commands

    @agent_action
    def type(
        self,
        element_description: str,
        text: str = "",
        overwrite: bool = False,
        enter: bool = False,
        is_terminal: bool = False
    ):
        """Type text/unicode into a specific element
        Args:
            element_description: str, a detailed description of which element to enter text in. If provided, the agent will click on this element before typing.
            text:str, the text to type
            overwrite:bool, Default is False, assign it to True if the text should overwrite the whole existing text. Using this argument clears all text in an element.
            enter:bool, Assign it to True if the enter key should be pressed after typing all the text, otherwise assign it to False.
            is_terminal:bool, (MANDATORY) You MUST set this to True whenever the target you will type into is a terminal.
        """
        commands = (
            "import os;"
            "import pyautogui;"
            "import pyperclip;"
            "import subprocess;"
            "import time;"
        )

        if self.platform == "linux":
            commands += self._install_clipboard_tools_cmd()

        x, y = None, None
        if element_description is not None:
            x, y = self.grounder_agent.generate_coords(element_description, self.obs)
            # Double click, pause, then single click on the target before typing.
            commands += (
                f"pyautogui.click({x}, {y}, clicks=2);"
                f"time.sleep(1.0);"
                f"pyautogui.click({x}, {y});"
            )

        if overwrite:
            commands += self._clear_all(is_terminal=is_terminal)

        commands += self._type(text=text, is_terminal=is_terminal)

        if enter:
            commands += "pyautogui.press('enter');"

        if element_description is not None:
            action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
        else:
            action = {"function": "type", "args": {"text": text}}
        return (commands, action)

    @agent_action
    def drag_and_drop(
        self, starting_description: str, ending_description: str, hold_keys: List = []
    ):
        """Drag from the starting description to the ending description
        Args:
            starting_description:str, a very detailed description of where to start the drag action. This description should be at least a full sentence.
            ending_description:str, a very detailed description of where to end the drag action. This description should be at least a full sentence.
            hold_keys:List list of keys to hold while dragging
        """
        # The default list is only iterated, never mutated; `or []` additionally
        # tolerates an explicit None.
        hold_keys = hold_keys or []
        x1, y1 = self.grounder_agent.generate_coords(starting_description, self.obs)
        x2, y2 = self.grounder_agent.generate_coords(ending_description, self.obs)

        parts = ["import pyautogui; ", f"pyautogui.moveTo({x1}, {y1}); "]
        # TODO: specified duration?
        parts.extend(f"pyautogui.keyDown({repr(k)}); " for k in hold_keys)
        parts.append(f"pyautogui.dragTo({x2}, {y2}, duration=3., button='left'); pyautogui.mouseUp(); ")
        parts.extend(f"pyautogui.keyUp({repr(k)}); " for k in hold_keys)
        command = "".join(parts)

        # Return pyautoguicode to drag and drop the elements
        action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
        return (command, action)

    @agent_action
    def highlight_text_span(
        self,
        starting_phrase: str,
        ending_phrase: str,
        button: str = "left",
        text: Optional[str] = None
    ):
        """Highlight a text span between a provided starting phrase and ending phrase. Use this to highlight words, lines, and paragraphs.
        Args:
            starting_phrase: str, the sequence of words that marks the beginning of the text span. Provide a unique sequence of 5 to 10 words.
            ending_phrase: str, the sequence of words that marks the end of the text span. Provide a unique sequence of 5 to 10 words.
            button:str, the button to use to highlight the text span. Defaults to "left". Can be "left", "right", or "middle".
            text: str | None, The text to overwrite the highlighted span with. Providing text here ensures the replacement happens immediately after selection, preventing focus loss.
        """
        x1, y1 = self.generate_text_coords(
            starting_phrase, self.obs, alignment="start"
        )
        x2, y2 = self.generate_text_coords(
            ending_phrase, self.obs, alignment="end"
        )

        command = "import pyautogui; import time;"
        command += f"pyautogui.moveTo({x1}, {y1}); "
        # Click in advance to simulate selecting the text box.
        command += (
            f"pyautogui.click({x1}, {y1}, clicks=2);"
            f"time.sleep(1.0); pyautogui.click({x1}, {y1}); time.sleep(1.0);"
        )
        command += f"pyautogui.dragTo({x2}, {y2}, duration=5., button='{button}'); time.sleep(0.5); pyautogui.mouseUp(); "

        if text:
            # The replacement path uses pyperclip (and os/subprocess on Linux);
            # import them only here so the plain-highlight command is unchanged.
            command += "import os; import pyperclip; import subprocess;"
            if self.platform == "linux":
                # Same installer snippet as `type`: uses the configured client
                # password and the environment's proxy settings.
                command += self._install_clipboard_tools_cmd()

            command += (
                "original_clipboard = pyperclip.paste();"
                f"pyperclip.copy({repr(text)});"
            )
            command += self._paste(is_terminal=False)
            command += "pyperclip.copy(original_clipboard);"

        # Return pyautoguicode to drag and drop the elements
        action = {"function": "drag", "args": {"x1": x1, "y1": y1, "x2": x2, "y2": y2}}
        return (command, action)

    @agent_action
    def locate_cursor(
        self,
        phrase: str,
        start_or_end: str="start",
        text: Optional[str] = None
    ):
        """Click at the beginning or end of a specific text phrase to precisely control cursor positioning. Please prefer using the "click" action in general situations, and use this action only in text-intensive software such as libreoffice_writer, impress, etc.

        Args:
            phrase: str, The text phrase where you want to position the cursor. Provide a unique sequence of 5 to 10 words. Do NOT use single words unless the total text is extremely short.
            start_or_end: str, Whether to click at the "start" (beginning) or "end" (trailing edge) of the identified text phrase. Use "start" to position before the text, "end" to position after it.
            text: str | None, The text to enter immediately after positioning the cursor. Use this parameter instead of a separate 'type' action to ensure precise input.
        """
        x, y = self.generate_text_coords(
            phrase, self.obs, alignment=start_or_end
        )
        command = (
            "import pyautogui;"
            "import time;"
            "import subprocess;"
            "import pyperclip;"
            f"pyautogui.click({x}, {y}, button='left', clicks=2);"
            "time.sleep(1.0);"
            f"pyautogui.click({x}, {y}, button='left');"
        )
        if text:
            if self.platform == "linux":
                # The installer snippet reads proxy settings through `os`.
                command += "import os;"
                command += self._install_clipboard_tools_cmd()

            command += self._type(text=text, is_terminal=False)

        if text:
            action = {"function": "type", "args": {"x": x, "y": y, "text": text}}
        else:
            action = {"function": "click", "args": {"x": x, "y": y, "clicks": 1, "button": "left"}}
        return (command, action)

    @agent_action
    def call_code_agent(self, task: str):
        """Calls the code agent to execute a well-defined, self-contained goal that can be completed with code.

        Args:
            task: str, A specific, self-contained goal that the code agent can work on until completion.

        **🚨 CRITICAL GUIDELINES:**

        **Decompose the Main Objective into Logical Goals:**
        - You **MUST** break down the overall mission into distinct, logical goals or stages.
        - Your role is to define *what* needs to be done for a specific stage. The code agent's role is to figure out *how* to do it with code.
        - Pass only one logical goal at a time. The `task` parameter is **REQUIRED**.

        **Define a Self-Contained, Continuous Goal:**
        - The `task` you provide should be a single, continuous goal. The code agent is capable of handling a multi-step process internally (e.g., opening a file, processing its data, and then saving it) to achieve this one goal.
        - **Crucially, do not pass a task that combines multiple distinct objectives.** For example, instead of passing "Analyze the sales data, AND email the result," you should first pass the self-contained goal: "Analyze the sales data." After that goal is complete, you can proceed with the next logical goal (e.g., emailing the result) in a subsequent step.
        - **If unsure, err on the side of caution.** If a task feels like it has two separate parts, break it down and pass only the first part.
        - Your instruction must describe the desired end-state, NOT the recipe to get there. Do not specify any solution!

        **Goal Purity is Essential:**
        - **NEVER** rephrase, paraphrase, or modify the subtask instruction you have decided on. Pass the exact, original wording of the subtask to prevent instruction drift and hallucination.

        Use this for tasks that can be fully accomplished through code execution, particularly for:
        - Spreadsheet applications: data processing, filtering, sorting, calculations, formulas, data analysis
        - Document editors: text processing, content editing, formatting, document manipulation
        - Code editors: code editing, file processing, text manipulation, configuration
        - Data analysis tools: statistical analysis, data transformation, reporting
        - File management: bulk operations, file processing, content extraction
        - System utilities: configuration, setup, automation
        """
        logger.info("=" * 50)
        logger.info("ACI: Calling Code Agent")
        logger.info("=" * 50)
        task_to_execute = task
        logger.info("Executing SUBTASK: %s", task_to_execute)

        # Treat a missing observation as empty so the debug print below cannot
        # fail before the screenshot fallback is reached.
        obs = self.obs or {}
        print("obs keys: ", obs.keys())
        screenshot = obs.get("screenshot", "")
        logger.info("Screenshot available: %s", 'Yes' if screenshot else 'No')

        logger.info("Executing code agent...")

        result = self.coder_agent.execute(
            task_to_execute, screenshot, self.env.controller
        )

        # Store the result for the worker to access
        self.last_code_agent_result = result

        logger.info("Code agent execution completed")
        logger.info("Result - Completion reason: %s", result['completion_reason'])
        logger.info("Steps executed: %s", result['steps_executed'])
        logger.info("Summary: %s", result['summary'])

        logger.info("=" * 50)
        logger.info("GROUNDING AGENT: Code Agent Call Finished")
        logger.info("=" * 50)

        action = {"function": "call_code_agent", "args": {"query": task, "result": result["completion_reason"] == "DONE"}}
        # Return code to be executed in the environment
        return ("import time; time.sleep(2.222)", action)

    @agent_action
    def scroll(self, element_description: str, clicks: int, shift: bool = False):
        """Scroll the element in the specified direction
        Args:
            element_description:str, a very detailed description of which element to enter scroll in. This description should be at least a full sentence.
            clicks:int, the number of clicks to scroll can be positive (up) or negative (down).
            shift:bool, whether to use shift+scroll for horizontal scrolling
        """
        x, y = self.grounder_agent.generate_coords(element_description, self.obs)
        action = {"function": "scroll", "args": {"x": x, "y": y, "clicks": clicks, "shift": shift}}
        # `shift` selects pyautogui.hscroll; otherwise pyautogui.vscroll is used.
        if shift:
            return (f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.hscroll({clicks})", action)
        else:
            return (f"import pyautogui; import time; pyautogui.moveTo({x}, {y}); time.sleep(0.5); pyautogui.vscroll({clicks})", action)

    @agent_action
    def hotkey(self, keys: List):
        """Press a hotkey combination (can press a single key as well)
        Args:
            keys:List the keys to press in combination in a list format (e.g. ['ctrl', 'c'], ['enter'])
        """
        # Quote each key with repr() so keys such as "'" or "\\" still produce
        # valid Python; ordinary key names render exactly as before ('ctrl').
        keys = [repr(str(key)) for key in keys]
        keys_string = " ".join(keys)
        action = {"function": "key", "args": {"keys": keys_string}}
        return (f"import pyautogui; pyautogui.hotkey({', '.join(keys)});", action)

    @agent_action
    def hold_and_press(self, hold_keys: List, press_keys: List):
        """Hold a list of keys and press a list of keys
        Args:
            hold_keys:List, list of keys to hold
            press_keys:List, list of keys to press in a sequence
        """
        # repr() keeps the generated list literal valid for keys containing quotes.
        press_keys_str = "[" + ", ".join(repr(str(key)) for key in press_keys) + "]"
        parts = ["import pyautogui; "]
        parts.extend(f"pyautogui.keyDown({repr(k)}); " for k in hold_keys)
        parts.append(f"pyautogui.press({press_keys_str}); ")
        parts.extend(f"pyautogui.keyUp({repr(k)}); " for k in hold_keys)
        command = "".join(parts)

        hold_keys_string = " ".join(hold_keys)
        press_keys_string = " ".join(press_keys)
        action = {"function": "key", "args": {"keys": hold_keys_string + ";" + press_keys_string}}
        return (command, action)

    @agent_action
    def wait(self, time: float):
        """Wait for a specified amount of time
        Args:
            time:float, the amount of time to wait in seconds
        """
        # `time` here is the parameter (seconds); the module of the same name is
        # only used inside the generated command string.
        return (f"""import time; time.sleep({time});""", {"function": "wait", "args": {}})

    @agent_action
    def done(
        self,
    ):
        """
        End the current task with a success. Use this when you believe the entire task has been fully completed. You must ensure all visual information aligns with the user's true intent.
        """
        return ("""DONE""", {"function": "done", "args": {}})

    @agent_action
    def fail(self):
        """End the current task with a failure. Use this when you believe the entire task is impossible to complete."""
        return ("""FAIL""", {"function": "fail", "args": {}})

    @agent_action
    def call_search_agent(
        self,
        query: str,
    ):
        """
        Calls a specialized 'Searcher Agent' to find a detailed, step-by-step tutorial on the internet for a specific GUI action.
        Args:
            query:str, the search phrase or question for the tutorial. The formulation of this query is critical for success and must follow the guidelines below.

        **Query Formulation Guidelines:**

        Your query must be a well-defined question targeting a **single, specific action** within a **specific application**. To get the best results, adhere to these rules:

        1. **Start with "How to":** Your query must begin with the phrase "How to" to frame it as a request for instructions.
        2. **Include the Application Name:** Always specify the name of the software you are working in (e.g., "GIMP", "Google Chrome", "Libreoffice Writer").
        3. **Focus on a Single Intent:** The query should represent one clear goal. Do not combine multiple steps or tasks into one query.
        4. **Be Specific, Not Abstract:** Ask a concrete question. Avoid repeating the user's high-level or abstract instructions.
        5. **Decompose Complex Tasks:** If the user's overall instruction involves multiple actions (e.g., "download a file and then email it"), and you are stuck on one part, search *only for that specific part*.

        **Examples:**

        * **User's Overall Instruction:** "Please help me download my latest bank statement and then send it to my accountant."
        * **Correct Query (if stuck on downloading):** "How to download a bank statement from the Bank of America website?"
        * **Correct Query (if stuck on attaching a file):** "How to attach a file to an email in Gmail?"
        * **Incorrect Query:** "Download my bank statement and email it to my accountant" *(This query is too broad, contains multiple sub-tasks, and does not start with "How to".)*
        """
        logger.info("=" * 50)
        logger.info("ACI: Calling Search Agent(query=%s)", query)
        logger.info("=" * 50)
        self.searcher_agent.result_dir = self.result_dir
        result = self.searcher_agent.search(query=query, main_obs=self.obs)
        self.last_search_agent_result = result
        succeeded = result["completion_reason"] == "DONE"
        if succeeded:
            # Only successful searches contribute a tutorial.
            self.tutorials.append(result["final_answer"])
        action = {"function": "call_search_agent", "args": {"query": query, "result": succeeded}}
        return ("import time; time.sleep(2.222)", action)
|
||||
|
||||
Reference in New Issue
Block a user