# sci-gui-agent-benchmark/mm_agents/evocua/evocua_agent.py
import os
import re
import json
import logging
import backoff
import openai
from typing import Dict, List, Tuple, Optional
from io import BytesIO
from PIL import Image
from mm_agents.evocua.utils import (
process_image,
encode_image,
rewrite_pyautogui_text_inputs,
project_coordinate_to_absolute_scale,
log_messages
)
from mm_agents.evocua.prompts import (
S1_SYSTEM_PROMPT,
S1_INSTRUTION_TEMPLATE,
S1_STEP_TEMPLATE,
S1_ACTION_HISTORY_TEMPLATE,
S2_ACTION_DESCRIPTION,
S2_DESCRIPTION_PROMPT_TEMPLATE,
S2_SYSTEM_PROMPT,
build_s2_tools_def
)
logger = logging.getLogger("desktopenv.evocua")
class EvoCUAAgent:
"""
    EvoCUA: a native GUI agent model for desktop automation.
"""
def __init__(
self,
model: str = "EvoCUA-S2",
max_tokens: int = 32768,
top_p: float = 0.9,
temperature: float = 0.0,
action_space: str = "pyautogui",
observation_type: str = "screenshot",
max_steps: int = 50,
prompt_style: str = "S2", # "S1" or "S2"
max_history_turns: int = 4,
screen_size: Tuple[int, int] = (1920, 1080),
coordinate_type: str = "relative",
password: str = "osworld-public-evaluation",
resize_factor: int = 32,
**kwargs
):
self.model = model
self.max_tokens = max_tokens
self.top_p = top_p
self.temperature = temperature
self.action_space = action_space
self.observation_type = observation_type
self.max_steps = max_steps
self.prompt_style = prompt_style
assert self.prompt_style in ["S1", "S2"], f"Invalid prompt_style: {self.prompt_style}"
self.max_history_turns = max_history_turns
self.screen_size = screen_size
self.coordinate_type = coordinate_type
self.password = password
self.resize_factor = resize_factor
# Action space assertion
assert self.action_space == "pyautogui", f"Invalid action space: {self.action_space}"
assert self.observation_type == "screenshot", f"Invalid observation type: {self.observation_type}"
# State
self.thoughts = []
self.actions = []
self.observations = []
self.responses = []
self.screenshots = [] # Stores encoded string
self.cots = [] # For S1 style history
def reset(self, _logger=None, vm_ip=None):
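        """Clear per-episode state and optionally rebind the module-level logger.

        ``vm_ip`` is accepted for interface compatibility and is not used here.
        """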
global logger
if _logger:
logger = _logger
self.thoughts = []
self.actions = []
self.observations = []
self.responses = []
self.screenshots = []
self.cots = []
    def predict(self, instruction: str, obs: Dict) -> Tuple[Optional[str], List[str]]:
        """
        Main prediction entry point: dispatches to the S1 or S2 pipeline based on
        ``prompt_style`` and returns the raw LLM response plus the pyautogui actions.
        """
logger.info(f"========================== {self.model} ===================================")
logger.info(f"Instruction: \n{instruction}")
screenshot_bytes = obs["screenshot"]
try:
original_img = Image.open(BytesIO(screenshot_bytes))
original_width, original_height = original_img.size
except Exception as e:
logger.warning(f"Failed to read screenshot size, falling back to screen_size: {e}")
original_width, original_height = self.screen_size
if self.prompt_style == "S1":
raw_b64 = encode_image(screenshot_bytes)
self.screenshots.append(raw_b64)
return self._predict_s1(instruction, obs, raw_b64)
else:
processed_b64, p_width, p_height = process_image(screenshot_bytes, factor=self.resize_factor)
self.screenshots.append(processed_b64)
return self._predict_s2(
instruction,
obs,
processed_b64,
p_width,
p_height,
original_width,
original_height,
)
def _predict_s2(self, instruction, obs, processed_b64, p_width, p_height, original_width, original_height):
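        """Run one S2-style step: build chat messages with bounded history, call the
        LLM (shrinking the history window on context-length errors), parse the
        tool-call response into pyautogui code, and force FAIL once max_steps is hit."""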
current_step = len(self.actions)
current_history_n = self.max_history_turns
response = None
if self.coordinate_type == "absolute":
resolution_info = f"* The screen's resolution is {p_width}x{p_height}."
else:
resolution_info = "* The screen's resolution is 1000x1000."
description_prompt = S2_DESCRIPTION_PROMPT_TEMPLATE.format(resolution_info=resolution_info)
tools_def = build_s2_tools_def(description_prompt)
system_prompt = S2_SYSTEM_PROMPT.format(tools_xml=json.dumps(tools_def))
# Retry loop for context length
while True:
messages = self._build_s2_messages(
instruction,
processed_b64,
current_step,
current_history_n,
system_prompt
)
try:
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens,
"top_p": self.top_p,
"temperature": self.temperature,
})
break
except Exception as e:
# Handle Context Too Large
if self._should_giveup_on_context_error(e) and current_history_n > 0:
current_history_n -= 1
logger.warning(f"Context too large, retrying with history_n={current_history_n}")
else:
logger.error(f"Error in predict: {e}")
break
self.responses.append(response)
low_level_instruction, pyautogui_code = self._parse_response_s2(
response, p_width, p_height, original_width, original_height
)
        # Enforce the step budget: force a FAIL termination once max_steps is reached.
current_step = len(self.actions) + 1
first_action = pyautogui_code[0] if pyautogui_code else ""
if current_step >= self.max_steps and str(first_action).upper() not in ("DONE", "FAIL"):
logger.warning(f"Reached maximum steps {self.max_steps}. Forcing termination with FAIL.")
low_level_instruction = "Fail the task because reaching the maximum step limit."
pyautogui_code = ["FAIL"]
logger.info(f"Low level instruction: {low_level_instruction}")
logger.info(f"Pyautogui code: {pyautogui_code}")
self.actions.append(low_level_instruction)
return response, pyautogui_code
def _build_s2_messages(self, instruction, current_img, step, history_n, system_prompt):
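        """Assemble the S2 chat messages: system prompt, up to ``history_n`` prior
        (screenshot, response) turns, then the current screenshot. The instruction and
        the summary of older actions travel with the oldest screenshot still in
        context (or with the current one when there is no history)."""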
messages = [{"role": "system", "content": [{"type": "text", "text": system_prompt}]}]
previous_actions = []
history_start_idx = max(0, step - history_n)
for i in range(history_start_idx):
if i < len(self.actions):
previous_actions.append(f"Step {i+1}: {self.actions[i]}")
previous_actions_str = "\n".join(previous_actions) if previous_actions else "None"
# Add History
history_len = min(history_n, len(self.responses))
if history_len > 0:
hist_responses = self.responses[-history_len:]
hist_imgs = self.screenshots[-history_len-1:-1]
for i in range(history_len):
if i < len(hist_imgs):
screenshot_b64 = hist_imgs[i]
if i == 0:
# First history item: Inject Instruction + Previous Actions Context
img_url = f"data:image/png;base64,{screenshot_b64}"
instruction_prompt = f"""
Please generate the next move according to the UI screenshot, instruction and previous actions.
Instruction: {instruction}
Previous actions:
{previous_actions_str}"""
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": img_url}},
{"type": "text", "text": instruction_prompt}
]
})
else:
img_url = f"data:image/png;base64,{screenshot_b64}"
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": img_url}},
]
})
messages.append({
"role": "assistant",
"content": [{"type": "text", "text": hist_responses[i]}]
})
# Current Turn
# We re-use previous_actions_str logic for the case where history_len == 0
if history_len == 0:
# First turn logic: Include Instruction + Previous Actions
instruction_prompt = f"""
Please generate the next move according to the UI screenshot, instruction and previous actions.
Instruction: {instruction}
Previous actions:
{previous_actions_str}"""
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{current_img}"}},
{"type": "text", "text": instruction_prompt}
]
})
else:
# Subsequent turns logic (context already in first history message): Image Only
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{current_img}"}}
]
})
return messages
def _parse_response_s2(
self,
response: str,
processed_width: int = None,
processed_height: int = None,
original_width: Optional[int] = None,
original_height: Optional[int] = None,
) -> Tuple[str, List[str]]:
"""
Parse LLM response and convert it to low level action and pyautogui code.
"""
# Prefer the real screenshot resolution (passed from predict), fallback to configured screen_size.
if not (original_width and original_height):
original_width, original_height = self.screen_size
low_level_instruction = ""
pyautogui_code: List[str] = []
if response is None or not response.strip():
return low_level_instruction, pyautogui_code
def adjust_coordinates(x: float, y: float) -> Tuple[int, int]:
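            """Map model-emitted coordinates onto the original screenshot resolution
            (absolute: scaled from the processed image size; relative: scaled from
            the 0..999 grid)."""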
if not (original_width and original_height):
return int(x), int(y)
if self.coordinate_type == "absolute":
# scale from processed pixels to original
if processed_width and processed_height:
x_scale = original_width / processed_width
y_scale = original_height / processed_height
return int(x * x_scale), int(y * y_scale)
return int(x), int(y)
# relative: scale from 0..999 grid
x_scale = original_width / 999
y_scale = original_height / 999
return int(x * x_scale), int(y * y_scale)
def process_tool_call(json_str: str) -> None:
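            """Decode one JSON tool call and translate the ``computer_use`` action
            into pyautogui code strings (or the WAIT / DONE / FAIL markers)."""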
try:
tool_call = json.loads(json_str)
if tool_call.get("name") == "computer_use":
args = tool_call["arguments"]
action = args["action"]
def _clean_keys(raw_keys):
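                        """Strip stray ``keys=[...]`` wrappers and quotes the model sometimes emits around key names."""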
keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
cleaned_keys = []
for key in keys:
if isinstance(key, str):
if key.startswith("keys=["):
key = key[6:]
if key.endswith("]"):
key = key[:-1]
if key.startswith("['") or key.startswith('["'):
key = key[2:] if len(key) > 2 else key
if key.endswith("']") or key.endswith('"]'):
key = key[:-2] if len(key) > 2 else key
key = key.strip()
cleaned_keys.append(key)
else:
cleaned_keys.append(key)
return cleaned_keys
if action == "left_click" or action == "click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(f"pyautogui.click({adj_x}, {adj_y})")
else:
pyautogui_code.append("pyautogui.click()")
elif action == "right_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.rightClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.rightClick()")
elif action == "middle_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.middleClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.middleClick()")
elif action == "double_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.doubleClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.doubleClick()")
elif action == "triple_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.tripleClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.tripleClick()")
elif action == "type":
text = args.get("text", "")
try:
text = text.encode('latin-1', 'backslashreplace').decode('unicode_escape')
except Exception as e:
logger.error(f"Failed to unescape text: {e}")
logger.info(f"Pyautogui code[before rewrite]: {text}")
result = ""
for char in text:
if char == '\n':
result += "pyautogui.press('enter')\n"
elif char == "'":
result += 'pyautogui.press("\'")\n'
elif char == '\\':
result += "pyautogui.press('\\\\')\n"
elif char == '"':
result += "pyautogui.press('\"')\n"
else:
result += f"pyautogui.press('{char}')\n"
pyautogui_code.append(result)
logger.info(f"Pyautogui code[after rewrite]: {pyautogui_code}")
elif action == "key":
keys = _clean_keys(args.get("keys", []))
keys_str = ", ".join([f"'{key}'" for key in keys])
if len(keys) > 1:
pyautogui_code.append(f"pyautogui.hotkey({keys_str})")
else:
pyautogui_code.append(f"pyautogui.press({keys_str})")
elif action == "key_down":
keys = _clean_keys(args.get("keys", []))
for k in keys:
pyautogui_code.append(f"pyautogui.keyDown('{k}')")
elif action == "key_up":
keys = _clean_keys(args.get("keys", []))
for k in reversed(keys):
pyautogui_code.append(f"pyautogui.keyUp('{k}')")
elif action == "scroll":
pixels = args.get("pixels", 0)
pyautogui_code.append(f"pyautogui.scroll({pixels})")
elif action == "wait":
pyautogui_code.append("WAIT")
elif action == "terminate":
# Termination should respect status:
# - success -> DONE
# - failure -> FAIL
# Backward compatible: missing status defaults to success.
status = args.get("status", "success")
if str(status).lower() == "failure":
pyautogui_code.append("FAIL")
else:
pyautogui_code.append("DONE")
elif action == "mouse_move":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.moveTo({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.moveTo(0, 0)")
elif action == "left_click_drag":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
duration = args.get("duration", 0.5)
pyautogui_code.append(
f"pyautogui.dragTo({adj_x}, {adj_y}, duration={duration})"
)
else:
pyautogui_code.append("pyautogui.dragTo(0, 0)")
except (json.JSONDecodeError, KeyError) as e:
logger.error(f"Failed to parse tool call: {e}")
lines = response.split("\n")
inside_tool_call = False
current_tool_call: List[str] = []
for line in lines:
line = line.strip()
if not line:
continue
if line.lower().startswith(("action:")):
if not low_level_instruction:
low_level_instruction = line.split("Action:")[-1].strip()
continue
if line.startswith("<tool_call>"):
inside_tool_call = True
continue
elif line.startswith("</tool_call>"):
if current_tool_call:
process_tool_call("\n".join(current_tool_call))
current_tool_call = []
inside_tool_call = False
continue
if inside_tool_call:
current_tool_call.append(line)
continue
if line.startswith("{") and line.endswith("}"):
try:
json_obj = json.loads(line)
if "name" in json_obj and "arguments" in json_obj:
process_tool_call(line)
except json.JSONDecodeError:
pass
if current_tool_call:
process_tool_call("\n".join(current_tool_call))
if not low_level_instruction and len(pyautogui_code) > 0:
first_action = pyautogui_code[0]
if "." in first_action:
action_type = first_action.split(".", 1)[1].split("(", 1)[0]
else:
action_type = first_action.lower()
low_level_instruction = f"Performing {action_type} action"
return low_level_instruction, pyautogui_code
def _predict_s1(self, instruction, obs, processed_b64):
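        """Run one S1-style step: replay prior steps as chat history (recent steps
        with screenshots, older steps collapsed into a single assistant message),
        query the LLM, and parse the Observation/Thought/Action sections and code block."""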
messages = [{"role": "system", "content": S1_SYSTEM_PROMPT.format(password=self.password)}]
# Reconstruct History Logic for S1 mode
history_step_texts = []
for i in range(len(self.actions)):
cot = self.cots[i] if i < len(self.cots) else {}
# Step Content string
step_content = S1_STEP_TEMPLATE.format(step_num=i+1) + S1_ACTION_HISTORY_TEMPLATE.format(action=cot.get('action', ''))
if i > len(self.actions) - self.max_history_turns:
# Recent history: Add User(Image) and Assistant(Text)
if i < len(self.screenshots) - 1: # Screenshot exists for this step
img = self.screenshots[i]
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{img}"}}
]
})
messages.append({"role": "assistant", "content": step_content})
else:
# Old history: Collect text
history_step_texts.append(step_content)
# If this is the last step before the recent window, flush collected texts
if i == len(self.actions) - self.max_history_turns:
messages.append({
"role": "assistant",
"content": "\n".join(history_step_texts)
})
# Current
messages.append({
"role": "user",
"content": [
{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{processed_b64}"}},
{"type": "text", "text": S1_INSTRUTION_TEMPLATE.format(instruction=instruction)}
]
})
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
})
low_level, codes, cot_data = self._parse_response_s1(response)
self.observations.append(obs)
self.cots.append(cot_data)
self.actions.append(low_level)
self.responses.append(response)
return response, codes
def _parse_response_s1(self, response):
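        """Extract the Observation/Thought/Action sections and the last fenced code
        block from an S1 response, then map it to DONE / FAIL / WAIT markers or to
        coordinate-projected pyautogui code."""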
sections = {}
# Simple Regex Parsing
for key, pattern in [
('observation', r'#{1,2}\s*Observation\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)'),
('thought', r'#{1,2}\s*Thought\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)'),
('action', r'#{1,2}\s*Action\s*:?[\n\r]+(.*?)(?=^#{1,2}\s|$)')
]:
m = re.search(pattern, response, re.DOTALL | re.MULTILINE)
            if m:
                sections[key] = m.group(1).strip()
code_blocks = re.findall(r'```(?:code|python)?\s*(.*?)\s*```', response, re.DOTALL | re.IGNORECASE)
code = code_blocks[-1].strip() if code_blocks else "FAIL"
sections['code'] = code
# Post-process code
if "computer.terminate" in code:
final_code = ["DONE"] if "success" in code.lower() else ["FAIL"]
elif "computer.wait" in code:
final_code = ["WAIT"]
else:
# Project coordinates
code = project_coordinate_to_absolute_scale(
code,
self.screen_size[0],
self.screen_size[1],
self.coordinate_type,
self.resize_factor
)
logger.info(f"[rewrite before]: {code}")
final_code = [rewrite_pyautogui_text_inputs(code)]
logger.info(f"[rewrite after]: {final_code}")
return sections.get('action', 'Acting'), final_code, sections
@staticmethod
def _should_giveup_on_context_error(e):
"""对于 context length 相关的错误,立即放弃重试,交给外层处理"""
error_str = str(e)
return "Too Large" in error_str or "context_length_exceeded" in error_str or "413" in error_str
@backoff.on_exception(backoff.constant, Exception, interval=30, max_tries=10, giveup=_should_giveup_on_context_error.__func__)
def call_llm(self, payload):
"""Unified OpenAI-compatible API call"""
# Get env vars
base_url = os.environ.get("OPENAI_BASE_URL", "url-xxx")
api_key = os.environ.get("OPENAI_API_KEY", "sk-xxx")
client = openai.OpenAI(base_url=base_url, api_key=api_key)
messages = payload["messages"]
log_messages(messages, "LLM Request")
params = {
"model": payload["model"],
"messages": messages,
"max_tokens": payload["max_tokens"],
"temperature": self.temperature,
"top_p": self.top_p
}
try:
resp = client.chat.completions.create(**params)
content = resp.choices[0].message.content
logger.info(f"LLM Response:\n{content}")
return content
except Exception as e:
logger.error(f"LLM Call failed: {e}")
raise e
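# Minimal usage sketch (illustrative only): it assumes an OpenAI-compatible endpoint
# is reachable via the OPENAI_BASE_URL / OPENAI_API_KEY environment variables and
# that ``png_bytes`` holds raw screenshot bytes, as expected by ``predict``:
#
#     agent = EvoCUAAgent(model="EvoCUA-S2", prompt_style="S2")
#     agent.reset()
#     response, actions = agent.predict("Open the file manager", {"screenshot": png_bytes})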