EvoCUA Update (2025.01.05) (#412)

* evocua init

* setup max_token

* evocua update

---------

Co-authored-by: xuetaofeng <xuetaofeng@meituan.com>
Co-authored-by: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
This commit is contained in:
蘑菇先生
2026-01-05 16:14:53 +08:00
committed by GitHub
parent 439e178a2e
commit 5ef8bdfa35
3 changed files with 110 additions and 24 deletions

View File

@@ -317,7 +317,26 @@ Previous actions:
args = tool_call["arguments"]
action = args["action"]
if action == "left_click":
def _clean_keys(raw_keys):
    """Normalize a tool call's ``keys`` argument into a list of key names.

    A non-list value is wrapped in a one-element list. Each string entry has
    serialization residue removed: a leading ``keys=[``, list brackets, and
    the quotes that wrapped the key name. Surrounding whitespace is stripped.
    Non-string entries are passed through unchanged.

    Args:
        raw_keys: A list of keys, or a single key.

    Returns:
        A new list holding one cleaned entry per input key.
    """
    quotes = ("'", '"')
    keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
    cleaned_keys = []
    for key in keys:
        if not isinstance(key, str):
            cleaned_keys.append(key)
            continue

        if key.startswith("keys=["):
            key = key[6:]
        if key.endswith("]"):
            key = key[:-1]
        if key.startswith(("['", '["')) and len(key) > 2:
            key = key[2:]
        if key.endswith(("']", '"]')) and len(key) > 2:
            key = key[:-2]
        key = key.strip()

        # The bare "]" is removed before the quoted-bracket check above, so
        # "['ctrl']" reaches this point as "ctrl'", and "keys=['ctrl'" as
        # "'ctrl'". Trim such leftover wrapping quotes; otherwise they end up
        # inside the quoted key of the generated pyautogui call.
        # The length guard keeps a lone quote character usable as a key.
        if len(key) > 1 and key[0] in quotes:
            key = key[1:]
        if len(key) > 1 and key[-1] in quotes:
            key = key[:-1]

        cleaned_keys.append(key.strip())
    return cleaned_keys
if action == "left_click" or action == "click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
@@ -355,6 +374,16 @@ Previous actions:
else:
pyautogui_code.append("pyautogui.doubleClick()")
elif action == "triple_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.tripleClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.tripleClick()")
elif action == "type":
text = args.get("text", "")
@@ -383,24 +412,7 @@ Previous actions:
elif action == "key":
keys = args.get("keys", [])
if isinstance(keys, list):
cleaned_keys = []
for key in keys:
if isinstance(key, str):
if key.startswith("keys=["):
key = key[6:]
if key.endswith("]"):
key = key[:-1]
if key.startswith("['") or key.startswith('["'):
key = key[2:] if len(key) > 2 else key
if key.endswith("']") or key.endswith('"]'):
key = key[:-2] if len(key) > 2 else key
key = key.strip()
cleaned_keys.append(key)
else:
cleaned_keys.append(key)
keys = cleaned_keys
keys = _clean_keys(args.get("keys", []))
keys_str = ", ".join([f"'{key}'" for key in keys])
if len(keys) > 1:
@@ -408,6 +420,16 @@ Previous actions:
else:
pyautogui_code.append(f"pyautogui.press({keys_str})")
elif action == "key_down":
keys = _clean_keys(args.get("keys", []))
for k in keys:
pyautogui_code.append(f"pyautogui.keyDown('{k}')")
elif action == "key_up":
keys = _clean_keys(args.get("keys", []))
for k in reversed(keys):
pyautogui_code.append(f"pyautogui.keyUp('{k}')")
elif action == "scroll":
pixels = args.get("pixels", 0)
pyautogui_code.append(f"pyautogui.scroll({pixels})")
@@ -416,7 +438,15 @@ Previous actions:
pyautogui_code.append("WAIT")
elif action == "terminate":
pyautogui_code.append("DONE")
# Termination should respect status:
# - success -> DONE
# - failure -> FAIL
# Backward compatible: missing status defaults to success.
status = args.get("status", "success")
if str(status).lower() == "failure":
pyautogui_code.append("FAIL")
else:
pyautogui_code.append("DONE")
elif action == "mouse_move":
if "coordinate" in args:
@@ -481,7 +511,11 @@ Previous actions:
process_tool_call("\n".join(current_tool_call))
if not low_level_instruction and len(pyautogui_code) > 0:
action_type = pyautogui_code[0].split(".", 1)[1].split("(", 1)[0]
first_action = pyautogui_code[0]
if "." in first_action:
action_type = first_action.split(".", 1)[1].split("(", 1)[0]
else:
action_type = first_action.lower()
low_level_instruction = f"Performing {action_type} action"
return low_level_instruction, pyautogui_code

View File

@@ -60,6 +60,8 @@ S1_ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
# S2 Prompts
S2_ACTION_DESCRIPTION = """
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `key_down`: Press and HOLD the specified key(s) down in order (no release). Use this for stateful holds like holding Shift while clicking.
* `key_up`: Release the specified key(s) in reverse order.
* `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
@@ -67,7 +69,7 @@ S2_ACTION_DESCRIPTION = """
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen.
@@ -76,7 +78,7 @@ S2_ACTION_DESCRIPTION = """
"""
S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
* This is an interface to a desktop GUI. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
{resolution_info}
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
@@ -122,7 +124,8 @@ def build_s2_tools_def(description_prompt):
"action": {
"description": S2_ACTION_DESCRIPTION,
"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag",
"right_click", "middle_click", "double_click", "scroll", "wait", "terminate"],
"right_click", "middle_click", "double_click", "triple_click", "scroll",
"wait", "terminate", "key_down", "key_up"],
"type": "string"
},
"keys": {"description": "Required only by `action=key`.", "type": "array"},