EvoCUA Update (2025.01.05) (#412)
* evocua init * setup max_token * evocua update --------- Co-authored-by: xuetaofeng <xuetaofeng@meituan.com> Co-authored-by: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
This commit is contained in:
@@ -317,7 +317,26 @@ Previous actions:
|
||||
args = tool_call["arguments"]
|
||||
action = args["action"]
|
||||
|
||||
if action == "left_click":
|
||||
def _clean_keys(raw_keys):
|
||||
keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
|
||||
cleaned_keys = []
|
||||
for key in keys:
|
||||
if isinstance(key, str):
|
||||
if key.startswith("keys=["):
|
||||
key = key[6:]
|
||||
if key.endswith("]"):
|
||||
key = key[:-1]
|
||||
if key.startswith("['") or key.startswith('["'):
|
||||
key = key[2:] if len(key) > 2 else key
|
||||
if key.endswith("']") or key.endswith('"]'):
|
||||
key = key[:-2] if len(key) > 2 else key
|
||||
key = key.strip()
|
||||
cleaned_keys.append(key)
|
||||
else:
|
||||
cleaned_keys.append(key)
|
||||
return cleaned_keys
|
||||
|
||||
if action == "left_click" or action == "click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
@@ -355,6 +374,16 @@ Previous actions:
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.doubleClick()")
|
||||
|
||||
elif action == "triple_click":
|
||||
if "coordinate" in args:
|
||||
x, y = args["coordinate"]
|
||||
adj_x, adj_y = adjust_coordinates(x, y)
|
||||
pyautogui_code.append(
|
||||
f"pyautogui.tripleClick({adj_x}, {adj_y})"
|
||||
)
|
||||
else:
|
||||
pyautogui_code.append("pyautogui.tripleClick()")
|
||||
|
||||
elif action == "type":
|
||||
text = args.get("text", "")
|
||||
|
||||
@@ -383,24 +412,7 @@ Previous actions:
|
||||
|
||||
|
||||
elif action == "key":
|
||||
keys = args.get("keys", [])
|
||||
if isinstance(keys, list):
|
||||
cleaned_keys = []
|
||||
for key in keys:
|
||||
if isinstance(key, str):
|
||||
if key.startswith("keys=["):
|
||||
key = key[6:]
|
||||
if key.endswith("]"):
|
||||
key = key[:-1]
|
||||
if key.startswith("['") or key.startswith('["'):
|
||||
key = key[2:] if len(key) > 2 else key
|
||||
if key.endswith("']") or key.endswith('"]'):
|
||||
key = key[:-2] if len(key) > 2 else key
|
||||
key = key.strip()
|
||||
cleaned_keys.append(key)
|
||||
else:
|
||||
cleaned_keys.append(key)
|
||||
keys = cleaned_keys
|
||||
keys = _clean_keys(args.get("keys", []))
|
||||
|
||||
keys_str = ", ".join([f"'{key}'" for key in keys])
|
||||
if len(keys) > 1:
|
||||
@@ -408,6 +420,16 @@ Previous actions:
|
||||
else:
|
||||
pyautogui_code.append(f"pyautogui.press({keys_str})")
|
||||
|
||||
elif action == "key_down":
|
||||
keys = _clean_keys(args.get("keys", []))
|
||||
for k in keys:
|
||||
pyautogui_code.append(f"pyautogui.keyDown('{k}')")
|
||||
|
||||
elif action == "key_up":
|
||||
keys = _clean_keys(args.get("keys", []))
|
||||
for k in reversed(keys):
|
||||
pyautogui_code.append(f"pyautogui.keyUp('{k}')")
|
||||
|
||||
elif action == "scroll":
|
||||
pixels = args.get("pixels", 0)
|
||||
pyautogui_code.append(f"pyautogui.scroll({pixels})")
|
||||
@@ -416,7 +438,15 @@ Previous actions:
|
||||
pyautogui_code.append("WAIT")
|
||||
|
||||
elif action == "terminate":
|
||||
pyautogui_code.append("DONE")
|
||||
# Termination should respect status:
|
||||
# - success -> DONE
|
||||
# - failure -> FAIL
|
||||
# Backward compatible: missing status defaults to success.
|
||||
status = args.get("status", "success")
|
||||
if str(status).lower() == "failure":
|
||||
pyautogui_code.append("FAIL")
|
||||
else:
|
||||
pyautogui_code.append("DONE")
|
||||
|
||||
elif action == "mouse_move":
|
||||
if "coordinate" in args:
|
||||
@@ -481,7 +511,11 @@ Previous actions:
|
||||
process_tool_call("\n".join(current_tool_call))
|
||||
|
||||
if not low_level_instruction and len(pyautogui_code) > 0:
|
||||
action_type = pyautogui_code[0].split(".", 1)[1].split("(", 1)[0]
|
||||
first_action = pyautogui_code[0]
|
||||
if "." in first_action:
|
||||
action_type = first_action.split(".", 1)[1].split("(", 1)[0]
|
||||
else:
|
||||
action_type = first_action.lower()
|
||||
low_level_instruction = f"Performing {action_type} action"
|
||||
|
||||
return low_level_instruction, pyautogui_code
|
||||
|
||||
@@ -60,6 +60,8 @@ S1_ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
|
||||
# S2 Prompts
|
||||
S2_ACTION_DESCRIPTION = """
|
||||
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
|
||||
* `key_down`: Press and HOLD the specified key(s) down in order (no release). Use this for stateful holds like holding Shift while clicking.
|
||||
* `key_up`: Release the specified key(s) in reverse order.
|
||||
* `type`: Type a string of text on the keyboard.
|
||||
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
|
||||
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
@@ -67,7 +69,7 @@ S2_ACTION_DESCRIPTION = """
|
||||
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
|
||||
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
|
||||
* `scroll`: Performs a scroll of the mouse scroll wheel.
|
||||
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
|
||||
* `wait`: Wait specified seconds for the change to happen.
|
||||
@@ -76,7 +78,7 @@ S2_ACTION_DESCRIPTION = """
|
||||
"""
|
||||
|
||||
S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots.
|
||||
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
|
||||
* This is an interface to a desktop GUI. You must click on desktop icons to start applications.
|
||||
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
|
||||
{resolution_info}
|
||||
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
|
||||
@@ -122,7 +124,8 @@ def build_s2_tools_def(description_prompt):
|
||||
"action": {
|
||||
"description": S2_ACTION_DESCRIPTION,
|
||||
"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag",
|
||||
"right_click", "middle_click", "double_click", "scroll", "wait", "terminate"],
|
||||
"right_click", "middle_click", "double_click", "triple_click", "scroll",
|
||||
"wait", "terminate", "key_down", "key_up"],
|
||||
"type": "string"
|
||||
},
|
||||
"keys": {"description": "Required only by `action=key`.", "type": "array"},
|
||||
|
||||
Reference in New Issue
Block a user