EvoCUA Update (2025.01.05) (#412)

* evocua init

* setup max_token

* evocua update

---------

Co-authored-by: xuetaofeng <xuetaofeng@meituan.com>
Co-authored-by: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
This commit is contained in:
蘑菇先生
2026-01-05 16:14:53 +08:00
committed by GitHub
parent 439e178a2e
commit 5ef8bdfa35
3 changed files with 110 additions and 24 deletions

View File

@@ -317,7 +317,26 @@ Previous actions:
args = tool_call["arguments"] args = tool_call["arguments"]
action = args["action"] action = args["action"]
if action == "left_click": def _clean_keys(raw_keys):
keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
cleaned_keys = []
for key in keys:
if isinstance(key, str):
if key.startswith("keys=["):
key = key[6:]
if key.endswith("]"):
key = key[:-1]
if key.startswith("['") or key.startswith('["'):
key = key[2:] if len(key) > 2 else key
if key.endswith("']") or key.endswith('"]'):
key = key[:-2] if len(key) > 2 else key
key = key.strip()
cleaned_keys.append(key)
else:
cleaned_keys.append(key)
return cleaned_keys
if action == "left_click" or action == "click":
if "coordinate" in args: if "coordinate" in args:
x, y = args["coordinate"] x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y) adj_x, adj_y = adjust_coordinates(x, y)
@@ -355,6 +374,16 @@ Previous actions:
else: else:
pyautogui_code.append("pyautogui.doubleClick()") pyautogui_code.append("pyautogui.doubleClick()")
elif action == "triple_click":
if "coordinate" in args:
x, y = args["coordinate"]
adj_x, adj_y = adjust_coordinates(x, y)
pyautogui_code.append(
f"pyautogui.tripleClick({adj_x}, {adj_y})"
)
else:
pyautogui_code.append("pyautogui.tripleClick()")
elif action == "type": elif action == "type":
text = args.get("text", "") text = args.get("text", "")
@@ -383,24 +412,7 @@ Previous actions:
elif action == "key": elif action == "key":
keys = args.get("keys", []) keys = _clean_keys(args.get("keys", []))
if isinstance(keys, list):
cleaned_keys = []
for key in keys:
if isinstance(key, str):
if key.startswith("keys=["):
key = key[6:]
if key.endswith("]"):
key = key[:-1]
if key.startswith("['") or key.startswith('["'):
key = key[2:] if len(key) > 2 else key
if key.endswith("']") or key.endswith('"]'):
key = key[:-2] if len(key) > 2 else key
key = key.strip()
cleaned_keys.append(key)
else:
cleaned_keys.append(key)
keys = cleaned_keys
keys_str = ", ".join([f"'{key}'" for key in keys]) keys_str = ", ".join([f"'{key}'" for key in keys])
if len(keys) > 1: if len(keys) > 1:
@@ -408,6 +420,16 @@ Previous actions:
else: else:
pyautogui_code.append(f"pyautogui.press({keys_str})") pyautogui_code.append(f"pyautogui.press({keys_str})")
elif action == "key_down":
keys = _clean_keys(args.get("keys", []))
for k in keys:
pyautogui_code.append(f"pyautogui.keyDown('{k}')")
elif action == "key_up":
keys = _clean_keys(args.get("keys", []))
for k in reversed(keys):
pyautogui_code.append(f"pyautogui.keyUp('{k}')")
elif action == "scroll": elif action == "scroll":
pixels = args.get("pixels", 0) pixels = args.get("pixels", 0)
pyautogui_code.append(f"pyautogui.scroll({pixels})") pyautogui_code.append(f"pyautogui.scroll({pixels})")
@@ -416,7 +438,15 @@ Previous actions:
pyautogui_code.append("WAIT") pyautogui_code.append("WAIT")
elif action == "terminate": elif action == "terminate":
pyautogui_code.append("DONE") # Termination should respect status:
# - success -> DONE
# - failure -> FAIL
# Backward compatible: missing status defaults to success.
status = args.get("status", "success")
if str(status).lower() == "failure":
pyautogui_code.append("FAIL")
else:
pyautogui_code.append("DONE")
elif action == "mouse_move": elif action == "mouse_move":
if "coordinate" in args: if "coordinate" in args:
@@ -481,7 +511,11 @@ Previous actions:
process_tool_call("\n".join(current_tool_call)) process_tool_call("\n".join(current_tool_call))
if not low_level_instruction and len(pyautogui_code) > 0: if not low_level_instruction and len(pyautogui_code) > 0:
action_type = pyautogui_code[0].split(".", 1)[1].split("(", 1)[0] first_action = pyautogui_code[0]
if "." in first_action:
action_type = first_action.split(".", 1)[1].split("(", 1)[0]
else:
action_type = first_action.lower()
low_level_instruction = f"Performing {action_type} action" low_level_instruction = f"Performing {action_type} action"
return low_level_instruction, pyautogui_code return low_level_instruction, pyautogui_code

View File

@@ -60,6 +60,8 @@ S1_ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
# S2 Prompts # S2 Prompts
S2_ACTION_DESCRIPTION = """ S2_ACTION_DESCRIPTION = """
* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order. * `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
* `key_down`: Press and HOLD the specified key(s) down in order (no release). Use this for stateful holds like holding Shift while clicking.
* `key_up`: Release the specified key(s) in reverse order.
* `type`: Type a string of text on the keyboard. * `type`: Type a string of text on the keyboard.
* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen. * `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
* `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen. * `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
@@ -67,7 +69,7 @@ S2_ACTION_DESCRIPTION = """
* `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen. * `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
* `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen. * `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
* `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen. * `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action). * `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
* `scroll`: Performs a scroll of the mouse scroll wheel. * `scroll`: Performs a scroll of the mouse scroll wheel.
* `hscroll`: Performs a horizontal scroll (mapped to regular scroll). * `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
* `wait`: Wait specified seconds for the change to happen. * `wait`: Wait specified seconds for the change to happen.
@@ -76,7 +78,7 @@ S2_ACTION_DESCRIPTION = """
""" """
S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots. S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots.
* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications. * This is an interface to a desktop GUI. You must click on desktop icons to start applications.
* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot. * Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
{resolution_info} {resolution_info}
* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor. * Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
@@ -122,7 +124,8 @@ def build_s2_tools_def(description_prompt):
"action": { "action": {
"description": S2_ACTION_DESCRIPTION, "description": S2_ACTION_DESCRIPTION,
"enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag",
"right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "right_click", "middle_click", "double_click", "triple_click", "scroll",
"wait", "terminate", "key_down", "key_up"],
"type": "string" "type": "string"
}, },
"keys": {"description": "Required only by `action=key`.", "type": "array"}, "keys": {"description": "Required only by `action=key`.", "type": "array"},

View File

@@ -19,6 +19,7 @@
--test_all_meta_path evaluation_examples/test_nogdrive.json \ --test_all_meta_path evaluation_examples/test_nogdrive.json \
--max_steps 50 \ --max_steps 50 \
--num_envs 30 \ --num_envs 30 \
--temperature 0.01 \
--max_history_turns 4 \ --max_history_turns 4 \
--coordinate_type relative \ --coordinate_type relative \
--resize_factor 32 \ --resize_factor 32 \
@@ -63,6 +64,42 @@ active_environments = []
processes = [] processes = []
is_terminating = False is_terminating = False
# Thread-local storage for task context (works per-process in multiprocessing)
import threading

_task_context = threading.local()


def get_task_context() -> dict:
    """Return the task context stored for the calling thread.

    Returns:
        A dict with ``domain`` and ``example_id`` keys. Both values are
        ``None`` when no context has been set on this thread.
    """
    return getattr(_task_context, 'context', {'domain': None, 'example_id': None})


def set_task_context(domain: str, example_id: str) -> None:
    """Store ``domain`` and ``example_id`` as the calling thread's task context."""
    _task_context.context = {'domain': domain, 'example_id': example_id}


def clear_task_context() -> None:
    """Remove the calling thread's task context; a no-op when none is set."""
    if hasattr(_task_context, 'context'):
        delattr(_task_context, 'context')


class TaskContextFilter(logging.Filter):
    """Logging filter that tags records with the current task context.

    Every record gets ``domain`` and ``example_id`` attributes. When both
    values are set, string messages are also prefixed with
    ``[domain/example_id]``. The filter never rejects a record.
    """

    def filter(self, record) -> bool:
        """Annotate ``record`` with the task context and always return True."""
        ctx = get_task_context()
        domain = ctx.get('domain')
        example_id = ctx.get('example_id')

        if not (domain and example_id):
            record.domain = domain or "N/A"
            record.example_id = example_id or "N/A"
            return True

        record.domain = domain
        record.example_id = example_id

        msg = getattr(record, 'msg', None)
        if isinstance(msg, str):
            prefix = f"[{domain}/{example_id}]"
            # LogRecord.getMessage() applies ``msg % args`` only when args is
            # truthy. In that case a literal '%' in the prefix would be read
            # as a format directive, so it has to be doubled.
            if getattr(record, 'args', None):
                prefix = prefix.replace("%", "%%")
            # The same filter instance may be attached to several handlers,
            # so the record can pass through here more than once; only
            # prefix it the first time.
            if not msg.startswith(prefix):
                record.msg = f"{prefix} {msg}"
        return True
# load the environment variables from .env file # load the environment variables from .env file
if os.path.exists(".env"): if os.path.exists(".env"):
from dotenv import load_dotenv from dotenv import load_dotenv
@@ -169,6 +206,12 @@ file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter) debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter) stdout_handler.setFormatter(formatter)
# Add task context filter to all handlers
task_filter = TaskContextFilter()
file_handler.addFilter(task_filter)
debug_handler.addFilter(task_filter)
stdout_handler.addFilter(task_filter)
stdout_handler.addFilter(logging.Filter("desktopenv")) stdout_handler.addFilter(logging.Filter("desktopenv"))
logger.addHandler(file_handler) logger.addHandler(file_handler)
@@ -213,6 +256,7 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
enable_proxy=True, enable_proxy=True,
client_password=args.client_password client_password=args.client_password
) )
active_environments.append(env) active_environments.append(env)
logger.info(f"Process {current_process().name} started.") logger.info(f"Process {current_process().name} started.")
@@ -222,6 +266,7 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
except Exception: except Exception:
break break
domain, example_id = item domain, example_id = item
set_task_context(domain, example_id)
try: try:
config_file = os.path.join( config_file = os.path.join(
args.test_config_base_dir, f"examples/{domain}/{example_id}.json" args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
@@ -273,12 +318,14 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
import traceback import traceback
logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}") logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}")
logger.error(traceback.format_exc()) logger.error(traceback.format_exc())
try: try:
env.controller.end_recording( env.controller.end_recording(
os.path.join(example_result_dir, "recording.mp4") os.path.join(example_result_dir, "recording.mp4")
) )
except Exception as rec_e: except Exception as rec_e:
logger.error(f"Failed to end recording: {rec_e}") logger.error(f"Failed to end recording: {rec_e}")
with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f: with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"})) f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"}))
f.write("\n") f.write("\n")
@@ -286,6 +333,8 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
logger.error(f"Task-level error in {current_process().name}: {e}") logger.error(f"Task-level error in {current_process().name}: {e}")
import traceback import traceback
logger.error(traceback.format_exc()) logger.error(traceback.format_exc())
finally:
clear_task_context()
except Exception as e: except Exception as e:
logger.error(f"Process-level error in {current_process().name}: {e}") logger.error(f"Process-level error in {current_process().name}: {e}")
import traceback import traceback