EvoCUA Update (2025.01.05) (#412)

* evocua init * setup max_token * evocua update --------- Co-authored-by: xuetaofeng <xuetaofeng@meituan.com> Co-authored-by: Tianbao Xie <47296835+Timothyxxx@users.noreply.github.com>
2026-01-05 16:14:53 +08:00
parent 439e178a2e
commit 5ef8bdfa35
3 changed files with 110 additions and 24 deletions
--- a/mm_agents/evocua/evocua_agent.py
+++ b/mm_agents/evocua/evocua_agent.py
@@ -317,7 +317,26 @@ Previous actions:
                    args = tool_call["arguments"]
                    action = args["action"]
-                    if action == "left_click":
+                    def _clean_keys(raw_keys):
                        keys = raw_keys if isinstance(raw_keys, list) else [raw_keys]
                        cleaned_keys = []
                        for key in keys:
                            if isinstance(key, str):
                                if key.startswith("keys=["):
                                    key = key[6:]
                                if key.endswith("]"):
                                    key = key[:-1]
                                if key.startswith("['") or key.startswith('["'):
                                    key = key[2:] if len(key) > 2 else key
                                if key.endswith("']") or key.endswith('"]'):
                                    key = key[:-2] if len(key) > 2 else key
                                key = key.strip()
                                cleaned_keys.append(key)
                            else:
                                cleaned_keys.append(key)
                        return cleaned_keys
                    if action == "left_click" or action == "click":
                        if "coordinate" in args:
                            x, y = args["coordinate"]
                            adj_x, adj_y = adjust_coordinates(x, y)
@@ -355,6 +374,16 @@ Previous actions:
                        else:
                            pyautogui_code.append("pyautogui.doubleClick()")
                    elif action == "triple_click":
                        if "coordinate" in args:
                            x, y = args["coordinate"]
                            adj_x, adj_y = adjust_coordinates(x, y)
                            pyautogui_code.append(
                                f"pyautogui.tripleClick({adj_x}, {adj_y})"
                            )
                        else:
                            pyautogui_code.append("pyautogui.tripleClick()")
                    elif action == "type":
                        text = args.get("text", "")
@@ -383,24 +412,7 @@ Previous actions:
                    elif action == "key":
-                        keys = args.get("keys", [])
+                        keys = _clean_keys(args.get("keys", []))
                        if isinstance(keys, list):
                            cleaned_keys = []
                            for key in keys:
                                if isinstance(key, str):
                                    if key.startswith("keys=["):
                                        key = key[6:]
                                    if key.endswith("]"):
                                        key = key[:-1]
                                    if key.startswith("['") or key.startswith('["'):
                                        key = key[2:] if len(key) > 2 else key
                                    if key.endswith("']") or key.endswith('"]'):
                                        key = key[:-2] if len(key) > 2 else key
                                    key = key.strip()
                                    cleaned_keys.append(key)
                                else:
                                    cleaned_keys.append(key)
                            keys = cleaned_keys
                        keys_str = ", ".join([f"'{key}'" for key in keys])
                        if len(keys) > 1:
@@ -408,6 +420,16 @@ Previous actions:
                        else:
                            pyautogui_code.append(f"pyautogui.press({keys_str})")
                    elif action == "key_down":
                        keys = _clean_keys(args.get("keys", []))
                        for k in keys:
                            pyautogui_code.append(f"pyautogui.keyDown('{k}')")
                    elif action == "key_up":
                        keys = _clean_keys(args.get("keys", []))
                        for k in reversed(keys):
                            pyautogui_code.append(f"pyautogui.keyUp('{k}')")
                    elif action == "scroll":
                        pixels = args.get("pixels", 0)
                        pyautogui_code.append(f"pyautogui.scroll({pixels})")
@@ -416,7 +438,15 @@ Previous actions:
                        pyautogui_code.append("WAIT")
                    elif action == "terminate":
-                        pyautogui_code.append("DONE")
+                        # Termination should respect status:
                        # - success -> DONE
                        # - failure -> FAIL
                        # Backward compatible: missing status defaults to success.
                        status = args.get("status", "success")
                        if str(status).lower() == "failure":
                            pyautogui_code.append("FAIL")
                        else:
                            pyautogui_code.append("DONE")
                    elif action == "mouse_move":
                        if "coordinate" in args:
@@ -481,7 +511,11 @@ Previous actions:
            process_tool_call("\n".join(current_tool_call))
        if not low_level_instruction and len(pyautogui_code) > 0:
-            action_type = pyautogui_code[0].split(".", 1)[1].split("(", 1)[0]
+            first_action = pyautogui_code[0]
            if "." in first_action:
                action_type = first_action.split(".", 1)[1].split("(", 1)[0]
            else:
                action_type = first_action.lower()
            low_level_instruction = f"Performing {action_type} action"
        return low_level_instruction, pyautogui_code
--- a/mm_agents/evocua/prompts.py
+++ b/mm_agents/evocua/prompts.py
@@ -60,6 +60,8 @@ S1_ACTION_HISTORY_TEMPLATE = "## Action:\n{action}\n"
 # S2 Prompts
 S2_ACTION_DESCRIPTION = """
 * `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.
 * `key_down`: Press and HOLD the specified key(s) down in order (no release). Use this for stateful holds like holding Shift while clicking.
 * `key_up`: Release the specified key(s) in reverse order.
 * `type`: Type a string of text on the keyboard.
 * `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.
 * `left_click`: Click the left mouse button at a specified (x, y) pixel coordinate on the screen.
@@ -67,7 +69,7 @@ S2_ACTION_DESCRIPTION = """
 * `right_click`: Click the right mouse button at a specified (x, y) pixel coordinate on the screen.
 * `middle_click`: Click the middle mouse button at a specified (x, y) pixel coordinate on the screen.
 * `double_click`: Double-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
-* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen (simulated as double-click since it's the closest action).
+* `triple_click`: Triple-click the left mouse button at a specified (x, y) pixel coordinate on the screen.
 * `scroll`: Performs a scroll of the mouse scroll wheel.
 * `hscroll`: Performs a horizontal scroll (mapped to regular scroll).
 * `wait`: Wait specified seconds for the change to happen.
@@ -76,7 +78,7 @@ S2_ACTION_DESCRIPTION = """
 """
 S2_DESCRIPTION_PROMPT_TEMPLATE = """Use a mouse and keyboard to interact with a computer, and take screenshots.
-* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.
+* This is an interface to a desktop GUI. You must click on desktop icons to start applications.
 * Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.
 {resolution_info}
 * Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.
@@ -122,7 +124,8 @@ def build_s2_tools_def(description_prompt):
                    "action": {
                        "description": S2_ACTION_DESCRIPTION,
                        "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", 
-                                 "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], 
+                                 "right_click", "middle_click", "double_click", "triple_click", "scroll", 
                                 "wait", "terminate", "key_down", "key_up"], 
                        "type": "string"
                    },
                    "keys": {"description": "Required only by `action=key`.", "type": "array"}, 
--- a/run_multienv_evocua.py
+++ b/run_multienv_evocua.py
@@ -19,6 +19,7 @@
            --test_all_meta_path evaluation_examples/test_nogdrive.json \
            --max_steps 50 \
            --num_envs 30 \
            --temperature 0.01 \
            --max_history_turns 4 \
            --coordinate_type relative \
            --resize_factor 32 \
@@ -63,6 +64,42 @@ active_environments = []
 processes = []
 is_terminating = False
 # Thread-local storage for task context (works per-process in multiprocessing)
 import threading
 _task_context = threading.local()
 def get_task_context():
    """Return the task context stored for the current thread.

    When no context has been set, a dict with both ``domain`` and
    ``example_id`` set to ``None`` is returned instead.
    """
    try:
        return _task_context.context
    except AttributeError:
        return {'domain': None, 'example_id': None}
 def set_task_context(domain: str, example_id: str):
    """Record ``domain`` and ``example_id`` as the current thread's task context."""
    context = {'domain': domain, 'example_id': example_id}
    _task_context.context = context
 def clear_task_context():
    """Remove the current thread's task context, if one is set."""
    try:
        del _task_context.context
    except AttributeError:
        # No context was stored for this thread; nothing to clear.
        pass
 class TaskContextFilter(logging.Filter):
    """Logging filter that tags each record with the current task context.

    Every record gets ``domain`` and ``example_id`` attributes. When both
    are known, string messages are also prefixed with
    ``[domain/example_id]``.
    """

    def filter(self, record):
        """Annotate ``record`` with the task context; never drops a record.

        Returns:
            Always ``True``, so the record continues to be handled.
        """
        ctx = get_task_context()
        domain, example_id = ctx.get('domain'), ctx.get('example_id')

        if not (domain and example_id):
            # Incomplete context: fall back to a placeholder for the missing part.
            record.domain = domain or "N/A"
            record.example_id = example_id or "N/A"
            return True

        record.domain = domain
        record.example_id = example_id
        # Prefix the message once; the startswith check avoids a doubled
        # prefix when the same record is filtered more than once.
        prefix = f"[{domain}/{example_id}]"
        message = getattr(record, 'msg', None)
        if isinstance(message, str) and not message.startswith(prefix):
            record.msg = f"{prefix} {message}"
        return True
 # load the environment variables from .env file
 if os.path.exists(".env"):
    from dotenv import load_dotenv
@@ -169,6 +206,12 @@ file_handler.setFormatter(formatter)
 debug_handler.setFormatter(formatter)
 stdout_handler.setFormatter(formatter)
 # Add task context filter to all handlers
 task_filter = TaskContextFilter()
 file_handler.addFilter(task_filter)
 debug_handler.addFilter(task_filter)
 stdout_handler.addFilter(task_filter)
 stdout_handler.addFilter(logging.Filter("desktopenv"))
 logger.addHandler(file_handler)
@@ -213,6 +256,7 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
            enable_proxy=True,
            client_password=args.client_password
        )
        active_environments.append(env)
        logger.info(f"Process {current_process().name} started.")
@@ -222,6 +266,7 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
            except Exception:
                break
            domain, example_id = item
            set_task_context(domain, example_id)
            try:
                config_file = os.path.join(
                    args.test_config_base_dir, f"examples/{domain}/{example_id}.json"
@@ -273,12 +318,14 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
                    import traceback
                    logger.error(f"Exception in {current_process().name} {domain}/{example_id}: {e}")
                    logger.error(traceback.format_exc())
                    try:
                        env.controller.end_recording(
                            os.path.join(example_result_dir, "recording.mp4")
                        )
                    except Exception as rec_e:
                        logger.error(f"Failed to end recording: {rec_e}")
                    with open(os.path.join(example_result_dir, "traj.jsonl"), "a") as f:
                        f.write(json.dumps({"Error": f"{domain}/{example_id} - {e}"}))
                        f.write("\n")
@@ -286,6 +333,8 @@ def run_env_tasks(task_queue: Queue, args: argparse.Namespace, shared_scores: li
                logger.error(f"Task-level error in {current_process().name}: {e}")
                import traceback
                logger.error(traceback.format_exc())
            finally:
                clear_task_context()
    except Exception as e:
        logger.error(f"Process-level error in {current_process().name}: {e}")
        import traceback