feat: 增强任务步骤注入与a11y状态表达，提升树形交互稳定性

- 打通 metadata.steps 传递链路，将任务步骤注入 agent 预测上下文
- 优化 a11y tree 线性化输出：使用中心坐标并新增 states 列（expanded/collapsed/selected 等）
- 放宽可保留节点条件，保留无文本输入类控件（edit/textfield/searchbox 等）
- 强化输出约束：单轮仅允许动作代码或 WAIT/DONE/FAIL，禁止动作与 DONE 同轮返回
- 补充 avogadro 示例步骤：展开 aromatics 并选择 benzene.cjson
2026-02-26 18:56:53 +08:00
parent 07e66490dd
commit b75f6bf341
6 changed files with 54 additions and 11 deletions
--- a/evaluation_examples/examples/avogadro/building-organic-molecules_task1.json
+++ b/evaluation_examples/examples/avogadro/building-organic-molecules_task1.json
@@ -39,6 +39,6 @@
  "possibility_of_env_change": "low",
  "metadata": {
    "input_files": [],
-    "steps": "1. 点击 Build → Insert → Molecule。\n2. 搜索 'benzene' 并确定插入该分子。\n3. 确保苯环显示在工作界面中。"
+    "steps": "1. 点击菜单栏 Build(构建) → Insert(插入) → Molecule(分子…)，打开\"插入片段\"对话框。\n2. 在\"筛选\"输入框中输入 benzene（注意：需要先切换到英文输入法再输入）。\n3. 筛选结果会显示一个 aromatics 文件夹（树形结构），需要双击或点击展开该文件夹。\n4. 展开后选中列表中的 benzene.cjson 文件。\n5. 点击\"插入\"按钮将苯环插入到工作区。\n6. 关闭\"插入片段\"对话框，确认苯环已显示在主工作界面中。"
  }
 }
--- a/lib_run_single.py
+++ b/lib_run_single.py
@@ -9,7 +9,7 @@ from lib_results_logger import log_task_completion
 logger = logging.getLogger("desktopenv.experiment")
-def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores):
+def run_single_example(agent, env, example, max_steps, instruction, args, example_result_dir, scores, metadata_steps=""):
    runtime_logger = setup_logger(example, example_result_dir)
    # Reset environment first to get fresh VM IP
@@ -36,7 +36,8 @@ def run_single_example(agent, env, example, max_steps, instruction, args, exampl
        logger.info(f"Step {step_idx + 1} prediction...")
        response, actions = agent.predict(
            instruction,
-            obs
+            obs,
            metadata_steps=metadata_steps,
        )
        logger.info(f"Response: {response}")
        logger.info(f"Actions: {actions}")
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -114,6 +114,12 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
            and (
                    node.get("name", "") != "" or node.text is not None and len(node.text) > 0 \
                    or check_image and node.get("image", "false") == "true"
                    # Keep empty input fields (edit/textfield) - they are important interactive elements
                    # even without name/text (e.g., search boxes, filter inputs)
                    or node.tag.endswith("edit") or node.tag.endswith("textfield")
                    or node.tag.endswith("textarea") or node.tag.endswith("textbox")
                    or node.tag.endswith("searchbox") or node.tag.endswith("combobox")
                    or node.tag in {"entry", "combo-box", "check-box", "slider"}
            )
    coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(_component_ns), "(-1, -1)"))
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -126,7 +126,7 @@ def linearize_accessibility_tree(accessibility_tree, platform="ubuntu"):
        raise ValueError("Invalid platform, must be 'ubuntu' or 'windows'")
    filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree), platform)
-    linearized_accessibility_tree = ["tag\tname\ttext\tclass\tdescription\tposition (top-left x&y)\tsize (w&h)"]
+    linearized_accessibility_tree = ["tag\tname\ttext\tposition (center x&y)\tsize (w&h)\tstates"]
    # Linearize the accessibility tree nodes into a table format
    for node in filtered_nodes:
@@ -145,14 +145,36 @@ def linearize_accessibility_tree(accessibility_tree, platform="ubuntu"):
        else:
            text = '""'
        # Compute center coordinates from top-left + size/2
        coords_str = node.get('{{{:}}}screencoord'.format(_component_ns), "")
        size_str = node.get('{{{:}}}size'.format(_component_ns), "")
        if coords_str and size_str:
            try:
                cx, cy = coords_str.strip('()').split(', ')
                sw, sh = size_str.strip('()').split(', ')
                center_x = int(cx) + int(sw) // 2
                center_y = int(cy) + int(sh) // 2
                center_str = "({:d}, {:d})".format(center_x, center_y)
            except (ValueError, IndexError):
                center_str = coords_str
        else:
            center_str = coords_str
        # Extract useful UI states (expanded/collapsed/checked/selected/focused)
        state_flags = []
        for state_name in ["expanded", "collapsed", "checked", "selected", "focused", "pressed"]:
            val = node.get("{{{:}}}{:}".format(_state_ns, state_name), "")
            if val == "true":
                state_flags.append(state_name)
        states_str = ",".join(state_flags) if state_flags else ""
        linearized_accessibility_tree.append(
-            "{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format(
+            "{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format(
                node.tag, node.get("name", ""),
                text,
-                node.get("{{{:}}}class".format(_attributes_ns), "") if platform == "ubuntu" else node.get("{{{:}}}class".format(class_ns_windows), ""),
+                center_str,
-                node.get("{{{:}}}description".format(_attributes_ns), ""),
+                size_str,
-                node.get('{{{:}}}screencoord'.format(_component_ns), ""),
+                states_str
                node.get('{{{:}}}size'.format(_component_ns), "")
            )
        )
@@ -332,11 +354,13 @@ class PromptAgent:
        self.system_message = self.system_message.format(CLIENT_PASSWORD=self.client_password, SCREEN_WIDTH=self.screen_width, SCREEN_HEIGHT=self.screen_height)
-    def predict(self, instruction: str, obs: Dict) -> List:
+    def predict(self, instruction: str, obs: Dict, metadata_steps: str = "") -> List:
        """
        Predict the next action(s) based on the current observation.
        """
        system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)
        if metadata_steps:
            system_message += "\n\nHere are the reference steps from the software tutorial, which may help you complete the task:\n{}".format(metadata_steps)
        # Prepare the payload for the API call
        messages = []
--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -15,6 +15,8 @@ When you think you have to wait for some time, return ```WAIT```;
 When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
 When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
 First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
 """.strip()
@@ -36,6 +38,8 @@ When you think you have to wait for some time, return ```WAIT```;
 When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
 When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
 Our past communication is great, and what you have done is very helpful. I will now give you another task to complete.
 First take a deep breath, think step by step, give the current screenshot a thinking, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
@@ -550,6 +554,8 @@ When you think you have to wait for some time, return ```WAIT```;
 When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
 When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
 First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
 """.strip()
@@ -817,6 +823,8 @@ When you think you have to wait for some time, return ```WAIT```;
 When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
 When you think the task is done, return ```DONE```.
 IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
 My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
 First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
 """.strip()
--- a/run.py
+++ b/run.py
@@ -198,8 +198,11 @@ def test(args: argparse.Namespace, test_all_meta: dict) -> None:
            logger.info(f"[Example ID]: {example_id}")
            instruction = example["instruction"]
            metadata_steps = example.get("metadata", {}).get("steps", "")
            logger.info(f"[Instruction]: {instruction}")
            if metadata_steps:
                logger.info(f"[Metadata Steps]: {metadata_steps}")
            # wandb each example config settings
            cfg_args["instruction"] = instruction
            cfg_args["start_time"] = datetime.datetime.now().strftime(
@@ -227,6 +230,7 @@ def test(args: argparse.Namespace, test_all_meta: dict) -> None:
                    args,
                    example_result_dir,
                    scores,
                    metadata_steps=metadata_steps,
                )
            except Exception as e:
                logger.error(f"Exception in {domain}/{example_id}: {e}")