feat: 增强任务步骤注入与a11y状态表达,提升树形交互稳定性
- 打通 metadata.steps 传递链路,将任务步骤注入 agent 预测上下文
- 优化 a11y tree 线性化输出:使用中心坐标并新增 states 列(expanded/collapsed/selected 等)
- 放宽可保留节点条件,保留无文本输入类控件(edit/textfield/searchbox 等)
- 强化输出约束:单轮仅允许动作代码或 WAIT/DONE/FAIL,禁止动作与 DONE 同轮返回
- 补充 avogadro 示例步骤:展开 aromatics 并选择 benzene.cjson
This commit is contained in:
@@ -114,6 +114,12 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
|
||||
and (
|
||||
node.get("name", "") != "" or node.text is not None and len(node.text) > 0 \
|
||||
or check_image and node.get("image", "false") == "true"
|
||||
# Keep empty input fields (edit/textfield) - they are important interactive elements
|
||||
# even without name/text (e.g., search boxes, filter inputs)
|
||||
or node.tag.endswith("edit") or node.tag.endswith("textfield")
|
||||
or node.tag.endswith("textarea") or node.tag.endswith("textbox")
|
||||
or node.tag.endswith("searchbox") or node.tag.endswith("combobox")
|
||||
or node.tag in {"entry", "combo-box", "check-box", "slider"}
|
||||
)
|
||||
|
||||
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(_component_ns), "(-1, -1)"))
|
||||
|
||||
@@ -126,7 +126,7 @@ def linearize_accessibility_tree(accessibility_tree, platform="ubuntu"):
|
||||
raise ValueError("Invalid platform, must be 'ubuntu' or 'windows'")
|
||||
|
||||
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree), platform)
|
||||
linearized_accessibility_tree = ["tag\tname\ttext\tclass\tdescription\tposition (top-left x&y)\tsize (w&h)"]
|
||||
linearized_accessibility_tree = ["tag\tname\ttext\tposition (center x&y)\tsize (w&h)\tstates"]
|
||||
|
||||
# Linearize the accessibility tree nodes into a table format
|
||||
for node in filtered_nodes:
|
||||
@@ -145,14 +145,36 @@ def linearize_accessibility_tree(accessibility_tree, platform="ubuntu"):
|
||||
else:
|
||||
text = '""'
|
||||
|
||||
# Compute center coordinates from top-left + size/2
|
||||
coords_str = node.get('{{{:}}}screencoord'.format(_component_ns), "")
|
||||
size_str = node.get('{{{:}}}size'.format(_component_ns), "")
|
||||
if coords_str and size_str:
|
||||
try:
|
||||
cx, cy = coords_str.strip('()').split(', ')
|
||||
sw, sh = size_str.strip('()').split(', ')
|
||||
center_x = int(cx) + int(sw) // 2
|
||||
center_y = int(cy) + int(sh) // 2
|
||||
center_str = "({:d}, {:d})".format(center_x, center_y)
|
||||
except (ValueError, IndexError):
|
||||
center_str = coords_str
|
||||
else:
|
||||
center_str = coords_str
|
||||
|
||||
# Extract useful UI states (expanded/collapsed/checked/selected/focused/pressed)
|
||||
state_flags = []
|
||||
for state_name in ["expanded", "collapsed", "checked", "selected", "focused", "pressed"]:
|
||||
val = node.get("{{{:}}}{:}".format(_state_ns, state_name), "")
|
||||
if val == "true":
|
||||
state_flags.append(state_name)
|
||||
states_str = ",".join(state_flags) if state_flags else ""
|
||||
|
||||
linearized_accessibility_tree.append(
|
||||
"{:}\t{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format(
|
||||
"{:}\t{:}\t{:}\t{:}\t{:}\t{:}".format(
|
||||
node.tag, node.get("name", ""),
|
||||
text,
|
||||
node.get("{{{:}}}class".format(_attributes_ns), "") if platform == "ubuntu" else node.get("{{{:}}}class".format(class_ns_windows), ""),
|
||||
node.get("{{{:}}}description".format(_attributes_ns), ""),
|
||||
node.get('{{{:}}}screencoord'.format(_component_ns), ""),
|
||||
node.get('{{{:}}}size'.format(_component_ns), "")
|
||||
center_str,
|
||||
size_str,
|
||||
states_str
|
||||
)
|
||||
)
|
||||
|
||||
@@ -332,11 +354,13 @@ class PromptAgent:
|
||||
|
||||
self.system_message = self.system_message.format(CLIENT_PASSWORD=self.client_password, SCREEN_WIDTH=self.screen_width, SCREEN_HEIGHT=self.screen_height)
|
||||
|
||||
def predict(self, instruction: str, obs: Dict) -> List:
|
||||
def predict(self, instruction: str, obs: Dict, metadata_steps: str = "") -> List:
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
"""
|
||||
system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)
|
||||
if metadata_steps:
|
||||
system_message += "\n\nHere are the reference steps from the software tutorial, which may help you complete the task:\n{}".format(metadata_steps)
|
||||
|
||||
# Prepare the payload for the API call
|
||||
messages = []
|
||||
|
||||
@@ -15,6 +15,8 @@ When you think you have to wait for some time, return ```WAIT```;
|
||||
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
""".strip()
|
||||
@@ -36,6 +38,8 @@ When you think you have to wait for some time, return ```WAIT```;
|
||||
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
Our past communication is great, and what you have done is very helpful. I will now give you another task to complete.
|
||||
First take a deep breath, think step by step, give the current screenshot a thinking, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
@@ -550,6 +554,8 @@ When you think you have to wait for some time, return ```WAIT```;
|
||||
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
""".strip()
|
||||
@@ -817,6 +823,8 @@ When you think you have to wait for some time, return ```WAIT```;
|
||||
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
IMPORTANT: You must return EITHER a python code block OR a special code (WAIT/DONE/FAIL) in each response, NEVER BOTH. Do NOT include ```DONE``` in the same response as a ```python``` code block. After executing an action, wait for the next observation to verify the result before returning ```DONE```.
|
||||
|
||||
My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
|
||||
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
""".strip()
|
||||
|
||||
Reference in New Issue
Block a user