diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 47bbca0..ccb4886 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -40,6 +40,8 @@ def filter_nodes(nodes): filtered_nodes.append(node) elif node.tag == 'text': continue + elif node.get("name") == "" and node.text is None: + continue else: coords = tuple( map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', '))) diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 323e08f..cc08b79 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -39,7 +39,10 @@ def linearize_accessibility_tree(accessibility_tree): for node in filtered_nodes: linearized_accessibility_tree += node.tag + "\t" linearized_accessibility_tree += node.attrib.get('name') + "\t" - linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t" + if node.text: + linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t" + else: + linearized_accessibility_tree += '""\t' linearized_accessibility_tree += node.attrib.get( '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" @@ -87,7 +90,7 @@ def parse_actions_from_string(input_string): try: action_dict = json.loads(input_string) return [action_dict] - except json.JSONDecodeError as e: + except json.JSONDecodeError: raise ValueError("Invalid response format: " + input_string)