diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
index 9611ea3..934d8fd 100644
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -2,7 +2,7 @@
 import xml.etree.ElementTree as ET
 from PIL import Image, ImageDraw, ImageFont
-from typing import Tuple
+from typing import Tuple, List


 def find_leaf_nodes(xlm_file_str):
     if not xlm_file_str:
@@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
-    keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
+    keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
     return keeps


 def filter_nodes(root: ET, platform="ubuntu", check_image=False):
@@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
     draw = ImageDraw.Draw(image)
     marks = []
     drew_nodes = []
+    text_informations: List[str] = ["index\ttag\tname\ttext"]

     try:
         # Adjust the path to the font file you have or use a default one
@@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
                 #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                 draw.rectangle(text_bbox, fill='black')
                 draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
-                index += 1

                 # each mark is an x, y, w, h tuple
                 marks.append([coords[0], coords[1], size[0], size[1]])
                 drew_nodes.append(_node)

+                if _node.text:
+                    node_text = ( _node.text if '"' not in _node.text\
+                                    else '"{:}"'.format(_node.text.replace('"', '""'))
+                                )
+                elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
+                        and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
+                    node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
+                    node_text = (node_text if '"' not in node_text\
+                                    else '"{:}"'.format(node_text.replace('"', '""'))
+                                )
+                else:
+                    node_text = '""'
+                text_information: str = "{:d}\t{:}\t{:}\t{:}"\
+                        .format( index, _node.tag
+                               , _node.get("name", "")
+                               , node_text
+                               )
+                text_informations.append(text_information)
+
+                index += 1
+
             except ValueError:
                 pass

     # Save the result
     image.save(output_image_file_path)
-    return marks, drew_nodes
+    return marks, drew_nodes, "\n".join(text_informations)


 def print_nodes_with_indent(nodes, indent=0):
@@ -157,12 +178,12 @@
 if __name__ == '__main__':
     import json

-    with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
+    with open('3.xml', 'r', encoding='utf-8') as f:
         xml_file_str = f.read()

     filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
     print(len(filtered_nodes))
-    masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
-                               , 'selection_sorted(imaged).ai.png'
+    masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
+                               , '3.png'
                                )
     # print(masks)
diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index 9613f44..f2d4b5c 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree):
     # leaf_nodes = find_leaf_nodes(accessibility_tree)
     filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))

-    linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
+    linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"]
     # Linearize the accessibility tree nodes into a table format

     for node in filtered_nodes:
-        linearized_accessibility_tree += node.tag + "\t"
-        linearized_accessibility_tree += node.attrib.get('name') + "\t"
+        #linearized_accessibility_tree += node.tag + "\t"
+        #linearized_accessibility_tree += node.attrib.get('name') + "\t"
         if node.text:
-            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
-                node.text.replace('"', '""'))) + "\t"
+            text = ( node.text if '"' not in node.text\
+                        else '"{:}"'.format(node.text.replace('"', '""'))
+                   )
         elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
                 and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
             text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
-            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
-                text.replace('"', '""'))) + "\t"
+            text = (text if '"' not in text\
+                        else '"{:}"'.format(text.replace('"', '""'))
+                   )
         else:
-            linearized_accessibility_tree += '""\t'
-        linearized_accessibility_tree += node.attrib.get(
-            '{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
-        linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
+            text = '""'
+        #linearized_accessibility_tree += node.attrib.get(
+            #, "") + "\t"
+        #linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
+        linearized_accessibility_tree.append(
+            "{:}\t{:}\t{:}\t{:}\t{:}".format(
+                node.tag, node.get("name", ""), text
+                , node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "")
+                , node.get('{uri:deskat:component.at-spi.gnome.org}size', "")
+            )
+        )

-    return linearized_accessibility_tree
+    return "\n".join(linearized_accessibility_tree)


 def tag_screenshot(screenshot, accessibility_tree):
@@ -68,9 +77,9 @@
     # nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
     nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
     # Make tag screenshot
-    marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
+    marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)

-    return marks, drew_nodes, tagged_screenshot_file_path
+    return marks, drew_nodes, tagged_screenshot_file_path, element_list


 def parse_actions_from_string(input_string):
@@ -395,11 +404,13 @@ class PromptAgent:
             })
         elif self.observation_type == "som":
             # Add som to the screenshot
-            masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
+            masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
             base64_image = encode_image(tagged_screenshot)
+            logger.debug("LINEAR AT: %s", linearized_accessibility_tree)

             self.observations.append({
-                "screenshot": base64_image
+                "screenshot": base64_image,
+                "accessibility_tree": linearized_accessibility_tree
             })

             messages.append({
@@ -407,7 +418,8 @@
                 "content": [
                     {
                         "type": "text",
-                        "text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
+                        "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+                            linearized_accessibility_tree)
                     },
                     {
                         "type": "image_url",
@@ -774,7 +786,7 @@ class PromptAgent:
         if response.status_code == HTTPStatus.OK:
             try:
                 return response.json()['output']['choices'][0]['message']['content']
-            except Exception as e:
+            except Exception:
                 return ""
         else:
             print(response.code)  # The error code.
diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py
index 462aac7..c609a66 100644
--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act

 SYS_PROMPT_IN_SOM_OUT_TAG = """
 You are an agent which follow my instruction and perform desktop computer tasks as instructed.
 You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
-For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image.
+For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and test information.
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as: