ver Mar19thv2

supplemented accessibility tree (AT) info back for the SoM (set-of-marks) setting
This commit is contained in:
David Chang
2024-03-19 18:41:55 +08:00
parent 05336a8ecf
commit 4df088e2ad
3 changed files with 59 additions and 26 deletions

View File

@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont
from typing import Tuple
from typing import Tuple, List
def find_leaf_nodes(xlm_file_str):
if not xlm_file_str:
@@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
return keeps
def filter_nodes(root: ET, platform="ubuntu", check_image=False):
@@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
draw = ImageDraw.Draw(image)
marks = []
drew_nodes = []
text_informations: List[str] = ["index\ttag\tname\ttext"]
try:
# Adjust the path to the font file you have or use a default one
@@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
#draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
draw.rectangle(text_bbox, fill='black')
draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
index += 1
# each mark is an x, y, w, h tuple
marks.append([coords[0], coords[1], size[0], size[1]])
drew_nodes.append(_node)
if _node.text:
node_text = ( _node.text if '"' not in _node.text\
else '"{:}"'.format(_node.text.replace('"', '""'))
)
elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
node_text = (node_text if '"' not in node_text\
else '"{:}"'.format(node_text.replace('"', '""'))
)
else:
node_text = '""'
text_information: str = "{:d}\t{:}\t{:}\t{:}"\
.format( index, _node.tag
, _node.get("name", "")
, node_text
)
text_informations.append(text_information)
index += 1
except ValueError:
pass
# Save the result
image.save(output_image_file_path)
return marks, drew_nodes
return marks, drew_nodes, "\n".join(text_informations)
def print_nodes_with_indent(nodes, indent=0):
@@ -157,12 +178,12 @@ def print_nodes_with_indent(nodes, indent=0):
if __name__ == '__main__':
import json
with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
with open('3.xml', 'r', encoding='utf-8') as f:
xml_file_str = f.read()
filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
print(len(filtered_nodes))
masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
, 'selection_sorted(imaged).ai.png'
masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
, '3.png'
)
# print(masks)
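For context, the element-list logic added above comes down to two conventions: a CSV-style quoting rule for node text (a field is wrapped in double quotes only when it already contains one, with embedded quotes doubled, and missing text rendered as the literal "") and a tab-separated table headed by index/tag/name/text. Below is a minimal, self-contained sketch of those conventions; the helper names escape_text, node_text and build_element_list are illustrative and do not appear in this file.

from typing import List
import xml.etree.ElementTree as ET

def escape_text(text: str) -> str:
    # CSV-style escaping as in the diff: quote the field only when it already
    # contains a double quote, doubling any embedded quotes.
    if not text:
        return '""'
    if '"' in text:
        return '"{:}"'.format(text.replace('"', '""'))
    return text

def node_text(node: ET.Element) -> str:
    # Prefer the node's own text; for UIA EditWrapper controls fall back to the
    # at-spi value attribute, mirroring the elif branch in the code above.
    text = node.text
    if not text and node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper"):
        text = node.get("{uri:deskat:value.at-spi.gnome.org}value")
    return escape_text(text or "")

def build_element_list(nodes: List[ET.Element]) -> str:
    # One tab-separated row per node, matching the text_informations header above.
    rows: List[str] = ["index\ttag\tname\ttext"]
    for index, node in enumerate(nodes):
        rows.append("{:d}\t{:}\t{:}\t{:}".format(index, node.tag, node.get("name", ""), node_text(node)))
    return "\n".join(rows)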

View File

@@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree):
# leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"]
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
#linearized_accessibility_tree += node.tag + "\t"
#linearized_accessibility_tree += node.attrib.get('name') + "\t"
if node.text:
linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
node.text.replace('"', '""'))) + "\t"
text = ( node.text if '"' not in node.text\
else '"{:}"'.format(node.text.replace('"', '""'))
)
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
text.replace('"', '""'))) + "\t"
text = (text if '"' not in text\
else '"{:}"'.format(text.replace('"', '""'))
)
else:
linearized_accessibility_tree += '""\t'
linearized_accessibility_tree += node.attrib.get(
'{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
text = '""'
#linearized_accessibility_tree += node.attrib.get(
#, "") + "\t"
#linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
linearized_accessibility_tree.append(
"{:}\t{:}\t{:}\t{:}\t{:}".format(
node.tag, node.get("name", ""), text
, node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "")
, node.get('{uri:deskat:component.at-spi.gnome.org}size', "")
)
)
return linearized_accessibility_tree
return "\n".join(linearized_accessibility_tree)
def tag_screenshot(screenshot, accessibility_tree):
@@ -68,9 +77,9 @@ def tag_screenshot(screenshot, accessibility_tree):
# nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
# Make tag screenshot
marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
return marks, drew_nodes, tagged_screenshot_file_path
return marks, drew_nodes, tagged_screenshot_file_path, element_list
def parse_actions_from_string(input_string):
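To make the new table format concrete, here is a small illustrative example of one row that linearize_accessibility_tree (and, through it, the element list returned by tag_screenshot) now emits. The element, its attribute values, and the comp_ns shorthand are invented for demonstration only.

import xml.etree.ElementTree as ET

comp_ns = "{uri:deskat:component.at-spi.gnome.org}"
node = ET.Element("push-button", {
    "name": "OK",
    comp_ns + "screencoord": "(100, 200)",
    comp_ns + "size": "(80, 24)",
})
node.text = 'Say "yes"'

# Apply the same quoting rule as above, then join the five columns with tabs.
text = ('"{:}"'.format(node.text.replace('"', '""')) if '"' in node.text else node.text)
row = "{:}\t{:}\t{:}\t{:}\t{:}".format(
    node.tag, node.get("name", ""), text,
    node.get(comp_ns + "screencoord", ""),
    node.get(comp_ns + "size", ""))
# row == 'push-button\tOK\t"Say ""yes"""\t(100, 200)\t(80, 24)'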
@@ -395,11 +404,13 @@ class PromptAgent:
})
elif self.observation_type == "som":
# Add som to the screenshot
masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
logger.debug("LINEAR AT: %s", linearized_accessibility_tree)
self.observations.append({
"screenshot": base64_image
"screenshot": base64_image,
"accessibility_tree": linearized_accessibility_tree
})
messages.append({
@@ -407,7 +418,8 @@ class PromptAgent:
"content": [
{
"type": "text",
"text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
"text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
{
"type": "image_url",
@@ -774,7 +786,7 @@ class PromptAgent:
if response.status_code == HTTPStatus.OK:
try:
return response.json()['output']['choices'][0]['message']['content']
except Exception as e:
except Exception:
return ""
else:
print(response.code) # The error code.
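Putting the agent-side changes together: under the som observation type, the tagged screenshot and the linearized accessibility tree now travel as a pair, both into the stored observation and into the user message. The sketch below restates that flow as one function; tag_screenshot, encode_image and the message text are taken from the hunks above, while the function wrapper, the "role" field and the data-URL form of the image payload are assumptions standing in for the surrounding PromptAgent code.

def build_som_turn(obs, tag_screenshot, encode_image):
    # Sketch only: returns the observation entry and the user message for one step,
    # assuming tag_screenshot and encode_image behave as in the hunks above.
    marks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(
        obs["screenshot"], obs["accessibility_tree"])
    base64_image = encode_image(tagged_screenshot)
    observation = {
        "screenshot": base64_image,
        "accessibility_tree": linearized_accessibility_tree,
    }
    message = {
        "role": "user",  # assumed; the role line falls outside the hunk shown above
        "content": [
            {
                "type": "text",
                "text": ("Given the tagged screenshot and the info from the accessibility tree below:\n"
                         "{}\nWhat's the next step that you will do to help with the task?").format(
                             linearized_accessibility_tree),
            },
            {
                "type": "image_url",
                # Assumed OpenAI-style payload; the exact image_url body is not shown in the hunk.
                "image_url": {"url": "data:image/png;base64,{}".format(base64_image)},
            },
        ],
    }
    return observation, message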

View File

@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act
SYS_PROMPT_IN_SOM_OUT_TAG = """
You are an agent which follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you may assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image.
For each step, you will get an observation of the desktop by 1) a screenshot with interactable elements marked with numerical tags; and 2) an accessibility tree, which is based on the AT-SPI library. And you will predict the action of the computer based on the image and text information.
You are required to use `pyautogui` to perform the action grounded in the observation, but DO NOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with, since we have no image of the element you want to operate with. DO NOT use `pyautogui.screenshot()` to make screenshots.
You can replace x, y in the code with the tag of the element you want to operate with, such as: