Clean code; Refactor environment to pass screenshot content instead of path

2024-04-13 23:34:01 +08:00
parent b9ae9b72b2
commit 9c75df5dce
10 changed files with 144 additions and 213 deletions
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -1,8 +1,9 @@
+import io
 import xml.etree.ElementTree as ET
+from typing import Tuple, List

 from PIL import Image, ImageDraw, ImageFont

-from typing import Tuple, List

 def find_leaf_nodes(xlm_file_str):
    if not xlm_file_str:
@@ -24,65 +25,70 @@ def find_leaf_nodes(xlm_file_str):
    collect_leaf_nodes(root, leaf_nodes)
    return leaf_nodes

+
 state_ns = "uri:deskat:state.at-spi.gnome.org"
 component_ns = "uri:deskat:component.at-spi.gnome.org"
+
+
 def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
-    keeps: bool = node.tag.startswith("document")\
-               or node.tag.endswith("item")\
-               or node.tag.endswith("button")\
-               or node.tag.endswith("heading")\
-               or node.tag.endswith("label")\
-               or node.tag.endswith("scrollbar")\
-               or node.tag.endswith("searchbox")\
-               or node.tag.endswith("textbox")\
-               or node.tag.endswith("link")\
-               or node.tag.endswith("tabelement")\
-               or node.tag.endswith("textfield")\
-               or node.tag.endswith("textarea")\
-               or node.tag.endswith("menu")\
-               or node.tag in { "alert", "canvas", "check-box"
-                              , "combo-box", "entry", "icon"
-                              , "image", "paragraph", "scroll-bar"
-                              , "section", "slider", "static"
-                              , "table-cell", "terminal", "text"
-                              , "netuiribbontab", "start", "trayclockwclass"
-                              , "traydummysearchcontrol", "uiimage", "uiproperty"
-                              , "uiribboncommandbar"
-                              }
-    keeps = keeps and ( platform=="ubuntu"\
-                        and node.get("{{{:}}}showing".format(state_ns), "false")=="true"\
-                        and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
-                     or platform=="windows"\
-                        and node.get("{{{:}}}visible".format(state_ns), "false")=="true"\
-                      )\
-                  and ( node.get("{{{:}}}enabled".format(state_ns), "false")=="true"\
-                     or node.get("{{{:}}}editable".format(state_ns), "false")=="true"\
-                     or node.get("{{{:}}}expandable".format(state_ns), "false")=="true"\
-                     or node.get("{{{:}}}checkable".format(state_ns), "false")=="true"
-                      )\
-                  and ( node.get("name", "") != "" or node.text is not None and len(node.text)>0\
-                     or check_image and node.get("image", "false")=="true"
-                      )
+    """Decide whether an accessibility-tree node should be kept.
+
+    A node is kept only when all of the following hold:
+
+    * its tag starts with "document", ends with one of the listed suffixes
+      ("item", "button", ..., "menu"), or is one of the listed exact tags;
+    * it is displayed: for platform "ubuntu" both the `showing` and
+      `visible` state attributes equal "true"; for platform "windows" only
+      `visible` is checked; any other platform value rejects every node;
+    * at least one of the `enabled`, `editable`, `expandable` or
+      `checkable` state attributes equals "true";
+    * it has a non-empty `name` attribute or non-empty text, or
+      `check_image` is set and its `image` attribute equals "true";
+    * its screen coordinates are non-negative and its size is positive in
+      both dimensions.
+
+    :param node: the element to test; annotated with the `ET` module alias,
+        presumably an `xml.etree.ElementTree.Element` -- TODO confirm.
+    :param platform: "ubuntu" or "windows"; selects the visibility rule.
+    :param check_image: when true, an `image="true"` attribute is accepted
+        in place of a name or text.
+    :return: True if the node passes every check above.
+    """
+    keeps: bool = node.tag.startswith("document") \
+                  or node.tag.endswith("item") \
+                  or node.tag.endswith("button") \
+                  or node.tag.endswith("heading") \
+                  or node.tag.endswith("label") \
+                  or node.tag.endswith("scrollbar") \
+                  or node.tag.endswith("searchbox") \
+                  or node.tag.endswith("textbox") \
+                  or node.tag.endswith("link") \
+                  or node.tag.endswith("tabelement") \
+                  or node.tag.endswith("textfield") \
+                  or node.tag.endswith("textarea") \
+                  or node.tag.endswith("menu") \
+                  or node.tag in {"alert", "canvas", "check-box"
+                      , "combo-box", "entry", "icon"
+                      , "image", "paragraph", "scroll-bar"
+                      , "section", "slider", "static"
+                      , "table-cell", "terminal", "text"
+                      , "netuiribbontab", "start", "trayclockwclass"
+                      , "traydummysearchcontrol", "uiimage", "uiproperty"
+                      , "uiribboncommandbar"
+                                  }
+    # "and" binds tighter than "or", so each platform name below is grouped
+    # with its own state-attribute checks inside the first parenthesis.
+    keeps = keeps and (platform == "ubuntu" \
+                       and node.get("{{{:}}}showing".format(state_ns), "false") == "true" \
+                       and node.get("{{{:}}}visible".format(state_ns), "false") == "true" \
+                       or platform == "windows" \
+                       and node.get("{{{:}}}visible".format(state_ns), "false") == "true" \
+                       ) \
+            and (node.get("{{{:}}}enabled".format(state_ns), "false") == "true" \
+                 or node.get("{{{:}}}editable".format(state_ns), "false") == "true" \
+                 or node.get("{{{:}}}expandable".format(state_ns), "false") == "true" \
+                 or node.get("{{{:}}}checkable".format(state_ns), "false") == "true"
+                 ) \
+            and (node.get("name", "") != "" or node.text is not None and len(node.text) > 0 \
+                 or check_image and node.get("image", "false") == "true"
+                 )

+    # NOTE(review): eval() executes whatever text these attributes hold, and it
+    # runs even when `keeps` is already False. If the XML can come from an
+    # untrusted source, ast.literal_eval would be the safer parser.
+    # A missing attribute falls back to "(-1, -1)", which fails the check below.
     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
-    keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
+    keeps = keeps and coordinates[0] >= 0 and coordinates[1] >= 0 and sizes[0] > 0 and sizes[1] > 0
     return keeps

+
 def filter_nodes(root: ET, platform="ubuntu", check_image=False):
+    """Collect every node under `root` that `judge_node` accepts.
+
+    `root.iter()` is walked and each node is tested with
+    `judge_node(node, platform, check_image)`.
+
+    :return: list of the accepted nodes, in iteration order.
+    """
     filtered_nodes = []

     for node in root.iter():
         if judge_node(node, platform, check_image):
             filtered_nodes.append(node)
-            #print(ET.tostring(node, encoding="unicode"))
+            # print(ET.tostring(node, encoding="unicode"))

     return filtered_nodes


-def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sampling_ratio=1.0):
+def draw_bounding_boxes(nodes, image_file_content, down_sampling_ratio=1.0):
    # Load the screenshot image
-    image = Image.open(image_file_path)
+    image_stream = io.BytesIO(image_file_content)
+    image = Image.open(image_stream)
    if float(down_sampling_ratio) != 1.0:
        image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio)))
    draw = ImageDraw.Draw(image)
@@ -140,11 +146,11 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sam

                # Draw index number at the bottom left of the bounding box with black background
                text_position = (coords[0], bottom_right[1])  # Adjust Y to be above the bottom right
-                text_bbox: Tuple[int, int ,int ,int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
-                #offset: int = bottom_right[1]-text_bbox[3]
-                #text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)
+                text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
+                # offset: int = bottom_right[1]-text_bbox[3]
+                # text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)

-                #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
+                # draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                draw.rectangle(text_bbox, fill='black')
                draw.text(text_position, str(index), font=font, anchor="lb", fill="white")

@@ -153,22 +159,22 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sam
                drew_nodes.append(_node)

                if _node.text:
-                    node_text = ( _node.text if '"' not in _node.text\
-                             else '"{:}"'.format(_node.text.replace('"', '""'))
-                                )
+                    node_text = (_node.text if '"' not in _node.text \
+                                     else '"{:}"'.format(_node.text.replace('"', '""'))
+                                 )
                elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
                        and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
                    node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
-                    node_text = (node_text if '"' not in node_text\
-                             else '"{:}"'.format(node_text.replace('"', '""'))
-                                )
+                    node_text = (node_text if '"' not in node_text \
+                                     else '"{:}"'.format(node_text.replace('"', '""'))
+                                 )
                else:
                    node_text = '""'
-                text_information: str = "{:d}\t{:}\t{:}\t{:}"\
-                                            .format( index, _node.tag
-                                                   , _node.get("name", "")
-                                                   , node_text
-                                                   )
+                text_information: str = "{:d}\t{:}\t{:}\t{:}" \
+                    .format(index, _node.tag
+                            , _node.get("name", "")
+                            , node_text
+                            )
                text_informations.append(text_information)

                index += 1
@@ -176,26 +182,14 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sam
            except ValueError:
                pass

-    # Save the result
-    image.save(output_image_file_path)
-    return marks, drew_nodes, "\n".join(text_informations)
+    output_image_stream = io.BytesIO()
+    image.save(output_image_stream, format='PNG')
+    image_content = output_image_stream.getvalue()
+
+    return marks, drew_nodes, "\n".join(text_informations), image_content


 def print_nodes_with_indent(nodes, indent=0):
+    """Print the tag and attributes of each node, recursing with deeper indent.
+
+    Each node in `nodes` is printed prefixed by a string of `indent` spaces;
+    the node itself is then passed as `nodes` with `indent + 2`, so whatever
+    iterating a node yields (presumably its child elements -- TODO confirm)
+    is printed nested beneath it.
+    """
     for node in nodes:
         print(' ' * indent, node.tag, node.attrib)
         print_nodes_with_indent(node, indent + 2)
-
-
-if __name__ == '__main__':
-    import json
-    with open('3.xml', 'r', encoding='utf-8') as f:
-        xml_file_str = f.read()
-    filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
-    print(len(filtered_nodes))
-    masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
-                               , '3.png'
-                               )
-
-    # print(masks)
-    print(len(masks))