# sci-gui-agent-benchmark/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py

import xml.etree.ElementTree as ET

from PIL import Image, ImageDraw, ImageFont


def find_leaf_nodes(xml_file_path):
    """Return all childless elements of an XML document, in document order.

    NOTE: despite its name, ``xml_file_path`` is passed straight to
    ``ET.fromstring`` and must therefore contain the XML text itself,
    not a path to a file.

    Args:
        xml_file_path: XML document as a string.

    Returns:
        A list of ``Element`` objects that have no child elements. If the
        root itself has no children, the list contains only the root.
    """
    document_root = ET.fromstring(xml_file_path)

    # Element.iter() yields the root followed by all descendants depth-first,
    # in document order, so keeping the childless ones gives the leaves in
    # the order they appear in the document.
    return [element for element in document_root.iter() if len(element) == 0]


def _parse_int_pair(raw):
    """Parse a ``"(a, b)"`` attribute string into a pair of ints.

    Returns ``None`` when ``raw`` is missing/empty, contains non-integer
    text, or has fewer than two comma-separated components.
    """
    if not raw:
        return None
    try:
        values = [int(part) for part in raw.strip('()').split(',')]
    except ValueError:
        return None
    if len(values) < 2:
        return None
    return values[0], values[1]


def filter_nodes(nodes):
    """Select the nodes worth annotating from a list of accessibility nodes.

    A node is kept when its ``visible`` state attribute is ``'true'`` and:

    * its tag is ``'panel'`` and its ``focusable`` state attribute is
      ``'true'``; or
    * its tag is neither ``'panel'`` nor ``'text'``, its ``screencoord`` has
      no negative component and its ``size`` is strictly positive in both
      dimensions.

    Nodes whose ``screencoord`` or ``size`` attribute is missing or cannot be
    parsed are skipped instead of raising, matching the tolerant handling in
    ``draw_bounding_boxes``.

    Args:
        nodes: Iterable of ``xml.etree.ElementTree.Element`` objects.

    Returns:
        A list with the retained nodes, in their original order.
    """
    visible_attr = '{uri:deskat:state.at-spi.gnome.org}visible'
    focusable_attr = '{uri:deskat:state.at-spi.gnome.org}focusable'
    coord_attr = '{uri:deskat:component.at-spi.gnome.org}screencoord'
    size_attr = '{uri:deskat:component.at-spi.gnome.org}size'

    filtered_nodes = []

    for node in nodes:
        if node.get(visible_attr) != 'true':
            # Not visible
            continue

        if node.tag == 'panel':
            # Panels are only of interest when they can take focus.
            if node.get(focusable_attr, 'false') == 'true':
                filtered_nodes.append(node)
            continue

        if node.tag == 'text':
            continue

        # Coordinates are checked before size so that an off-screen node is
        # rejected regardless of what its size attribute looks like.
        coords = _parse_int_pair(node.get(coord_attr))
        if coords is None or coords[0] < 0 or coords[1] < 0:
            continue

        size = _parse_int_pair(node.get(size_attr))
        if size is None or size[0] <= 0 or size[1] <= 0:
            continue

        filtered_nodes.append(node)

    return filtered_nodes


def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
    """Draw a numbered red bounding box for each node onto a screenshot.

    For every node that carries parseable ``screencoord`` and ``size``
    attributes with a strictly positive size, a red rectangle is drawn and
    the node's index in ``nodes`` is written in purple at the bottom-left
    corner of the rectangle. Nodes with missing, malformed or non-positive
    geometry are skipped; they still consume an index, so labels always
    match positions in ``nodes``.

    Args:
        nodes: Sequence of ``xml.etree.ElementTree.Element`` objects.
        image_file_path: Path of the screenshot to annotate.
        output_image_file_path: Path the annotated image is saved to.
    """
    coord_attr = '{uri:deskat:component.at-spi.gnome.org}screencoord'
    size_attr = '{uri:deskat:component.at-spi.gnome.org}size'

    # Optional: Load a font. If it can't be loaded, fall back to the default.
    try:
        # Adjust the path to the font file you have or use a default one
        font = ImageFont.truetype("arial.ttf", 20)
    except IOError:
        font = ImageFont.load_default()

    # The context manager makes sure the underlying image file is closed
    # even if drawing or saving fails.
    with Image.open(image_file_path) as image:
        draw = ImageDraw.Draw(image)

        for index, _node in enumerate(nodes):
            coords_str = _node.attrib.get(coord_attr)
            size_str = _node.attrib.get(size_attr)
            if not (coords_str and size_str):
                continue

            try:
                # Non-integer text and strings with fewer than two
                # components both surface as ValueError here.
                left, top = [int(part) for part in coords_str.strip('()').split(',')][:2]
                width, height = [int(part) for part in size_str.strip('()').split(',')][:2]
            except ValueError:
                # Best effort: a node with unparseable geometry is skipped.
                continue

            if width <= 0 or height <= 0:
                # Nothing sensible to draw for an empty or inverted box.
                continue

            # Positive width/height guarantee right > left and bottom > top.
            right, bottom = left + width, top + height

            draw.rectangle([(left, top), (right, bottom)], outline="red", width=2)

            # Label anchored at the bottom-left corner of the box.
            draw.text((left, bottom), str(index), font=font, fill="purple")

        # Save the result
        image.save(output_image_file_path)


if __name__ == '__main__':
    # Demo run: annotate a screenshot with the filtered leaf nodes of an
    # accessibility-tree dump. Adjust the three paths below to your own files.
    screenshot_path = 'screenshot.png'
    annotated_path = 'annotated_screenshot.png'

    with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as xml_file:
        tree_xml = xml_file.read()

    # Leaves first, then drop the ones that are not worth drawing.
    candidates = filter_nodes(find_leaf_nodes(tree_xml))
    print(f"Found {len(candidates)} filtered nodes")

    for candidate in candidates:
        print(candidate.tag, candidate.attrib)

    draw_bounding_boxes(candidates, screenshot_path, annotated_path)