ver Mar19thv2

supplemented accessibility tree (AT) info back for the SoM (set-of-marks) setting
This commit is contained in:
David Chang
2024-03-19 18:41:55 +08:00
parent 05336a8ecf
commit 4df088e2ad
3 changed files with 59 additions and 26 deletions

View File

@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
from PIL import Image, ImageDraw, ImageFont
from typing import Tuple
from typing import Tuple, List
def find_leaf_nodes(xlm_file_str):
if not xlm_file_str:
@@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
return keeps
def filter_nodes(root: ET, platform="ubuntu", check_image=False):
@@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
draw = ImageDraw.Draw(image)
marks = []
drew_nodes = []
text_informations: List[str] = ["index\ttag\tname\ttext"]
try:
# Adjust the path to the font file you have or use a default one
@@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
#draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
draw.rectangle(text_bbox, fill='black')
draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
index += 1
# each mark is an x, y, w, h tuple
marks.append([coords[0], coords[1], size[0], size[1]])
drew_nodes.append(_node)
if _node.text:
node_text = ( _node.text if '"' not in _node.text\
else '"{:}"'.format(_node.text.replace('"', '""'))
)
elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
node_text = (node_text if '"' not in node_text\
else '"{:}"'.format(node_text.replace('"', '""'))
)
else:
node_text = '""'
text_information: str = "{:d}\t{:}\t{:}\t{:}"\
.format( index, _node.tag
, _node.get("name", "")
, node_text
)
text_informations.append(text_information)
index += 1
except ValueError:
pass
# Save the result
image.save(output_image_file_path)
return marks, drew_nodes
return marks, drew_nodes, "\n".join(text_informations)
def print_nodes_with_indent(nodes, indent=0):
@@ -157,12 +178,12 @@ def print_nodes_with_indent(nodes, indent=0):
if __name__ == '__main__':
import json
with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
with open('3.xml', 'r', encoding='utf-8') as f:
xml_file_str = f.read()
filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
print(len(filtered_nodes))
masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
, 'selection_sorted(imaged).ai.png'
masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
, '3.png'
)
# print(masks)
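For context, the element-list logic added above comes down to two conventions: a CSV-style quoting rule for node text (a field is wrapped in double quotes only when it already contains one, with embedded quotes doubled, and missing text rendered as the literal "") and a tab-separated table headed by index/tag/name/text. Below is a minimal, self-contained sketch of those conventions; the helper names escape_text, node_text and build_element_list are illustrative and do not appear in this file.

from typing import List
import xml.etree.ElementTree as ET

def escape_text(text: str) -> str:
    # CSV-style escaping as in the diff: quote the field only when it already
    # contains a double quote, doubling any embedded quotes.
    if not text:
        return '""'
    if '"' in text:
        return '"{:}"'.format(text.replace('"', '""'))
    return text

def node_text(node: ET.Element) -> str:
    # Prefer the node's own text; for UIA EditWrapper controls fall back to the
    # at-spi value attribute, mirroring the elif branch in the code above.
    text = node.text
    if not text and node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper"):
        text = node.get("{uri:deskat:value.at-spi.gnome.org}value")
    return escape_text(text or "")

def build_element_list(nodes: List[ET.Element]) -> str:
    # One tab-separated row per node, matching the text_informations header above.
    rows: List[str] = ["index\ttag\tname\ttext"]
    for index, node in enumerate(nodes):
        rows.append("{:d}\t{:}\t{:}\t{:}".format(index, node.tag, node.get("name", ""), node_text(node)))
    return "\n".join(rows)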

View File

@@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree):
# leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"]
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
#linearized_accessibility_tree += node.tag + "\t"
#linearized_accessibility_tree += node.attrib.get('name') + "\t"
if node.text:
linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
node.text.replace('"', '""'))) + "\t"
text = ( node.text if '"' not in node.text\
else '"{:}"'.format(node.text.replace('"', '""'))
)
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
text.replace('"', '""'))) + "\t"
text = (text if '"' not in text\
else '"{:}"'.format(text.replace('"', '""'))
)
else:
linearized_accessibility_tree += '""\t'
linearized_accessibility_tree += node.attrib.get(
'{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
text = '""'
#linearized_accessibility_tree += node.attrib.get(
#, "") + "\t"
#linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
linearized_accessibility_tree.append(
"{:}\t{:}\t{:}\t{:}\t{:}".format(
node.tag, node.get("name", ""), text
, node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "")
, node.get('{uri:deskat:component.at-spi.gnome.org}size', "")
)
)
return linearized_accessibility_tree
return "\n".join(linearized_accessibility_tree)
def tag_screenshot(screenshot, accessibility_tree):
@@ -68,9 +77,9 @@ def tag_screenshot(screenshot, accessibility_tree):
# nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
# Make tag screenshot
marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
return marks, drew_nodes, tagged_screenshot_file_path
return marks, drew_nodes, tagged_screenshot_file_path, element_list
def parse_actions_from_string(input_string):
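To make the new table format concrete, here is a small illustrative example of one row that linearize_accessibility_tree (and, through it, the element list returned by tag_screenshot) now emits. The element, its attribute values, and the comp_ns shorthand are invented for demonstration only.

import xml.etree.ElementTree as ET

comp_ns = "{uri:deskat:component.at-spi.gnome.org}"
node = ET.Element("push-button", {
    "name": "OK",
    comp_ns + "screencoord": "(100, 200)",
    comp_ns + "size": "(80, 24)",
})
node.text = 'Say "yes"'

# Apply the same quoting rule as above, then join the five columns with tabs.
text = ('"{:}"'.format(node.text.replace('"', '""')) if '"' in node.text else node.text)
row = "{:}\t{:}\t{:}\t{:}\t{:}".format(
    node.tag, node.get("name", ""), text,
    node.get(comp_ns + "screencoord", ""),
    node.get(comp_ns + "size", ""))
# row == 'push-button\tOK\t"Say ""yes"""\t(100, 200)\t(80, 24)'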
@@ -395,11 +404,13 @@ class PromptAgent:
})
elif self.observation_type == "som":
# Add som to the screenshot
masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
logger.debug("LINEAR AT: %s", linearized_accessibility_tree)
self.observations.append({
"screenshot": base64_image
"screenshot": base64_image,
"accessibility_tree": linearized_accessibility_tree
})
messages.append({
@@ -407,7 +418,8 @@ class PromptAgent:
"content": [
{
"type": "text",
"text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
"text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
{
"type": "image_url",
@@ -774,7 +786,7 @@ class PromptAgent:
if response.status_code == HTTPStatus.OK:
try:
return response.json()['output']['choices'][0]['message']['content']
except Exception as e:
except Exception:
return ""
else:
print(response.code) # The error code.
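Putting the agent-side changes together: under the som observation type, the tagged screenshot and the linearized accessibility tree now travel as a pair, both into the stored observation and into the user message. The sketch below restates that flow as one function; tag_screenshot, encode_image and the message text are taken from the hunks above, while the function wrapper, the "role" field and the data-URL form of the image payload are assumptions standing in for the surrounding PromptAgent code.

def build_som_turn(obs, tag_screenshot, encode_image):
    # Sketch only: returns the observation entry and the user message for one step,
    # assuming tag_screenshot and encode_image behave as in the hunks above.
    marks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(
        obs["screenshot"], obs["accessibility_tree"])
    base64_image = encode_image(tagged_screenshot)
    observation = {
        "screenshot": base64_image,
        "accessibility_tree": linearized_accessibility_tree,
    }
    message = {
        "role": "user",  # assumed; the role line falls outside the hunk shown above
        "content": [
            {
                "type": "text",
                "text": ("Given the tagged screenshot and the info from the accessibility tree below:\n"
                         "{}\nWhat's the next step that you will do to help with the task?").format(
                             linearized_accessibility_tree),
            },
            {
                "type": "image_url",
                # Assumed OpenAI-style payload; the exact image_url body is not shown in the hunk.
                "image_url": {"url": "data:image/png;base64,{}".format(base64_image)},
            },
        ],
    }
    return observation, message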

View File

@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act
SYS_PROMPT_IN_SOM_OUT_TAG = """
You are an agent which follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you may assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image.
For each step, you will get an observation of the desktop by 1) a screenshot with interactable elements marked with numerical tags; and 2) an accessibility tree, which is based on the AT-SPI library. And you will predict the action of the computer based on the image and text information.
You are required to use `pyautogui` to perform the action grounded in the observation, but DO NOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with, since we have no image of the element you want to operate with. DO NOT use `pyautogui.screenshot()` to make screenshots.
You can replace x, y in the code with the tag of the element you want to operate with, such as: