ver Mar19thv2
Supplemented accessibility tree info back into the SOM setting
@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
 from PIL import Image, ImageDraw, ImageFont
-from typing import Tuple
+from typing import Tuple, List


 def find_leaf_nodes(xlm_file_str):
     if not xlm_file_str:
@@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
     coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
     sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
-    keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
+    keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
     return keeps


 def filter_nodes(root: ET, platform="ubuntu", check_image=False):
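Note: the `>0` to `>=0` change keeps nodes anchored at the screen origin (x or y equal to 0), which the stricter check used to drop. A minimal standalone sketch of the same guard, assuming the at-spi attributes are "(x, y)" strings; the helper name is hypothetical, and `ast.literal_eval` stands in for `eval` as the safer parse:

import ast
from typing import Tuple

def on_screen(coord_str: str, size_str: str) -> bool:
    # Parse "(x, y)" strings without executing arbitrary code.
    coords: Tuple[int, int] = ast.literal_eval(coord_str or "(-1, -1)")
    sizes: Tuple[int, int] = ast.literal_eval(size_str or "(-1, -1)")
    # A node at the screen edge (x == 0 or y == 0) is still a valid target.
    return coords[0] >= 0 and coords[1] >= 0 and sizes[0] > 0 and sizes[1] > 0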
@@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
     draw = ImageDraw.Draw(image)
     marks = []
     drew_nodes = []
+    text_informations: List[str] = ["index\ttag\tname\ttext"]

     try:
         # Adjust the path to the font file you have or use a default one
@@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
                 #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                 draw.rectangle(text_bbox, fill='black')
                 draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
-                index += 1

                 # each mark is an x, y, w, h tuple
                 marks.append([coords[0], coords[1], size[0], size[1]])
                 drew_nodes.append(_node)

+                if _node.text:
+                    node_text = ( _node.text if '"' not in _node.text\
+                                    else '"{:}"'.format(_node.text.replace('"', '""'))
+                                )
+                elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
+                        and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
+                    node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
+                    node_text = (node_text if '"' not in node_text\
+                                    else '"{:}"'.format(node_text.replace('"', '""'))
+                                )
+                else:
+                    node_text = '""'
+                text_information: str = "{:d}\t{:}\t{:}\t{:}"\
+                    .format( index, _node.tag
+                           , _node.get("name", "")
+                           , node_text
+                           )
+                text_informations.append(text_information)
+
+                index += 1

             except ValueError:
                 pass

     # Save the result
     image.save(output_image_file_path)
-    return marks, drew_nodes
+    return marks, drew_nodes, "\n".join(text_informations)


 def print_nodes_with_indent(nodes, indent=0):
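Note: the new `text_informations` rows escape node text CSV-style: any field containing a double quote has each quote doubled and the whole field wrapped in quotes. A standalone sketch of that rule (the helper name is hypothetical):

def escape_field(text: str) -> str:
    # CSV-style escaping, matching the inline conditionals above.
    if '"' in text:
        return '"{:}"'.format(text.replace('"', '""'))
    return text

assert escape_field('Save') == 'Save'
assert escape_field('say "hi"') == '"say ""hi"""'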
@@ -157,12 +178,12 @@ def print_nodes_with_indent(nodes, indent=0):


 if __name__ == '__main__':
     import json
-    with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
+    with open('3.xml', 'r', encoding='utf-8') as f:
         xml_file_str = f.read()
     filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
     print(len(filtered_nodes))
-    masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
-                               , 'selection_sorted(imaged).ai.png'
+    masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
+                               , '3.png'
                                )

     # print(masks)
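Note: since `draw_bounding_boxes` now returns a 3-tuple, the demo's `masks` binds all three values at once. A hedged unpacking sketch using the file names from this hunk:

marks, drew_nodes, element_list = draw_bounding_boxes(filtered_nodes, '3.a.png', '3.png')
print(element_list)  # tab-separated rows: index, tag, name, text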
@@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree):
     # leaf_nodes = find_leaf_nodes(accessibility_tree)
     filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))

-    linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
+    linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"]
     # Linearize the accessibility tree nodes into a table format

     for node in filtered_nodes:
-        linearized_accessibility_tree += node.tag + "\t"
-        linearized_accessibility_tree += node.attrib.get('name') + "\t"
+        #linearized_accessibility_tree += node.tag + "\t"
+        #linearized_accessibility_tree += node.attrib.get('name') + "\t"
         if node.text:
-            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
-                node.text.replace('"', '""'))) + "\t"
+            text = ( node.text if '"' not in node.text\
+                        else '"{:}"'.format(node.text.replace('"', '""'))
+                   )
         elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
                 and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
             text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
-            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
-                text.replace('"', '""'))) + "\t"
+            text = (text if '"' not in text\
+                        else '"{:}"'.format(text.replace('"', '""'))
+                   )
         else:
-            linearized_accessibility_tree += '""\t'
-        linearized_accessibility_tree += node.attrib.get(
-            '{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
-        linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
+            text = '""'
+        #linearized_accessibility_tree += node.attrib.get(
+        #    '{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
+        #linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
+        linearized_accessibility_tree.append(
+            "{:}\t{:}\t{:}\t{:}\t{:}".format(
+                node.tag, node.get("name", ""), text
+                , node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "")
+                , node.get('{uri:deskat:component.at-spi.gnome.org}size', "")
+            )
+        )

-    return linearized_accessibility_tree
+    return "\n".join(linearized_accessibility_tree)


 def tag_screenshot(screenshot, accessibility_tree):
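Note: `linearize_accessibility_tree` now collects rows in a list and joins them once, instead of repeated string concatenation. A sketch of the resulting table with made-up values (real rows come from the at-spi attributes):

header = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"
row = "{:}\t{:}\t{:}\t{:}\t{:}".format(
    "push-button", "OK", '""', "(105, 27)", "(60, 24)")
print("\n".join([header, row]))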
@@ -68,9 +77,9 @@ def tag_screenshot(screenshot, accessibility_tree):
     # nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
     nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
     # Make tag screenshot
-    marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
+    marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)

-    return marks, drew_nodes, tagged_screenshot_file_path
+    return marks, drew_nodes, tagged_screenshot_file_path, element_list


 def parse_actions_from_string(input_string):
@@ -395,11 +404,13 @@ class PromptAgent:
                 })
             elif self.observation_type == "som":
                 # Add som to the screenshot
-                masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
+                masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
                 base64_image = encode_image(tagged_screenshot)
+                logger.debug("LINEAR AT: %s", linearized_accessibility_tree)

                 self.observations.append({
-                    "screenshot": base64_image
+                    "screenshot": base64_image,
+                    "accessibility_tree": linearized_accessibility_tree
                 })

                 messages.append({
@@ -407,7 +418,8 @@ class PromptAgent:
                     "content": [
                         {
                             "type": "text",
-                            "text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
+                            "text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+                                linearized_accessibility_tree)
                         },
                         {
                             "type": "image_url",
@@ -774,7 +786,7 @@ class PromptAgent:
         if response.status_code == HTTPStatus.OK:
             try:
                 return response.json()['output']['choices'][0]['message']['content']
-            except Exception as e:
+            except Exception:
                 return ""
         else:
             print(response.code)  # The error code.
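Note: dropping the unused `e` silences the lint warning; a clause naming the errors this lookup can actually raise would be narrower still. A hedged alternative (not what this commit does; the helper name is hypothetical):

def extract_content(payload: dict) -> str:
    # Narrower than bare Exception: only the failures this lookup can hit.
    try:
        return payload['output']['choices'][0]['message']['content']
    except (KeyError, IndexError, TypeError):  # missing keys, empty choices, wrong shape
        return ""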
@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one action for each step.
 SYS_PROMPT_IN_SOM_OUT_TAG = """
 You are an agent which follow my instruction and perform desktop computer tasks as instructed.
 You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
-For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image.
+For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and text information.

 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as: