ver Mar19thv2
Added accessibility tree (AT) info back to the observation for the SoM setting
This commit is contained in:
@@ -2,7 +2,7 @@ import xml.etree.ElementTree as ET
|
|||||||
|
|
||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
from typing import Tuple
|
from typing import Tuple, List
|
||||||
|
|
||||||
def find_leaf_nodes(xlm_file_str):
|
def find_leaf_nodes(xlm_file_str):
|
||||||
if not xlm_file_str:
|
if not xlm_file_str:
|
||||||
@@ -66,7 +66,7 @@ def judge_node(node: ET, platform="ubuntu", check_image=False) -> bool:
|
|||||||
|
|
||||||
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
|
coordinates: Tuple[int, int] = eval(node.get("{{{:}}}screencoord".format(component_ns), "(-1, -1)"))
|
||||||
sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
|
sizes: Tuple[int, int] = eval(node.get("{{{:}}}size".format(component_ns), "(-1, -1)"))
|
||||||
keeps = keeps and coordinates[0]>0 and coordinates[1]>0 and sizes[0]>0 and sizes[1]>0
|
keeps = keeps and coordinates[0]>=0 and coordinates[1]>=0 and sizes[0]>0 and sizes[1]>0
|
||||||
return keeps
|
return keeps
|
||||||
|
|
||||||
def filter_nodes(root: ET, platform="ubuntu", check_image=False):
|
def filter_nodes(root: ET, platform="ubuntu", check_image=False):
|
||||||
@@ -86,6 +86,7 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
|
|||||||
draw = ImageDraw.Draw(image)
|
draw = ImageDraw.Draw(image)
|
||||||
marks = []
|
marks = []
|
||||||
drew_nodes = []
|
drew_nodes = []
|
||||||
|
text_informations: List[str] = ["index\ttag\tname\ttext"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Adjust the path to the font file you have or use a default one
|
# Adjust the path to the font file you have or use a default one
|
||||||
@@ -135,18 +136,38 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
|
|||||||
#draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
|
#draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
|
||||||
draw.rectangle(text_bbox, fill='black')
|
draw.rectangle(text_bbox, fill='black')
|
||||||
draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
|
draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
|
||||||
index += 1
|
|
||||||
|
|
||||||
# each mark is an x, y, w, h tuple
|
# each mark is an x, y, w, h tuple
|
||||||
marks.append([coords[0], coords[1], size[0], size[1]])
|
marks.append([coords[0], coords[1], size[0], size[1]])
|
||||||
drew_nodes.append(_node)
|
drew_nodes.append(_node)
|
||||||
|
|
||||||
|
if _node.text:
|
||||||
|
node_text = ( _node.text if '"' not in _node.text\
|
||||||
|
else '"{:}"'.format(_node.text.replace('"', '""'))
|
||||||
|
)
|
||||||
|
elif _node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
|
||||||
|
and _node.get("{uri:deskat:value.at-spi.gnome.org}value"):
|
||||||
|
node_text: str = _node.get("{uri:deskat:value.at-spi.gnome.org}value")
|
||||||
|
node_text = (node_text if '"' not in node_text\
|
||||||
|
else '"{:}"'.format(node_text.replace('"', '""'))
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
node_text = '""'
|
||||||
|
text_information: str = "{:d}\t{:}\t{:}\t{:}"\
|
||||||
|
.format( index, _node.tag
|
||||||
|
, _node.get("name", "")
|
||||||
|
, node_text
|
||||||
|
)
|
||||||
|
text_informations.append(text_information)
|
||||||
|
|
||||||
|
index += 1
|
||||||
|
|
||||||
except ValueError:
|
except ValueError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Save the result
|
# Save the result
|
||||||
image.save(output_image_file_path)
|
image.save(output_image_file_path)
|
||||||
return marks, drew_nodes
|
return marks, drew_nodes, "\n".join(text_informations)
|
||||||
|
|
||||||
|
|
||||||
def print_nodes_with_indent(nodes, indent=0):
|
def print_nodes_with_indent(nodes, indent=0):
|
||||||
@@ -157,12 +178,12 @@ def print_nodes_with_indent(nodes, indent=0):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
import json
|
import json
|
||||||
with open('selection_sorted(imaged).xml', 'r', encoding='utf-8') as f:
|
with open('3.xml', 'r', encoding='utf-8') as f:
|
||||||
xml_file_str = f.read()
|
xml_file_str = f.read()
|
||||||
filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
|
filtered_nodes = filter_nodes(ET.fromstring(xml_file_str))
|
||||||
print(len(filtered_nodes))
|
print(len(filtered_nodes))
|
||||||
masks = draw_bounding_boxes( filtered_nodes, 'selection_sorted(imaged).png'
|
masks = draw_bounding_boxes( filtered_nodes, '3.a.png'
|
||||||
, 'selection_sorted(imaged).ai.png'
|
, '3.png'
|
||||||
)
|
)
|
||||||
|
|
||||||
# print(masks)
|
# print(masks)
|
||||||
|
|||||||
@@ -37,27 +37,36 @@ def linearize_accessibility_tree(accessibility_tree):
|
|||||||
# leaf_nodes = find_leaf_nodes(accessibility_tree)
|
# leaf_nodes = find_leaf_nodes(accessibility_tree)
|
||||||
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
|
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
|
||||||
|
|
||||||
linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
|
linearized_accessibility_tree = ["tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)"]
|
||||||
# Linearize the accessibility tree nodes into a table format
|
# Linearize the accessibility tree nodes into a table format
|
||||||
|
|
||||||
for node in filtered_nodes:
|
for node in filtered_nodes:
|
||||||
linearized_accessibility_tree += node.tag + "\t"
|
#linearized_accessibility_tree += node.tag + "\t"
|
||||||
linearized_accessibility_tree += node.attrib.get('name') + "\t"
|
#linearized_accessibility_tree += node.attrib.get('name') + "\t"
|
||||||
if node.text:
|
if node.text:
|
||||||
linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
|
text = ( node.text if '"' not in node.text\
|
||||||
node.text.replace('"', '""'))) + "\t"
|
else '"{:}"'.format(node.text.replace('"', '""'))
|
||||||
|
)
|
||||||
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
|
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
|
||||||
and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
|
and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
|
||||||
text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
|
text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
|
||||||
linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
|
text = (text if '"' not in text\
|
||||||
text.replace('"', '""'))) + "\t"
|
else '"{:}"'.format(text.replace('"', '""'))
|
||||||
|
)
|
||||||
else:
|
else:
|
||||||
linearized_accessibility_tree += '""\t'
|
text = '""'
|
||||||
linearized_accessibility_tree += node.attrib.get(
|
#linearized_accessibility_tree += node.attrib.get(
|
||||||
'{uri:deskat:component.at-spi.gnome.org}screencoord', "") + "\t"
|
#, "") + "\t"
|
||||||
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
|
#linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size', "") + "\n"
|
||||||
|
linearized_accessibility_tree.append(
|
||||||
|
"{:}\t{:}\t{:}\t{:}\t{:}".format(
|
||||||
|
node.tag, node.get("name", ""), text
|
||||||
|
, node.get('{uri:deskat:component.at-spi.gnome.org}screencoord', "")
|
||||||
|
, node.get('{uri:deskat:component.at-spi.gnome.org}size', "")
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
return linearized_accessibility_tree
|
return "\n".join(linearized_accessibility_tree)
|
||||||
|
|
||||||
|
|
||||||
def tag_screenshot(screenshot, accessibility_tree):
|
def tag_screenshot(screenshot, accessibility_tree):
|
||||||
@@ -68,9 +77,9 @@ def tag_screenshot(screenshot, accessibility_tree):
|
|||||||
# nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
|
# nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
|
||||||
nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
|
nodes = filter_nodes(ET.fromstring(accessibility_tree), check_image=True)
|
||||||
# Make tag screenshot
|
# Make tag screenshot
|
||||||
marks, drew_nodes = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
|
marks, drew_nodes, element_list = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
|
||||||
|
|
||||||
return marks, drew_nodes, tagged_screenshot_file_path
|
return marks, drew_nodes, tagged_screenshot_file_path, element_list
|
||||||
|
|
||||||
|
|
||||||
def parse_actions_from_string(input_string):
|
def parse_actions_from_string(input_string):
|
||||||
@@ -395,11 +404,13 @@ class PromptAgent:
|
|||||||
})
|
})
|
||||||
elif self.observation_type == "som":
|
elif self.observation_type == "som":
|
||||||
# Add som to the screenshot
|
# Add som to the screenshot
|
||||||
masks, drew_nodes, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
|
masks, drew_nodes, tagged_screenshot, linearized_accessibility_tree = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
|
||||||
base64_image = encode_image(tagged_screenshot)
|
base64_image = encode_image(tagged_screenshot)
|
||||||
|
logger.debug("LINEAR AT: %s", linearized_accessibility_tree)
|
||||||
|
|
||||||
self.observations.append({
|
self.observations.append({
|
||||||
"screenshot": base64_image
|
"screenshot": base64_image,
|
||||||
|
"accessibility_tree": linearized_accessibility_tree
|
||||||
})
|
})
|
||||||
|
|
||||||
messages.append({
|
messages.append({
|
||||||
@@ -407,7 +418,8 @@ class PromptAgent:
|
|||||||
"content": [
|
"content": [
|
||||||
{
|
{
|
||||||
"type": "text",
|
"type": "text",
|
||||||
"text": "Given the tagged screenshot as below. What's the next step that you will do to help with the task?"
|
"text": "Given the tagged screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
|
||||||
|
linearized_accessibility_tree)
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"type": "image_url",
|
"type": "image_url",
|
||||||
@@ -774,7 +786,7 @@ class PromptAgent:
|
|||||||
if response.status_code == HTTPStatus.OK:
|
if response.status_code == HTTPStatus.OK:
|
||||||
try:
|
try:
|
||||||
return response.json()['output']['choices'][0]['message']['content']
|
return response.json()['output']['choices'][0]['message']['content']
|
||||||
except Exception as e:
|
except Exception:
|
||||||
return ""
|
return ""
|
||||||
else:
|
else:
|
||||||
print(response.code) # The error code.
|
print(response.code) # The error code.
|
||||||
|
|||||||
@@ -801,7 +801,7 @@ You CAN predict multiple actions at one step, but you should only return one act
|
|||||||
SYS_PROMPT_IN_SOM_OUT_TAG = """
|
SYS_PROMPT_IN_SOM_OUT_TAG = """
|
||||||
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
|
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
|
||||||
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
|
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
|
||||||
For each step, you will get an observation of the desktop by a screenshot with interact-able elements marked with numerical tags. And you will predict the action of the computer based on the image.
|
For each step, you will get an observation of the desktop by 1) a screenshot with interact-able elements marked with numerical tags; and 2) accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the image and test information.
|
||||||
|
|
||||||
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
|
You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
|
||||||
You can replace x, y in the code with the tag of the element you want to operate with. such as:
|
You can replace x, y in the code with the tag of the element you want to operate with. such as:
|
||||||
|
|||||||
Reference in New Issue
Block a user