Implement heuristic pruning of the accessibility tree to keep only the important nodes; finish the accessibility-tree text agent

This commit is contained in:
Timothyxxx
2024-01-16 16:43:32 +08:00
parent 48a86d36cf
commit 186bf2e97c
11 changed files with 218 additions and 34 deletions

View File

@@ -1,10 +1,12 @@
import base64
import json
import re
import time
from typing import Dict, List
import requests
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
@@ -81,9 +83,9 @@ class GPT4_Agent:
{
"type": "text",
"text": {
"computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE
}[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
"computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE
}[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
},
]
}
@@ -94,12 +96,27 @@ class GPT4_Agent:
Predict the next action(s) based on the current observation.
"""
accessibility_tree = obs["accessibility_tree"]
leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(leaf_nodes)
linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
linearized_accessibility_tree += node.attrib.get(
'{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
self.trajectory.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)
"text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
}
]
})
@@ -117,7 +134,16 @@ class GPT4_Agent:
"messages": self.trajectory,
"max_tokens": self.max_tokens
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
while True:
try:
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers,
json=payload)
break
except:
print("Failed to generate response, retrying...")
time.sleep(5)
pass
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])