Implement heuristic pruning of the accessibility tree to keep only the important nodes; finish the accessibility-tree text agent
This commit is contained in:
@@ -1,11 +1,12 @@
|
||||
import time
|
||||
from typing import Dict, List
|
||||
|
||||
import PIL.Image
|
||||
import google.generativeai as genai
|
||||
|
||||
from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
|
||||
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
|
||||
from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
|
||||
from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
|
||||
from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
|
||||
|
||||
|
||||
class GeminiPro_Agent:
|
||||
@@ -36,9 +37,25 @@ class GeminiPro_Agent:
|
||||
Only support single-round conversation, only fill-in the last desktop screenshot.
|
||||
"""
|
||||
accessibility_tree = obs["accessibility_tree"]
|
||||
|
||||
leaf_nodes = find_leaf_nodes(accessibility_tree)
|
||||
filtered_nodes = filter_nodes(leaf_nodes)
|
||||
|
||||
linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
|
||||
# Linearize the accessibility tree nodes into a table format
|
||||
|
||||
for node in filtered_nodes:
|
||||
linearized_accessibility_tree += node.tag + "\t"
|
||||
linearized_accessibility_tree += node.attrib.get('name') + "\t"
|
||||
linearized_accessibility_tree += node.attrib.get(
|
||||
'{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
|
||||
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
|
||||
|
||||
self.trajectory.append({
|
||||
"role": "user",
|
||||
"parts": ["Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)]
|
||||
"parts": [
|
||||
"Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
|
||||
linearized_accessibility_tree)]
|
||||
})
|
||||
|
||||
# todo: Remove this step once the Gemini supports multi-round conversation
|
||||
@@ -71,13 +88,20 @@ class GeminiPro_Agent:
|
||||
|
||||
print("Trajectory:", traj_to_show)
|
||||
|
||||
response = self.model.generate_content(
|
||||
message_for_gemini,
|
||||
generation_config={
|
||||
"max_output_tokens": self.max_tokens,
|
||||
"temperature": self.temperature
|
||||
}
|
||||
)
|
||||
while True:
|
||||
try:
|
||||
response = self.model.generate_content(
|
||||
message_for_gemini,
|
||||
generation_config={
|
||||
"max_output_tokens": self.max_tokens,
|
||||
"temperature": self.temperature
|
||||
}
|
||||
)
|
||||
break
|
||||
except:
|
||||
print("Failed to generate response, retrying...")
|
||||
time.sleep(5)
|
||||
pass
|
||||
|
||||
try:
|
||||
response_text = response.text
|
||||
|
||||
Reference in New Issue
Block a user