Implement heuristic pruning of the accessibility tree to keep only the important nodes; finish the accessibility-tree text agent

2024-01-16 16:43:32 +08:00
parent 48a86d36cf
commit 186bf2e97c
11 changed files with 218 additions and 34 deletions
--- a/mm_agents/gemini_pro_agent.py
+++ b/mm_agents/gemini_pro_agent.py
@@ -1,11 +1,12 @@
+import time
 from typing import Dict, List

-import PIL.Image
 import google.generativeai as genai

-from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
+from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
 from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
 from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
+from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string


 class GeminiPro_Agent:
@@ -36,9 +37,25 @@ class GeminiPro_Agent:
        Only support single-round conversation, only fill-in the last desktop screenshot.
        """
        accessibility_tree = obs["accessibility_tree"]
+
+        leaf_nodes = find_leaf_nodes(accessibility_tree)
+        filtered_nodes = filter_nodes(leaf_nodes)
+
+        linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
+        # Linearize the accessibility tree nodes into a table format
+
+        for node in filtered_nodes:
+            linearized_accessibility_tree += node.tag + "\t"
+            linearized_accessibility_tree += node.attrib.get('name') + "\t"
+            linearized_accessibility_tree += node.attrib.get(
+                '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
+            linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
+
        self.trajectory.append({
            "role": "user",
-            "parts": ["Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)]
+            "parts": [
+                "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+                    linearized_accessibility_tree)]
        })

        # todo: Remove this step once the Gemini supports multi-round conversation
@@ -71,13 +88,20 @@ class GeminiPro_Agent:

        print("Trajectory:", traj_to_show)

-        response = self.model.generate_content(
-            message_for_gemini,
-            generation_config={
-                "max_output_tokens": self.max_tokens,
-                "temperature": self.temperature
-            }
-        )
+        while True:
+            try:
+                response = self.model.generate_content(
+                    message_for_gemini,
+                    generation_config={
+                        "max_output_tokens": self.max_tokens,
+                        "temperature": self.temperature
+                    }
+                )
+                break
+            except:
+                print("Failed to generate response, retrying...")
+                time.sleep(5)
+                pass

        try:
            response_text = response.text