Implement heuristic pruning of the accessibility tree to keep only the important nodes; finish the accessibility-tree text agent

2024-01-16 16:43:32 +08:00
parent 48a86d36cf
commit 186bf2e97c
11 changed files with 218 additions and 34 deletions
--- a/mm_agents/gpt_4_agent.py
+++ b/mm_agents/gpt_4_agent.py
@@ -1,10 +1,12 @@
 import base64
 import json
 import re
+import time
 from typing import Dict, List

 import requests

+from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
 from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
 from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE

@@ -81,9 +83,9 @@ class GPT4_Agent:
                    {
                        "type": "text",
                        "text": {
-                            "computer_13": SYS_PROMPT_ACTION,
-                            "pyautogui": SYS_PROMPT_CODE
-                        }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
+                                    "computer_13": SYS_PROMPT_ACTION,
+                                    "pyautogui": SYS_PROMPT_CODE
+                                }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
                    },
                ]
            }
@@ -94,12 +96,27 @@ class GPT4_Agent:
        Predict the next action(s) based on the current observation.
        """
        accessibility_tree = obs["accessibility_tree"]
+
+        leaf_nodes = find_leaf_nodes(accessibility_tree)
+        filtered_nodes = filter_nodes(leaf_nodes)
+
+        linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
+        # Linearize the accessibility tree nodes into a table format
+
+        for node in filtered_nodes:
+            linearized_accessibility_tree += node.tag + "\t"
+            linearized_accessibility_tree += node.attrib.get('name') + "\t"
+            linearized_accessibility_tree += node.attrib.get(
+                '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
+            linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
+
        self.trajectory.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
-                    "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)
+                    "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+                        linearized_accessibility_tree)
                }
            ]
        })
@@ -117,7 +134,16 @@ class GPT4_Agent:
            "messages": self.trajectory,
            "max_tokens": self.max_tokens
        }
-        response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
+
+        while True:
+            try:
+                response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers,
+                                         json=payload)
+                break
+            except:
+                print("Failed to generate response, retrying...")
+                time.sleep(5)
+                pass

        try:
            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])