Fix corner cases (val connection in Chrome when using Playwright, action parsing for the agent, and accessibility tree XML handling)

2024-01-16 22:00:01 +08:00
parent 186bf2e97c
commit 20b1d950a0
5 changed files with 51 additions and 30 deletions
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -3,8 +3,11 @@ import xml.etree.ElementTree as ET
 from PIL import Image, ImageDraw, ImageFont


-def find_leaf_nodes(xml_file_path):
-    root = ET.fromstring(xml_file_path)
+def find_leaf_nodes(xlm_file_str):
+    if not xlm_file_str:
+        return []
+
+    root = ET.fromstring(xlm_file_str)

    # Recursive function to traverse the XML tree and collect leaf nodes
    def collect_leaf_nodes(node, leaf_nodes):
@@ -97,19 +100,3 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):

    # Save the result
    image.save(output_image_file_path)
-
-
-if __name__ == '__main__':
-    with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f:
-        xml_string = f.read()
-    image_file_path = 'screenshot.png'  # Replace with your actual screenshot image path
-    output_image_file_path = 'annotated_screenshot.png'  # Replace with your desired output image path
-
-    leaf_nodes = find_leaf_nodes(xml_string)
-    filtered_nodes = filter_nodes(leaf_nodes)
-    print(f"Found {len(filtered_nodes)} filtered nodes")
-
-    for node in filtered_nodes:
-        print(node.tag, node.attrib)
-
-    draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path)
--- a/mm_agents/gpt_4_agent.py
+++ b/mm_agents/gpt_4_agent.py
@@ -61,11 +61,27 @@ def parse_code_from_string(input_string):
    # so the code inside backticks can span multiple lines.

    # matches now contains all the captured code snippets
-    return matches
+
+    codes = []
+
+    for match in matches:
+        match = match.strip()
+        commands = ['WAIT', 'DONE', 'FAIL']  # fixme: updates this part when we have more commands
+
+        if match in commands:
+            codes.append(match.strip())
+        elif match.split('\n')[-1] in commands:
+            if len(match.split('\n')) > 1:
+                codes.append("\n".join(match.split('\n')[:-1]))
+            codes.append(match.split('\n')[-1])
+        else:
+            codes.append(match)
+
+    return codes


 class GPT4_Agent:
-    def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"):
+    def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"):
        self.instruction = instruction
        self.model = model
        self.max_tokens = max_tokens
@@ -121,14 +137,17 @@ class GPT4_Agent:
            ]
        })

+        # print(
+        #     "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+        #         linearized_accessibility_tree)
+        # )
+
        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
            if len(self.trajectory[i]["content"]) > 1:
                traj_to_show.append("screenshot_obs")

-        print("Trajectory:", traj_to_show)
-
        payload = {
            "model": self.model,
            "messages": self.trajectory,
--- a/mm_agents/gui_som/READAME.md
+++ b/mm_agents/gui_som/READAME.md
@@ -0,0 +1 @@
+Deprecated, since we found we can use `accelerator` to do the same thing. It may still be useful in the future when only access to the screen is available.
				`@@ -0,0 +1 @@`
				Deprecated, since we found we can use `accelerator` to do the same thing. It may still be useful in the future when only access to the screen is available.