From 20b1d950a0b06de827e809eec548929992e20a6e Mon Sep 17 00:00:00 2001
From: Timothyxxx <384084775@qq.com>
Date: Tue, 16 Jan 2024 22:00:01 +0800
Subject: [PATCH] Fix corner cases (val connection in chrome when using
 playwright, and action parsing for agent, and accessibility tree xml
 handling)

---
 desktop_env/evaluators/getters/chrome.py      | 16 ++++++++---
 experiment_pure_text.py                       | 14 ++++++----
 .../heuristic_retrieve.py                     | 23 ++++------------
 mm_agents/gpt_4_agent.py                      | 27 ++++++++++++++++---
 mm_agents/gui_som/READAME.md                  |  1 +
 5 files changed, 51 insertions(+), 30 deletions(-)
 create mode 100644 mm_agents/gui_som/READAME.md

diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py
index 1b77016..d58d08b 100644
--- a/desktop_env/evaluators/getters/chrome.py
+++ b/desktop_env/evaluators/getters/chrome.py
@@ -159,9 +159,19 @@ def get_open_tabs_info(env, config: Dict[str, str]):
         tabs_info = []
         for context in browser.contexts:
             for page in context.pages:
-                title = page.title()
-                url = page.url
-                tabs_info.append({'title': title, 'url': url})
+                try:
+                    # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
+                    page.wait_for_load_state('load')  # Wait for the 'load' event to complete
+                    title = page.title()
+                    url = page.url
+                    tabs_info.append({'title': title, 'url': url})
+                except TimeoutError:
+                    # If page loading times out, catch the exception and store the current information in the list
+                    tabs_info.append({'title': 'Load timeout', 'url': page.url})
+                except Exception as e:
+                    # Catch other potential exceptions that might occur while reading the page title
+                    print(f'Error: {e}')
+                    tabs_info.append({'title': 'Error encountered', 'url': page.url})
 
         browser.close()
         return tabs_info
diff --git a/experiment_pure_text.py b/experiment_pure_text.py
index 4ab5c97..cfcbd46 100644
--- a/experiment_pure_text.py
+++ b/experiment_pure_text.py
@@ -111,21 +111,25 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
 if __name__ == "__main__":
     action_space = "pyautogui"
     example_class = "chrome"
-    example_id = "06fe7178-4491-4589-810f-2e2bc9502122"
+    example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263"
+    gpt4_model = "gpt-4-1106-preview"
+    gemini_model = "gemini-pro-vision"
 
     with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
         example = json.load(f)
     example["snapshot"] = "exp_setup4"
 
     api_key = os.environ.get("OPENAI_API_KEY")
-    agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
+    agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space)
 
     # api_key = os.environ.get("GENAI_API_KEY")
-    # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
+    # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space)
 
     root_trajectory_dir = "exp_trajectory"
 
-    example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
+    example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id)
+    # example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id)
+
     os.makedirs(example_trajectory_dir, exist_ok=True)
 
-    run_one_example(example, agent, 10, example_trajectory_dir)
+    run_one_example(example, agent, 15, example_trajectory_dir)
diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
index 7e4a74e..d6f83eb 100644
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -3,8 +3,11 @@ import xml.etree.ElementTree as ET
 from PIL import Image, ImageDraw, ImageFont
 
 
-def find_leaf_nodes(xml_file_path):
-    root = ET.fromstring(xml_file_path)
+def find_leaf_nodes(xlm_file_str):
+    if not xlm_file_str:
+        return []
+
+    root = ET.fromstring(xlm_file_str)
 
     # Recursive function to traverse the XML tree and collect leaf nodes
     def collect_leaf_nodes(node, leaf_nodes):
@@ -97,19 +100,3 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
 
     # Save the result
     image.save(output_image_file_path)
-
-
-if __name__ == '__main__':
-    with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f:
-        xml_string = f.read()
-    image_file_path = 'screenshot.png'  # Replace with your actual screenshot image path
-    output_image_file_path = 'annotated_screenshot.png'  # Replace with your desired output image path
-
-    leaf_nodes = find_leaf_nodes(xml_string)
-    filtered_nodes = filter_nodes(leaf_nodes)
-    print(f"Found {len(filtered_nodes)} filtered nodes")
-
-    for node in filtered_nodes:
-        print(node.tag, node.attrib)
-
-    draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path)
diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py
index 57a1634..aa19185 100644
--- a/mm_agents/gpt_4_agent.py
+++ b/mm_agents/gpt_4_agent.py
@@ -61,11 +61,27 @@ def parse_code_from_string(input_string):
     # so the code inside backticks can span multiple lines.
 
     # matches now contains all the captured code snippets
-    return matches
+
+    codes = []
+
+    for match in matches:
+        match = match.strip()
+        commands = ['WAIT', 'DONE', 'FAIL']  # fixme: updates this part when we have more commands
+
+        if match in commands:
+            codes.append(match.strip())
+        elif match.split('\n')[-1] in commands:
+            if len(match.split('\n')) > 1:
+                codes.append("\n".join(match.split('\n')[:-1]))
+            codes.append(match.split('\n')[-1])
+        else:
+            codes.append(match)
+
+    return codes
 
 
 class GPT4_Agent:
-    def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"):
+    def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"):
         self.instruction = instruction
         self.model = model
         self.max_tokens = max_tokens
@@ -121,14 +137,17 @@ class GPT4_Agent:
             ]
         })
 
+        # print(
+        #     "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
+        #         linearized_accessibility_tree)
+        # )
+
         traj_to_show = []
         for i in range(len(self.trajectory)):
             traj_to_show.append(self.trajectory[i]["content"][0]["text"])
             if len(self.trajectory[i]["content"]) > 1:
                 traj_to_show.append("screenshot_obs")
 
-        print("Trajectory:", traj_to_show)
-
         payload = {
             "model": self.model,
             "messages": self.trajectory,
diff --git a/mm_agents/gui_som/READAME.md b/mm_agents/gui_som/READAME.md
new file mode 100644
index 0000000..05b15ba
--- /dev/null
+++ b/mm_agents/gui_som/READAME.md
@@ -0,0 +1 @@
+Deprecated since we found we can use `accelaerator` to do the same thing. However, it may still be useful in the future when only screen access is available.
\ No newline at end of file