From 493b71982197971304212c630b3fcb9a2a99d940 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Mon, 15 Jan 2024 21:58:33 +0800 Subject: [PATCH 1/6] Add gemini agent implementation; Add missed requirements; Minor fix some small bugs --- desktop_env/envs/desktop_env.py | 7 +++- experiment.py | 26 ++++++------- mm_agents/fuyu_test.py | 20 ---------- mm_agents/gemini_agent.py | 65 ++++++++++++++++++++----------- mm_agents/gemini_agent_text.py | 0 mm_agents/gpt_4v_agent.py | 25 ++++-------- mm_agents/gpt_4v_agent_text.py | 0 mm_agents/gpt_4v_prompt_action.py | 2 +- mm_agents/gpt_4v_prompt_code.py | 18 ++++++--- requirements.txt | 2 + 10 files changed, 82 insertions(+), 83 deletions(-) delete mode 100644 mm_agents/fuyu_test.py create mode 100644 mm_agents/gemini_agent_text.py create mode 100644 mm_agents/gpt_4v_agent_text.py diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index e2ef08b..8a79d72 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -231,8 +231,11 @@ class DesktopEnv(gym.Env): # the set of all possible actions defined in the action representation self.controller.execute_action(action) elif self.action_space == "pyautogui": - # the set of all possible python commands insides `pyautogui` - self.controller.execute_python_command(action) + if action in ['WAIT', 'FAIL', 'DONE']: + self.controller.execute_action(action) + else: + # the set of all possible python commands insides `pyautogui` + self.controller.execute_python_command(action) observation = { "screenshot": self._get_obs(), diff --git a/experiment.py b/experiment.py index 48bb1dc..f6d1232 100644 --- a/experiment.py +++ b/experiment.py @@ -6,6 +6,7 @@ import sys from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent +from mm_agents.gemini_agent import GeminiPro_Agent # Logger Configs {{{ # logger = logging.getLogger() @@ -44,7 +45,7 @@ logger = logging.getLogger("desktopenv.experiment") PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" -def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True): +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") env = DesktopEnv( path_to_vm=PATH_TO_VM, @@ -53,7 +54,6 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra ) # reset the environment to certain snapshot observation = env.reset() - observation['instruction'] = example['instruction'] done = False step_num = 0 @@ -63,17 +63,14 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra while not done and step_num < max_steps: actions = agent.predict(observation) + step_num += 1 for action in actions: - step_num += 1 - # Capture the timestamp before executing the action action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) observation, reward, done, info = env.step(action) - observation['instruction'] = example['instruction'] - # Logging - logger.info("Step %d: %s", step_num, action) logger.info("Reward: %.2f", reward) logger.info("Done: %s", done) logger.info("Info: %s", info) @@ -114,19 +111,22 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra if __name__ == "__main__": action_space = "pyautogui" - example_class = "vlc" - example_id = 
"8f080098-ddb1-424c-b438-4e96e5e4786e" + example_class = "thunderbird" + example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "exp_setup" + example["snapshot"] = "exp_setup2" - api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, action_space=action_space) + # api_key = os.environ.get("OPENAI_API_KEY") + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + api_key = os.environ.get("GENAI_API_KEY") + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 2, example_trajectory_dir) + run_one_example(example, agent, 10, example_trajectory_dir) diff --git a/mm_agents/fuyu_test.py b/mm_agents/fuyu_test.py deleted file mode 100644 index ea77186..0000000 --- a/mm_agents/fuyu_test.py +++ /dev/null @@ -1,20 +0,0 @@ -from transformers import FuyuProcessor, FuyuForCausalLM -from PIL import Image - -image = Image.open("stackoverflow.png").convert("RGB") - -# load model and processor -model_id = "adept/fuyu-8b" -processor = FuyuProcessor.from_pretrained(model_id) -model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0") - -# prepare inputs for the model -text_prompt = "Description:\n" - -inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0") - -# autoregressively generate text -generation_output = model.generate(**inputs, max_new_tokens=100) -generation_text = processor.batch_decode(generation_output[:, -100:], skip_special_tokens=True) - -print(generation_text) diff --git a/mm_agents/gemini_agent.py b/mm_agents/gemini_agent.py index 37e22f2..1593c9a 100644 --- a/mm_agents/gemini_agent.py +++ b/mm_agents/gemini_agent.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, List import PIL.Image import google.generativeai as genai @@ -9,10 +9,13 @@ from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE class GeminiPro_Agent: - def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"): - genai.configure(api_key) + def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0, + action_space="computer_13"): + genai.configure(api_key=api_key) + self.instruction = instruction self.model = genai.GenerativeModel(model) self.max_tokens = max_tokens + self.temperature = temperature self.action_space = action_space self.trajectory = [ @@ -22,22 +25,39 @@ class GeminiPro_Agent: { "computer_13": SYS_PROMPT_ACTION, "pyautogui": SYS_PROMPT_CODE - }[action_space] + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) ] } ] - def predict(self, obs: Dict): + def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. + Only support single-round conversation, only fill-in the last desktop screenshot. 
""" img = PIL.Image.open(obs["screenshot"]) self.trajectory.append({ "role": "user", - "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]), img] + "parts": ["What's the next step that you will do to help with the task?", img] }) + # todo: Remove this step once the Gemini supports multi-round conversation + all_message_str = "" + for i in range(len(self.trajectory)): + if i == 0: + all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" + elif i % 2 == 1: + all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" + else: + all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" + + all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) + + message_for_gemini = { + "role": "user", + "parts": [all_message_str, img] + } + traj_to_show = [] for i in range(len(self.trajectory)): traj_to_show.append(self.trajectory[i]["parts"][0]) @@ -46,29 +66,28 @@ class GeminiPro_Agent: print("Trajectory:", traj_to_show) - response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens) + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) try: - # fixme: change to fit the new response format from gemini pro - actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + response_text = response.text except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) - actions = None + return [] + + try: + actions = self.parse_actions(response_text) + except: + print("Failed to parse action from response:", response_text) + actions = [] return actions def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) diff --git a/mm_agents/gemini_agent_text.py b/mm_agents/gemini_agent_text.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 203b40c..81d128e 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,7 +1,7 @@ import base64 import json import re -from typing import Dict +from typing import Dict, List import requests @@ -63,7 +63,8 @@ def parse_code_from_string(input_string): class GPT4v_Agent: - def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): + self.instruction = instruction self.model = model self.max_tokens = max_tokens self.action_space = action_space @@ -82,13 +83,13 @@ class GPT4v_Agent: "text": { "computer_13": SYS_PROMPT_ACTION, "pyautogui": SYS_PROMPT_CODE - }[action_space] + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) }, ] } ] - def predict(self, obs: Dict): + def predict(self, obs: Dict) -> List: """ Predict the next action(s) based on the current observation. """ @@ -98,8 +99,7 @@ class GPT4v_Agent: "content": [ { "type": "text", - "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format( - obs["instruction"]) + "text": "What's the next step that you will do to help with the task?" 
}, { "type": "image_url", @@ -128,23 +128,12 @@ class GPT4v_Agent: try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) except: - # todo: add error handling - print("Failed to parse action from response:", response.json()['choices'][0]['message']['content']) + print("Failed to parse action from response:", response.json()) actions = None return actions def parse_actions(self, response: str): - # response example - """ - ```json - { - "action_type": "CLICK", - "click_type": "RIGHT" - } - ``` - """ - # parse from the response if self.action_space == "computer_13": actions = parse_actions_from_string(response) diff --git a/mm_agents/gpt_4v_agent_text.py b/mm_agents/gpt_4v_agent_text.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gpt_4v_prompt_action.py b/mm_agents/gpt_4v_prompt_action.py index 650b136..4323df6 100644 --- a/mm_agents/gpt_4v_prompt_action.py +++ b/mm_agents/gpt_4v_prompt_action.py @@ -237,7 +237,7 @@ for example, format as: ``` REMEMBER: -For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. You MUST wrap the dict with backticks (\`). You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. You CAN predict multiple actions at one step, but you should only return one action for each step. diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py index 17e8c9d..aa768e9 100644 --- a/mm_agents/gpt_4v_prompt_code.py +++ b/mm_agents/gpt_4v_prompt_code.py @@ -1,11 +1,17 @@ SYS_PROMPT = """ -You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. -For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. - +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. You are required to use `pyautogui` to perform the action. Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```; +When you think the task is done, return ```DONE```. -When you think you have to wait for some time, return `WAIT`. -When you think the task can not be done, return `FAIL`. -When you think the task is done, return `DONE`. +First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
""" \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a13f733..f2c4cc3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -30,3 +30,5 @@ ImageHash scikit-image librosa pymupdf +chardet +playwright From 9fe3a5db3b3a1e7430173dc1029b0b4348e88da8 Mon Sep 17 00:00:00 2001 From: tsuky_chen <3107760494@qq.com> Date: Tue, 16 Jan 2024 01:07:40 +0800 Subject: [PATCH 2/6] update libreoffice impress eval --- desktop_env/evaluators/metrics/__init__.py | 2 +- desktop_env/evaluators/metrics/impress.py | 23 ++++++++++ .../39478d4a-1049-456f-aa77-407811393add.json | 37 +++++++++++++++ .../9ec204e4-f0a3-42f8-8458-b772a6797cab.json | 40 ++++++++++++++-- .../af23762e-2bfd-4a1d-aada-20fa8de9ce07.json | 40 ++++++++++++++-- .../c59742c0-4323-4b9d-8a02-723c251deaa0.json | 46 +++++++++++++++++-- 6 files changed, 172 insertions(+), 16 deletions(-) create mode 100644 evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json diff --git a/desktop_env/evaluators/metrics/__init__.py b/desktop_env/evaluators/metrics/__init__.py index 200b56f..59ebab2 100644 --- a/desktop_env/evaluators/metrics/__init__.py +++ b/desktop_env/evaluators/metrics/__init__.py @@ -14,4 +14,4 @@ from .gimp import increase_saturation, decrease_brightness, check_file_exists, c from .general import check_csv, check_accessibility_tree, check_list, run_sqlite3, check_json from .thunderbird import check_thunderbird_prefs, check_thunderbird_filter from .vscode import compare_text_file, compare_config, compare_answer, is_extension_installed -from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines +from .impress import check_slide_numbers_color, compare_pptx_files, check_for_two_lines, check_for_audio, check_formula_shape, check_file_exists diff --git a/desktop_env/evaluators/metrics/impress.py b/desktop_env/evaluators/metrics/impress.py index 7268aef..719ae56 100644 --- a/desktop_env/evaluators/metrics/impress.py +++ b/desktop_env/evaluators/metrics/impress.py @@ -71,6 +71,29 @@ def check_file_exists(directory, filename): file_path = os.path.join(directory, filename) return 1 if os.path.isfile(file_path) else 0 +def has_audio_on_page(slide): + for shape in slide.shapes: + if shape.shape_type == 13: + return True + return False + +def check_for_audio(prs): + prs = Presentation(prs) + for i, slide in enumerate(prs.slides): + if has_audio_on_page(slide): + return 1 + return 0 + +def check_formula_shape(prs): + prs = Presentation(prs) + slide = prs.slides[13] + + for shape in slide.shapes: + + if shape.has_text_frame and shape.shape_type == 1: + return 1 + return 0 + if __name__ == "__main__": path1 = "../../任务数据/LibreOffice Impress/Change_Color_Slide_Number_gold_textbox.pptx" presentation = Presentation(path1) diff --git a/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json new file mode 100644 index 0000000..e5ee5d6 --- /dev/null +++ b/evaluation_examples/examples/libreoffice_impress/39478d4a-1049-456f-aa77-407811393add.json @@ -0,0 +1,37 @@ +{ + "id": "39478d4a-1049-456f-aa77-407811393add", + "snapshot": "libreoffice_impress", + "instruction": "Could you help me add a hat symbol to \"Y\" in text in page 14? 
I need to represent vector sign for my statistics class.", + "source": "https://www.reddit.com/r/libreoffice/comments/jul3o8/putting_cap_or_hat_or_carat_symbol_in_libre/", + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1WT1-L0iiIlF2kuIK77IDxTfBaQ0X0BbX&export=download&authuser=0&confirm=t&uuid=0b69767e-1f3e-49ce-88a7-1036ef25bcaf&at=APZUnTXZ_sqEZUrHNx1edWep017b:1705337750065", + "path": "Desktop/Ch5.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Ch5.pptx" + } + } + ], + "trajectory": "trajectories/", + "related_apps": [ + "libreoffice_impress" + ], + "evaluator": { + "func": "check_formula_shape", + "result": { + "type": "vm_file", + "path": "Desktop/Ch5.pptx", + "dest": "Ch5.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json index 8d8255e..c825ce7 100644 --- a/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json +++ b/evaluation_examples/examples/libreoffice_impress/9ec204e4-f0a3-42f8-8458-b772a6797cab.json @@ -1,12 +1,42 @@ { "id": "9ec204e4-f0a3-42f8-8458-b772a6797cab", "snapshot": "libreoffice_impress", - "instruction": "Could you help me copy and paste the first 3 slides?", + "instruction": "Make a duplicate of the last two slides for me, please.", "source": "https://www.tiktok.com/@lil.d1rt_/video/7247574148887629083", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1ad5vUXasdN2MypNap-pBUmgPg5FaxmDA&export=download&authuser=0&confirm=t&uuid=9dc069bb-edd3-4ae9-b356-4c6543778584&at=APZUnTXbgLHjv1MhMFy1IfZL3fQI:1705338128337", + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842", + "dest": "MLA_Workshop_061X_Works_Cited_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/MLA_Workshop_061X_Works_Cited.pptx", + "dest": "MLA_Workshop_061X_Works_Cited.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json index 93d0a94..db71d72 100644 --- a/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json +++ b/evaluation_examples/examples/libreoffice_impress/af23762e-2bfd-4a1d-aada-20fa8de9ce07.json @@ -1,12 +1,42 @@ { "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07", "snapshot": "libreoffice_impress", - "instruction": "Please make a summary slide of the whole presentation for me", + "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide. 
Could you make a summary slide for me?", "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1zmtomIzSgSjnYZbhgtH4n90L5mV9bS7L&export=download&authuser=0&confirm=t&uuid=1966ab03-1e17-447a-aeb4-71a753eca196&at=APZUnTVoWHV6z8LJi5VHADuQE6VG:1705319233167", + "path": "Desktop/Forests.pptx" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Forests.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "compare_pptx_files", + "expected": { + "type": "cloud_file", + "path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474", + "dest": "Forests_Gold.docx" + }, + "result": { + "type": "vm_file", + "path": "Desktop/Forests.pptx", + "dest": "Forests.pptx" + } + } +} \ No newline at end of file diff --git a/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json b/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json index 5321e76..9eb877c 100644 --- a/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json +++ b/evaluation_examples/examples/libreoffice_impress/c59742c0-4323-4b9d-8a02-723c251deaa0.json @@ -1,12 +1,48 @@ { "id": "c59742c0-4323-4b9d-8a02-723c251deaa0", "snapshot": "libreoffice_impress", - "instruction": "Could you help me add video into the presentation file?", + "instruction": "I am making PPT about the history of baseball. I want to add an introduction audio named \"Baseball.mp3\" on the Desktop into my PPT, but I do not know how. 
Could you help me add audio into my presentation file?", "source": "https://www.reddit.com/r/libreoffice/comments/17lcdrp/audio_not_supported_in_libreoffice_impress/", - "config": [], + "config": [ + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1Oy5Zga6PnvpIwJ1OHMdFf3mSbm_YClHh&export=download&authuser=0&confirm=t&uuid=da1db839-da27-4bb5-a4fc-0358342f493b&at=APZUnTVsLyGsj8qI1rPyGAUTCX4F:1705324246149", + "path": "Desktop/Mady_and_Mia_Baseball.pptx" + } + ] + } + }, + { + "type": "download", + "parameters": { + "files": [ + { + "url": "https://drive.usercontent.google.com/download?id=1a-DaT1LUuvh55GsjpJkhpjcDTh8CPzkl&export=download&authuser=0&confirm=t&uuid=2bfb7700-5222-47ad-9aee-e5c22b50dbfe&at=APZUnTW7RQR5HI9giEl-cirnSX3q:1705320977391", + "path": "Desktop/Baseball.mp3" + } + ] + } + }, + { + "type": "open", + "parameters": { + "path": "Desktop/Mady_and_Mia_Baseball.pptx" + } + } + ], "trajectory": "trajectories/", "related_apps": [ - "" + "libreoffice_impress" ], - "evaluator": "evaluation_dir" -} + "evaluator": { + "func": "check_for_audio", + "result": { + "type": "vm_file", + "path": "Desktop/Mady_and_Mia_Baseball.pptx", + "dest": "Mady_and_Mia_Baseball.pptx" + } + } +} \ No newline at end of file From 8efa6929511471ebd05e69b5e81c2ab520c442b5 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 11:58:23 +0800 Subject: [PATCH 3/6] Add raw accessibility-tree based prompting method (but the tokens are too large); Minor fix some small bugs --- desktop_env/envs/desktop_env.py | 5 +- desktop_env/evaluators/metrics/general.py | 2 +- experiment.py | 2 +- mm_agents/gemini_agent_text.py | 0 mm_agents/gemini_pro_agent.py | 110 +++++++++++++ ...ni_agent.py => gemini_pro_vision_agent.py} | 4 +- mm_agents/gpt_4_agent.py | 150 ++++++++++++++++++ mm_agents/gpt_4v_agent.py | 2 + mm_agents/gpt_4v_agent_text.py | 0 mm_agents/gpt_4v_prompt_code.py | 1 + 10 files changed, 272 insertions(+), 4 deletions(-) delete mode 100644 mm_agents/gemini_agent_text.py create mode 100644 mm_agents/gemini_pro_agent.py rename mm_agents/{gemini_agent.py => gemini_pro_vision_agent.py} (96%) create mode 100644 mm_agents/gpt_4_agent.py delete mode 100644 mm_agents/gpt_4v_agent_text.py diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py index 8a79d72..786ed72 100644 --- a/desktop_env/envs/desktop_env.py +++ b/desktop_env/envs/desktop_env.py @@ -204,7 +204,10 @@ class DesktopEnv(gym.Env): time.sleep(5) logger.info("Environment setup complete.") - observation = {"screenshot": self._get_obs()} + observation = { + "screenshot": self._get_obs(), + "accessibility_tree": self.controller.get_accessibility_tree(), + } return observation def step(self, action, pause=0.5): diff --git a/desktop_env/evaluators/metrics/general.py b/desktop_env/evaluators/metrics/general.py index b0433c3..6246861 100644 --- a/desktop_env/evaluators/metrics/general.py +++ b/desktop_env/evaluators/metrics/general.py @@ -4,7 +4,7 @@ import functools import operator import re from numbers import Number -from typing import Callable, Any +from typing import Callable, Any, Union from typing import Dict, List, Pattern import lxml.etree diff --git a/experiment.py b/experiment.py index f6d1232..8e7f8b5 100644 --- a/experiment.py +++ b/experiment.py @@ -6,7 +6,7 @@ import sys from desktop_env.envs.desktop_env import DesktopEnv from mm_agents.gpt_4v_agent import GPT4v_Agent -from mm_agents.gemini_agent import GeminiPro_Agent +from 
mm_agents.gemini_pro_agent import GeminiPro_Agent # Logger Configs {{{ # logger = logging.getLogger() diff --git a/mm_agents/gemini_agent_text.py b/mm_agents/gemini_agent_text.py deleted file mode 100644 index e69de29..0000000 diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py new file mode 100644 index 0000000..21a54e7 --- /dev/null +++ b/mm_agents/gemini_pro_agent.py @@ -0,0 +1,110 @@ +from typing import Dict, List + +import PIL.Image +import google.generativeai as genai + +from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string +from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE + + +class GeminiPro_Agent: + def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0, + action_space="computer_13"): + genai.configure(api_key=api_key) + self.instruction = instruction + self.model = genai.GenerativeModel(model) + self.max_tokens = max_tokens + self.temperature = temperature + self.action_space = action_space + + self.trajectory = [ + { + "role": "system", + "parts": [ + { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + ] + } + ] + + def predict(self, obs: Dict) -> List: + """ + Predict the next action(s) based on the current observation. + Only support single-round conversation, only fill-in the last desktop screenshot. + """ + accessibility_tree = obs["accessibility_tree"] + self.trajectory.append({ + "role": "user", + "parts": ["Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)] + }) + + # todo: Remove this step once the Gemini supports multi-round conversation + all_message_str = "" + for i in range(len(self.trajectory)): + if i == 0: + all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n" + elif i % 2 == 1: + all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n" + else: + all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n" + + all_message_str += all_message_template.format(self.trajectory[i]["parts"][0]) + + print("All message: >>>>>>>>>>>>>>>> ") + print( + all_message_str + ) + + message_for_gemini = { + "role": "user", + "parts": [all_message_str] + } + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["parts"][0]) + if len(self.trajectory[i]["parts"]) > 1: + traj_to_show.append("screenshot_obs") + + print("Trajectory:", traj_to_show) + + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) + + try: + response_text = response.text + except: + return [] + + try: + actions = self.parse_actions(response_text) + except: + print("Failed to parse action from response:", response_text) + actions = [] + + return actions + + def parse_actions(self, response: str): + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "parts": [response] + }) + + return actions diff --git a/mm_agents/gemini_agent.py 
b/mm_agents/gemini_pro_vision_agent.py similarity index 96% rename from mm_agents/gemini_agent.py rename to mm_agents/gemini_pro_vision_agent.py index 1593c9a..e4bb9d1 100644 --- a/mm_agents/gemini_agent.py +++ b/mm_agents/gemini_pro_vision_agent.py @@ -8,7 +8,7 @@ from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE -class GeminiPro_Agent: +class GeminiProV_Agent: def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0, action_space="computer_13"): genai.configure(api_key=api_key) @@ -93,6 +93,8 @@ class GeminiPro_Agent: actions = parse_actions_from_string(response) elif self.action_space == "pyautogui": actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) # add action into the trajectory self.trajectory.append({ diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py new file mode 100644 index 0000000..8d2b82d --- /dev/null +++ b/mm_agents/gpt_4_agent.py @@ -0,0 +1,150 @@ +import base64 +import json +import re +from typing import Dict, List + +import requests + +from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION +from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE + + +# Function to encode the image +def encode_image(image_path): + with open(image_path, "rb") as image_file: + return base64.b64encode(image_file.read()).decode('utf-8') + + +def parse_actions_from_string(input_string): + # Search for a JSON string within the input string + actions = [] + matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL) + if matches: + # Assuming there's only one match, parse the JSON string into a dictionary + try: + for match in matches: + action_dict = json.loads(match) + actions.append(action_dict) + return actions + except json.JSONDecodeError as e: + return f"Failed to parse JSON: {e}" + else: + matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL) + if matches: + # Assuming there's only one match, parse the JSON string into a dictionary + try: + for match in matches: + action_dict = json.loads(match) + actions.append(action_dict) + return actions + except json.JSONDecodeError as e: + return f"Failed to parse JSON: {e}" + else: + try: + action_dict = json.loads(input_string) + return [action_dict] + except json.JSONDecodeError as e: + raise ValueError("Invalid response format: " + input_string) + + +def parse_code_from_string(input_string): + # This regular expression will match both ```code``` and ```python code``` + # and capture the `code` part. It uses a non-greedy match for the content inside. + pattern = r"```(?:\w+\s+)?(.*?)```" + # Find all non-overlapping matches in the string + matches = re.findall(pattern, input_string, re.DOTALL) + + # The regex above captures the content inside the triple backticks. + # The `re.DOTALL` flag allows the dot `.` to match newline characters as well, + # so the code inside backticks can span multiple lines. 
+ + # matches now contains all the captured code snippets + return matches + + +class GPT4_Agent: + def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"): + self.instruction = instruction + self.model = model + self.max_tokens = max_tokens + self.action_space = action_space + + self.headers = { + "Content-Type": "application/json", + "Authorization": f"Bearer {api_key}" + } + + self.trajectory = [ + { + "role": "system", + "content": [ + { + "type": "text", + "text": { + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + }, + ] + } + ] + + def predict(self, obs: Dict) -> List: + """ + Predict the next action(s) based on the current observation. + """ + accessibility_tree = obs["accessibility_tree"] + self.trajectory.append({ + "role": "user", + "content": [ + { + "type": "text", + "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree) + } + ] + }) + + traj_to_show = [] + for i in range(len(self.trajectory)): + traj_to_show.append(self.trajectory[i]["content"][0]["text"]) + if len(self.trajectory[i]["content"]) > 1: + traj_to_show.append("screenshot_obs") + + print("Trajectory:", traj_to_show) + + payload = { + "model": self.model, + "messages": self.trajectory, + "max_tokens": self.max_tokens + } + response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) + + try: + actions = self.parse_actions(response.json()['choices'][0]['message']['content']) + except: + print("Failed to parse action from response:", response.json()) + actions = None + + return actions + + def parse_actions(self, response: str): + # parse from the response + if self.action_space == "computer_13": + actions = parse_actions_from_string(response) + elif self.action_space == "pyautogui": + actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) + + # add action into the trajectory + self.trajectory.append({ + "role": "assistant", + "content": [ + { + "type": "text", + "text": response + }, + ] + }) + + return actions diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index 81d128e..fdf6adb 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -139,6 +139,8 @@ class GPT4v_Agent: actions = parse_actions_from_string(response) elif self.action_space == "pyautogui": actions = parse_code_from_string(response) + else: + raise ValueError("Invalid action space: " + self.action_space) # add action into the trajectory self.trajectory.append({ diff --git a/mm_agents/gpt_4v_agent_text.py b/mm_agents/gpt_4v_agent_text.py deleted file mode 100644 index e69de29..0000000 diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py index aa768e9..8dba73b 100644 --- a/mm_agents/gpt_4v_prompt_code.py +++ b/mm_agents/gpt_4v_prompt_code.py @@ -2,6 +2,7 @@ SYS_PROMPT = """ You are an agent which follow my instruction and perform desktop computer tasks as instructed. You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image. 
+ You are required to use `pyautogui` to perform the action. Return one line or multiple lines of python code to perform the action each time, be time efficient. You ONLY need to return the code inside a code block, like this: From 48a86d36cf84556933452dced412dc4b08261d65 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 12:15:21 +0800 Subject: [PATCH 4/6] Minor updates --- README.md | 9 +- experiment_pure_text.py | 131 +++++++++++++++++ mm_agents/gpt_4_prompt_action.py | 244 +++++++++++++++++++++++++++++++ mm_agents/gpt_4_prompt_code.py | 18 +++ 4 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 experiment_pure_text.py create mode 100644 mm_agents/gpt_4_prompt_action.py create mode 100644 mm_agents/gpt_4_prompt_code.py diff --git a/README.md b/README.md index b7d56df..5fd8aa6 100644 --- a/README.md +++ b/README.md @@ -23,4 +23,11 @@ todo - [x] Error handling during file passing and file opening, etc. - [x] Add accessibility tree from the OS into the observation space - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation -- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient \ No newline at end of file +- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient + +## Road map of benchmark, tools and resources (Proposed) +- [ ] Improve the annotation tool base on DuckTrack, make it more robust which align on accessibility tree +- [ ] Annotate the steps of doing the task +- [ ] Build a website for the project +- [ ] Crawl all resources we explored from the internet, and make it easy to access +- [ ] Set up ways for community to contribute new examples diff --git a/experiment_pure_text.py b/experiment_pure_text.py new file mode 100644 index 0000000..011c7bf --- /dev/null +++ b/experiment_pure_text.py @@ -0,0 +1,131 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4_agent import GPT4_Agent +from mm_agents.gemini_pro_agent import GeminiPro_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = 
r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup2" + + # api_key = os.environ.get("OPENAI_API_KEY") + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + api_key = os.environ.get("GENAI_API_KEY") + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 10, example_trajectory_dir) diff --git a/mm_agents/gpt_4_prompt_action.py b/mm_agents/gpt_4_prompt_action.py new file mode 100644 index 0000000..3019074 --- /dev/null +++ b/mm_agents/gpt_4_prompt_action.py @@ -0,0 +1,244 @@ +SYS_PROMPT = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. 
+ +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + 
"range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""" \ No newline at end of file diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py new file mode 100644 index 0000000..b057da6 --- /dev/null +++ b/mm_agents/gpt_4_prompt_code.py @@ -0,0 +1,18 @@ +SYS_PROMPT = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +You are required to use `pyautogui` to perform the action. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
+""" \ No newline at end of file From 186bf2e97cdb69d1b628d0a8fefab990b19d9ee7 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 16:43:32 +0800 Subject: [PATCH 5/6] Implement heuristic cutting on the accessibility tree to get the important nodes; Finish accessibility tree text agent --- desktop_env/controllers/python.py | 2 + experiment_pure_text.py | 12 +- mm_agents/accessibility_tree_wrap/__init__.py | 0 .../heuristic_retrieve.py | 115 ++++++++++++++++++ .../relevant_retrieve.py | 0 mm_agents/gemini_pro_agent.py | 44 +++++-- mm_agents/gemini_pro_vision_agent.py | 22 ++-- mm_agents/gpt_4_agent.py | 36 +++++- mm_agents/gpt_4_prompt_code.py | 2 +- mm_agents/gpt_4v_agent.py | 17 ++- mm_agents/gpt_4v_prompt_code.py | 2 +- 11 files changed, 218 insertions(+), 34 deletions(-) create mode 100644 mm_agents/accessibility_tree_wrap/__init__.py create mode 100644 mm_agents/accessibility_tree_wrap/heuristic_retrieve.py create mode 100644 mm_agents/accessibility_tree_wrap/relevant_retrieve.py diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py index 1c37d4a..a4b2abb 100644 --- a/desktop_env/controllers/python.py +++ b/desktop_env/controllers/python.py @@ -83,6 +83,8 @@ class PythonController: """ Executes an action on the server computer. """ + if action in ['WAIT', 'FAIL', 'DONE']: + return action_type = action["action_type"] parameters = action["parameters"] if "parameters" in action else {} diff --git a/experiment_pure_text.py b/experiment_pure_text.py index 011c7bf..4ab5c97 100644 --- a/experiment_pure_text.py +++ b/experiment_pure_text.py @@ -111,17 +111,17 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr if __name__ == "__main__": action_space = "pyautogui" example_class = "chrome" - example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" + example_id = "06fe7178-4491-4589-810f-2e2bc9502122" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) - example["snapshot"] = "exp_setup2" + example["snapshot"] = "exp_setup4" - # api_key = os.environ.get("OPENAI_API_KEY") - # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + api_key = os.environ.get("OPENAI_API_KEY") + agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) - api_key = os.environ.get("GENAI_API_KEY") - agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + # api_key = os.environ.get("GENAI_API_KEY") + # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" diff --git a/mm_agents/accessibility_tree_wrap/__init__.py b/mm_agents/accessibility_tree_wrap/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py new file mode 100644 index 0000000..7e4a74e --- /dev/null +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -0,0 +1,115 @@ +import xml.etree.ElementTree as ET + +from PIL import Image, ImageDraw, ImageFont + + +def find_leaf_nodes(xml_file_path): + root = ET.fromstring(xml_file_path) + + # Recursive function to traverse the XML tree and collect leaf nodes + def collect_leaf_nodes(node, leaf_nodes): + # If the node has no children, it is a leaf node, add it to the list + if not list(node): + leaf_nodes.append(node) + # 
If the node has children, recurse on each child + for child in node: + collect_leaf_nodes(child, leaf_nodes) + + # List to hold all leaf nodes + leaf_nodes = [] + collect_leaf_nodes(root, leaf_nodes) + return leaf_nodes + + +def filter_nodes(nodes): + filtered_nodes = [] + + for node in nodes: + if not node.get('{uri:deskat:state.at-spi.gnome.org}visible', None) == 'true': + # Not visible + continue + # Check if the node is a 'panel' + if node.tag == 'panel': + # Check if the 'panel' represents an interactive element + # or if it has certain attributes that are of interest. + # Add your conditions here... + if node.get('{uri:deskat:state.at-spi.gnome.org}focusable', 'false') == 'true': + filtered_nodes.append(node) + elif node.tag == 'text': + continue + else: + coords = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', '))) + if coords[0] < 0 or coords[1] < 0: + continue + size = tuple(map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size').strip('()').split(', '))) + if size[0] <= 0 or size[1] <= 0: + continue + # Node is not a 'panel', add to the list. + filtered_nodes.append(node) + + return filtered_nodes + + +def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): + # Load the screenshot image + image = Image.open(image_file_path) + draw = ImageDraw.Draw(image) + + # Optional: Load a font. If you don't specify a font, a default one will be used. + try: + # Adjust the path to the font file you have or use a default one + font = ImageFont.truetype("arial.ttf", 20) + except IOError: + # Fallback to a basic font if the specified font can't be loaded + font = ImageFont.load_default() + + # Loop over all the visible nodes and draw their bounding boxes + for index, _node in enumerate(nodes): + coords_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord') + size_str = _node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + + if coords_str and size_str: + try: + # Parse the coordinates and size from the strings + coords = tuple(map(int, coords_str.strip('()').split(', '))) + size = tuple(map(int, size_str.strip('()').split(', '))) + + # Check for negative sizes + if size[0] <= 0 or size[1] <= 0: + raise ValueError(f"Size must be positive, got: {size}") + + # Calculate the bottom-right corner of the bounding box + bottom_right = (coords[0] + size[0], coords[1] + size[1]) + + # Check that bottom_right > coords (x1 >= x0, y1 >= y0) + if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]: + raise ValueError(f"Invalid coordinates or size, coords: {coords}, size: {size}") + + # Draw rectangle on image + draw.rectangle([coords, bottom_right], outline="red", width=2) + + # Draw index number at the bottom left of the bounding box + text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right + draw.text(text_position, str(index), font=font, fill="purple") + + except ValueError as e: + pass + + # Save the result + image.save(output_image_file_path) + + +if __name__ == '__main__': + with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f: + xml_string = f.read() + image_file_path = 'screenshot.png' # Replace with your actual screenshot image path + output_image_file_path = 'annotated_screenshot.png' # Replace with your desired output image path + + leaf_nodes = find_leaf_nodes(xml_string) + filtered_nodes = filter_nodes(leaf_nodes) + print(f"Found {len(filtered_nodes)} filtered nodes") + + for node in filtered_nodes: + print(node.tag, 
node.attrib) + + draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path) diff --git a/mm_agents/accessibility_tree_wrap/relevant_retrieve.py b/mm_agents/accessibility_tree_wrap/relevant_retrieve.py new file mode 100644 index 0000000..e69de29 diff --git a/mm_agents/gemini_pro_agent.py b/mm_agents/gemini_pro_agent.py index 21a54e7..26f9c0e 100644 --- a/mm_agents/gemini_pro_agent.py +++ b/mm_agents/gemini_pro_agent.py @@ -1,11 +1,12 @@ +import time from typing import Dict, List -import PIL.Image import google.generativeai as genai -from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE +from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string class GeminiPro_Agent: @@ -36,9 +37,25 @@ class GeminiPro_Agent: Only support single-round conversation, only fill-in the last desktop screenshot. """ accessibility_tree = obs["accessibility_tree"] + + leaf_nodes = find_leaf_nodes(accessibility_tree) + filtered_nodes = filter_nodes(leaf_nodes) + + linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" + # Linearize the accessibility tree nodes into a table format + + for node in filtered_nodes: + linearized_accessibility_tree += node.tag + "\t" + linearized_accessibility_tree += node.attrib.get('name') + "\t" + linearized_accessibility_tree += node.attrib.get( + '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" + linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" + self.trajectory.append({ "role": "user", - "parts": ["Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)] + "parts": [ + "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree)] }) # todo: Remove this step once the Gemini supports multi-round conversation @@ -71,13 +88,20 @@ class GeminiPro_Agent: print("Trajectory:", traj_to_show) - response = self.model.generate_content( - message_for_gemini, - generation_config={ - "max_output_tokens": self.max_tokens, - "temperature": self.temperature - } - ) + while True: + try: + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass try: response_text = response.text diff --git a/mm_agents/gemini_pro_vision_agent.py b/mm_agents/gemini_pro_vision_agent.py index e4bb9d1..2d5d365 100644 --- a/mm_agents/gemini_pro_vision_agent.py +++ b/mm_agents/gemini_pro_vision_agent.py @@ -1,3 +1,4 @@ +import time from typing import Dict, List import PIL.Image @@ -66,13 +67,20 @@ class GeminiProV_Agent: print("Trajectory:", traj_to_show) - response = self.model.generate_content( - message_for_gemini, - generation_config={ - "max_output_tokens": self.max_tokens, - "temperature": self.temperature - } - ) + while True: + try: + response = self.model.generate_content( + message_for_gemini, + generation_config={ + "max_output_tokens": self.max_tokens, + "temperature": self.temperature + } + ) + break + except: + 
print("Failed to generate response, retrying...") + time.sleep(5) + pass try: response_text = response.text diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py index 8d2b82d..57a1634 100644 --- a/mm_agents/gpt_4_agent.py +++ b/mm_agents/gpt_4_agent.py @@ -1,10 +1,12 @@ import base64 import json import re +import time from typing import Dict, List import requests +from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE @@ -81,9 +83,9 @@ class GPT4_Agent: { "type": "text", "text": { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) }, ] } @@ -94,12 +96,27 @@ class GPT4_Agent: Predict the next action(s) based on the current observation. """ accessibility_tree = obs["accessibility_tree"] + + leaf_nodes = find_leaf_nodes(accessibility_tree) + filtered_nodes = filter_nodes(leaf_nodes) + + linearized_accessibility_tree = "tag\ttext\tposition\tsize\n" + # Linearize the accessibility tree nodes into a table format + + for node in filtered_nodes: + linearized_accessibility_tree += node.tag + "\t" + linearized_accessibility_tree += node.attrib.get('name') + "\t" + linearized_accessibility_tree += node.attrib.get( + '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t" + linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n" + self.trajectory.append({ "role": "user", "content": [ { "type": "text", - "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree) + "text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + linearized_accessibility_tree) } ] }) @@ -117,7 +134,16 @@ class GPT4_Agent: "messages": self.trajectory, "max_tokens": self.max_tokens } - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) + + while True: + try: + response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, + json=payload) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py index b057da6..25e4083 100644 --- a/mm_agents/gpt_4_prompt_code.py +++ b/mm_agents/gpt_4_prompt_code.py @@ -11,7 +11,7 @@ You ONLY need to return the code inside a code block, like this: ``` Specially, it is also allowed to return the following special code: When you think you have to wait for some time, return ```WAIT```; -When you think the task can not be done, return ```FAIL```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; When you think the task is done, return ```DONE```. First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py index fdf6adb..d594b76 100644 --- a/mm_agents/gpt_4v_agent.py +++ b/mm_agents/gpt_4v_agent.py @@ -1,6 +1,7 @@ import base64 import json import re +import time from typing import Dict, List import requests @@ -81,9 +82,9 @@ class GPT4v_Agent: { "type": "text", "text": { - "computer_13": SYS_PROMPT_ACTION, - "pyautogui": SYS_PROMPT_CODE - }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) + "computer_13": SYS_PROMPT_ACTION, + "pyautogui": SYS_PROMPT_CODE + }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction) }, ] } @@ -123,8 +124,16 @@ class GPT4v_Agent: "messages": self.trajectory, "max_tokens": self.max_tokens } - response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload) + while True: + try: + response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, + json=payload) + break + except: + print("Failed to generate response, retrying...") + time.sleep(5) + pass try: actions = self.parse_actions(response.json()['choices'][0]['message']['content']) except: diff --git a/mm_agents/gpt_4v_prompt_code.py b/mm_agents/gpt_4v_prompt_code.py index 8dba73b..8f256da 100644 --- a/mm_agents/gpt_4v_prompt_code.py +++ b/mm_agents/gpt_4v_prompt_code.py @@ -11,7 +11,7 @@ You ONLY need to return the code inside a code block, like this: ``` Specially, it is also allowed to return the following special code: When you think you have to wait for some time, return ```WAIT```; -When you think the task can not be done, return ```FAIL```; +When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task; When you think the task is done, return ```DONE```. First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. 
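Patch 5 guards each model call with an unbounded `while True` loop and a bare `except`, which retries forever even on non-transient failures such as an invalid API key. A bounded retry with backoff illustrates the same idea for the OpenAI path; this is a sketch under the assumption that only `requests`-level errors should be retried, and the helper name and limits are mine, not the patch's:

```python
import time

import requests


def post_with_retry(url, headers, payload, max_attempts=5, base_delay=5):
    """POST with a capped number of retries; re-raise the last network error instead of looping forever."""
    for attempt in range(1, max_attempts + 1):
        try:
            return requests.post(url, headers=headers, json=payload, timeout=60)
        except requests.RequestException as exc:
            if attempt == max_attempts:
                raise
            print(f"Request failed ({exc}), retrying in {base_delay * attempt} seconds...")
            time.sleep(base_delay * attempt)
```

The Gemini path could follow the same pattern around `self.model.generate_content`, catching a narrower exception type than the bare `except` used in the patch.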
From 20b1d950a0b06de827e809eec548929992e20a6e Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 22:00:01 +0800 Subject: [PATCH 6/6] FIx corner cases (val connection in chrome when using playwright, and action parsing for agent, and accessibility tree xml handling) --- desktop_env/evaluators/getters/chrome.py | 16 ++++++++--- experiment_pure_text.py | 14 ++++++---- .../heuristic_retrieve.py | 23 ++++------------ mm_agents/gpt_4_agent.py | 27 ++++++++++++++++--- mm_agents/gui_som/READAME.md | 1 + 5 files changed, 51 insertions(+), 30 deletions(-) create mode 100644 mm_agents/gui_som/READAME.md diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index 1b77016..d58d08b 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -159,9 +159,19 @@ def get_open_tabs_info(env, config: Dict[str, str]): tabs_info = [] for context in browser.contexts: for page in context.pages: - title = page.title() - url = page.url - tabs_info.append({'title': title, 'url': url}) + try: + # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue + page.wait_for_load_state('load') # Wait for the 'load' event to complete + title = page.title() + url = page.url + tabs_info.append({'title': title, 'url': url}) + except TimeoutError: + # If page loading times out, catch the exception and store the current information in the list + tabs_info.append({'title': 'Load timeout', 'url': page.url}) + except Exception as e: + # Catch other potential exceptions that might occur while reading the page title + print(f'Error: {e}') + tabs_info.append({'title': 'Error encountered', 'url': page.url}) browser.close() return tabs_info diff --git a/experiment_pure_text.py b/experiment_pure_text.py index 4ab5c97..cfcbd46 100644 --- a/experiment_pure_text.py +++ b/experiment_pure_text.py @@ -111,21 +111,25 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr if __name__ == "__main__": action_space = "pyautogui" example_class = "chrome" - example_id = "06fe7178-4491-4589-810f-2e2bc9502122" + example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263" + gpt4_model = "gpt-4-1106-preview" + gemini_model = "gemini-pro-vision" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) example["snapshot"] = "exp_setup4" api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space) # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" - example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 10, example_trajectory_dir) + run_one_example(example, agent, 15, example_trajectory_dir) diff --git 
a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 7e4a74e..d6f83eb 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -3,8 +3,11 @@ import xml.etree.ElementTree as ET from PIL import Image, ImageDraw, ImageFont -def find_leaf_nodes(xml_file_path): - root = ET.fromstring(xml_file_path) +def find_leaf_nodes(xlm_file_str): + if not xlm_file_str: + return [] + + root = ET.fromstring(xlm_file_str) # Recursive function to traverse the XML tree and collect leaf nodes def collect_leaf_nodes(node, leaf_nodes): @@ -97,19 +100,3 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): # Save the result image.save(output_image_file_path) - - -if __name__ == '__main__': - with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f: - xml_string = f.read() - image_file_path = 'screenshot.png' # Replace with your actual screenshot image path - output_image_file_path = 'annotated_screenshot.png' # Replace with your desired output image path - - leaf_nodes = find_leaf_nodes(xml_string) - filtered_nodes = filter_nodes(leaf_nodes) - print(f"Found {len(filtered_nodes)} filtered nodes") - - for node in filtered_nodes: - print(node.tag, node.attrib) - - draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path) diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py index 57a1634..aa19185 100644 --- a/mm_agents/gpt_4_agent.py +++ b/mm_agents/gpt_4_agent.py @@ -61,11 +61,27 @@ def parse_code_from_string(input_string): # so the code inside backticks can span multiple lines. # matches now contains all the captured code snippets - return matches + + codes = [] + + for match in matches: + match = match.strip() + commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands + + if match in commands: + codes.append(match.strip()) + elif match.split('\n')[-1] in commands: + if len(match.split('\n')) > 1: + codes.append("\n".join(match.split('\n')[:-1])) + codes.append(match.split('\n')[-1]) + else: + codes.append(match) + + return codes class GPT4_Agent: - def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"): + def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"): self.instruction = instruction self.model = model self.max_tokens = max_tokens @@ -121,14 +137,17 @@ class GPT4_Agent: ] }) + # print( + # "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + # linearized_accessibility_tree) + # ) + traj_to_show = [] for i in range(len(self.trajectory)): traj_to_show.append(self.trajectory[i]["content"][0]["text"]) if len(self.trajectory[i]["content"]) > 1: traj_to_show.append("screenshot_obs") - print("Trajectory:", traj_to_show) - payload = { "model": self.model, "messages": self.trajectory, diff --git a/mm_agents/gui_som/READAME.md b/mm_agents/gui_som/READAME.md new file mode 100644 index 0000000..05b15ba --- /dev/null +++ b/mm_agents/gui_som/READAME.md @@ -0,0 +1 @@ +Deprecated since we found we can use `accelaerator` to do the same thing. But can be potentially used in the future when only access to screen is available. \ No newline at end of file
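For reference, the trailing-command handling added to `parse_code_from_string` in patch 6 means a model reply whose final fenced block ends with one of the special codes is split into two actions. A minimal illustration follows, assuming the unchanged regex in gpt_4_agent.py captures the contents of each triple-backtick block; the sample reply text and URL are hypothetical:

```python
from mm_agents.gpt_4_agent import parse_code_from_string

fence = "```"
reply = "\n".join([
    "The address bar is focused, so I will type the URL and finish.",
    fence,
    "import pyautogui",
    "pyautogui.typewrite('https://example.com', interval=0.05)",
    "pyautogui.press('enter')",
    "DONE",
    fence,
])

# Expected: the pyautogui snippet as one entry followed by "DONE", so the code is executed
# first and the special DONE command becomes its own step in env.step().
print(parse_code_from_string(reply))
```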