Initialize all baselines: screenshot, a11y tree, both, SoM, SeeAct

Timothyxxx
2024-01-20 00:13:46 +08:00
parent 46bd3386dd
commit 09f3e776ae
14 changed files with 2588 additions and 1208 deletions

View File

@@ -62,6 +62,8 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
env.controller.start_recording()
while not done and step_num < max_steps:
with open("accessibility_tree.xml", "w", encoding="utf-8") as f:
f.write(observation["accessibility_tree"])
actions = agent.predict(observation)
step_num += 1
for action in actions:
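
For context, here is a minimal sketch of the loop this hunk modifies; the `env`/`agent` APIs are inferred from the hunk itself and are assumptions, not the repo's exact code.

```python
# Sketch only: reconstructed from the hunk above with assumed env/agent APIs.
def run_one_example_sketch(example, agent, env, max_steps=10):
    observation = env.reset(example)  # assumed reset API
    done, step_num = False, 0
    env.controller.start_recording()
    while not done and step_num < max_steps:
        # New in this commit: dump the a11y tree each step for inspection
        with open("accessibility_tree.xml", "w", encoding="utf-8") as f:
            f.write(observation["accessibility_tree"])
        actions = agent.predict(observation)
        step_num += 1
        for action in actions:
            observation, reward, done, info = env.step(action)  # assumed step API
```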

View File

@@ -1,283 +0,0 @@
# FIXME: needs to be rewritten for the new action space
import os
import re
import base64
import PIL.Image
import json
import requests
import torch
import argparse
# seem
from seem.modeling.BaseModel import BaseModel as BaseModel_Seem
from seem.utils.distributed import init_distributed as init_distributed_seem
from seem.modeling import build_model as build_model_seem
from task_adapter.seem.tasks import inference_seem_pano
# semantic sam
from semantic_sam.BaseModel import BaseModel
from semantic_sam import build_model
from semantic_sam.utils.dist import init_distributed_mode
from semantic_sam.utils.arguments import load_opt_from_config_file
from semantic_sam.utils.constants import COCO_PANOPTIC_CLASSES
from task_adapter.semantic_sam.tasks import inference_semsam_m2m_auto, prompt_switch
# sam
from segment_anything import sam_model_registry
from task_adapter.sam.tasks.inference_sam_m2m_auto import inference_sam_m2m_auto
from scipy.ndimage import label
from io import BytesIO
import numpy as np
SYS_PROMPT = '''
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation in the form of an image: a screenshot of the computer screen. Based on this image, you will predict the next action on the computer.
First, predict the class of your action, selecting from the options below:
- **CLICK**: click on the screen with the specified integer label
- **TYPE**: type a string on the keyboard
- For CLICK, predict the correct integer label shown on the screenshot
For example, format it as:
```
{
"action_type": "CLICK",
"label": 7
}
```
- For TYPE, specify the text you want to type
For example, format it as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
At every step, return only the action_type and the parameters of your action as a dict, and nothing else. You MUST wrap the dict in backticks (\`).
You may plan multiple actions ahead, but return only one action per step.
You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will be penalized.
'''
# build args
semsam_cfg = "configs/semantic_sam_only_sa-1b_swinL.yaml"
seem_cfg = "configs/seem_focall_unicl_lang_v1.yaml"
semsam_ckpt = "./swinl_only_sam_many2many.pth"
sam_ckpt = "./sam_vit_h_4b8939.pth"
seem_ckpt = "./seem_focall_v1.pt"
opt_semsam = load_opt_from_config_file(semsam_cfg)
opt_seem = load_opt_from_config_file(seem_cfg)
opt_seem = init_distributed_seem(opt_seem)
# build model
model_semsam = BaseModel(opt_semsam, build_model(opt_semsam)).from_pretrained(semsam_ckpt).eval().cuda()
model_sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt).eval().cuda()
model_seem = BaseModel_Seem(opt_seem, build_model_seem(opt_seem)).from_pretrained(seem_ckpt).eval().cuda()
with torch.no_grad():
with torch.autocast(device_type='cuda', dtype=torch.float16):
model_seem.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(COCO_PANOPTIC_CLASSES + ["background"], is_eval=True)
@torch.no_grad()
def inference(image, slider, mode, alpha, label_mode, anno_mode, *args, **kwargs):
if slider < 1.5:
model_name = 'seem'
elif slider > 2.5:
model_name = 'sam'
else:
model_name = 'semantic-sam'
if slider < 1.5 + 0.14:
level = [1]
elif slider < 1.5 + 0.28:
level = [2]
elif slider < 1.5 + 0.42:
level = [3]
elif slider < 1.5 + 0.56:
level = [4]
elif slider < 1.5 + 0.70:
level = [5]
elif slider < 1.5 + 0.84:
level = [6]
else:
level = [6, 1, 2, 3, 4, 5]
if label_mode == 'Alphabet':
label_mode = 'a'
else:
label_mode = '1'
text_size, hole_scale, island_scale = 1280, 100, 100
text, text_part, text_thresh = '', '', '0.0'
with torch.autocast(device_type='cuda', dtype=torch.float16):
semantic = False
if model_name == 'semantic-sam':
model = model_semsam
output, mask = inference_semsam_m2m_auto(model, image, level, text, text_part, text_thresh, text_size, hole_scale, island_scale, semantic, label_mode=label_mode, alpha=alpha, anno_mode=anno_mode, *args, **kwargs)
elif model_name == 'sam':
model = model_sam
output, mask = inference_sam_m2m_auto(model, image, text_size, label_mode, alpha, anno_mode)
elif model_name == 'seem':
model = model_seem
output, mask = inference_seem_pano(model, image, text_size, label_mode, alpha, anno_mode)
return output, mask
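
The slider doubles as a model selector and a granularity control. A pure restatement of the model-selection thresholds above (my reading of the code, not something from the repo):

```python
def model_for_slider(slider: float) -> str:
    # < 1.5 -> SEEM; > 2.5 -> SAM; the 1.5-2.5 band selects Semantic-SAM,
    # subdivided into granularity levels 1-6 by the 0.14-wide sub-ranges.
    if slider < 1.5:
        return 'seem'
    if slider > 2.5:
        return 'sam'
    return 'semantic-sam'

assert model_for_slider(3.0) == 'sam'  # the setting GPT4v_Agent.predict uses below
```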
# Function to encode the image
def encode_image(image):
pil_img = PIL.Image.fromarray(image)
buff = BytesIO()
pil_img.save(buff, format="JPEG")
new_image_string = base64.b64encode(buff.getvalue()).decode("utf-8")
return new_image_string
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
actions = []
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Parse each matched JSON block into a dictionary
try:
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Parse each matched JSON block into a dictionary
try:
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
try:
action_dict = json.loads(input_string)
return [action_dict]
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
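
A quick illustration of the parser's contract on an invented model reply:

```python
# The reply wraps a JSON dict in a ```json fenced block, as the prompt demands.
reply = '```json\n{"action_type": "CLICK", "label": 7}\n```'
print(parse_actions_from_string(reply))
# -> [{'action_type': 'CLICK', 'label': 7}]
```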
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
self.model = model
self.max_tokens = max_tokens
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
"text": SYS_PROMPT
},
]
}
]
def predict(self, obs):
obs, mask = inference(obs, slider=3.0, mode="Automatic", alpha=0.1, label_mode="Number", anno_mode=["Mark", "Box"])
PIL.Image.fromarray(obs).save("desktop.jpeg")
base64_image = encode_image(obs)
self.trajectory.append({
"role": "user",
"content": [
{
"type": "text",
"text": "What's the next step for instruction '{}'?".format(self.instruction)
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}"
}
}
]
})
traj_to_show = []
for i in range(len(self.trajectory)):
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
if len(self.trajectory[i]["content"]) > 1:
traj_to_show.append("screenshot_obs")
print("Trajectory:", traj_to_show)
payload = {
"model": self.model,
"messages": self.trajectory,
"max_tokens": self.max_tokens
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'], mask)
except Exception:
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None
return actions
def parse_actions(self, response: str, mask):
# response example
"""
```json
{
"action_type": "CLICK",
"click_type": "RIGHT"
}
```
"""
# parse from the response
actions = parse_actions_from_string(response)
print(actions)
# add action into the trajectory
self.trajectory.append({
"role": "assistant",
"content": [
{
"type": "text",
"text": response
},
]
})
# parse action
parsed_actions = []
for action in actions:
action_type = action['action_type']
if action_type == "CLICK":
label = int(action['label'])
x, y, w, h = mask[label-1]['bbox']
parsed_actions.append({"action_type": action_type, "x": int(x + w//2) , "y": int(y + h//2)})
if action_type == "TYPE":
parsed_actions.append({"action_type": action_type, "text": action["text"]})
return parsed_actions
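
A worked example of the CLICK grounding above, using a hypothetical mask entry: label 1 with bbox (x=40, y=100, w=200, h=60) resolves to the centre of the box.

```python
mask = [{'bbox': (40, 100, 200, 60)}]    # hypothetical SoM mask list
x, y, w, h = mask[1 - 1]['bbox']         # label 1 -> index 0
print(int(x + w // 2), int(y + h // 2))  # 140 130
```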
if __name__ == '__main__':
# OpenAI API Key
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Open Firefox")
obs = PIL.Image.open('desktop.png')
print(agent.predict(obs=obs))

View File

@@ -41,10 +41,12 @@ def filter_nodes(nodes):
elif node.tag == 'text':
continue
else:
coords = tuple(
map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord').strip('()').split(', ')))
if coords[0] < 0 or coords[1] < 0:
continue
size = tuple(
map(int, node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size').strip('()').split(', ')))
if size[0] <= 0 or size[1] <= 0:
continue
# Node is not a 'panel', add to the list.
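
The AT-SPI attributes parsed here are plain strings such as "(128, 64)"; the same idiom as above, shown on a literal for clarity:

```python
raw = "(128, 64)"  # e.g. a screencoord attribute value
coords = tuple(map(int, raw.strip('()').split(', ')))
assert coords == (128, 64)
```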
@@ -57,6 +59,9 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
# Load the screenshot image
image = Image.open(image_file_path)
draw = ImageDraw.Draw(image)
marks = []
# TODO: change the image tagger to align with the SoM paper
# Optional: Load a font. If you don't specify a font, a default one will be used.
try:
@@ -95,8 +100,26 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right
draw.text(text_position, str(index), font=font, fill="purple")
# each mark is an x, y, w, h tuple
marks.append([coords[0], coords[1], size[0], size[1]])
except ValueError as e:
pass
# Save the result
image.save(output_image_file_path)
return marks
def print_nodes_with_indent(nodes, indent=0):
for node in nodes:
print(' ' * indent, node.tag, node.attrib)
print_nodes_with_indent(node, indent + 2)
if __name__ == '__main__':
with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f:
xml_file_str = f.read()
nodes = ET.fromstring(xml_file_str)
print_nodes_with_indent(nodes)

View File

@@ -1,195 +0,0 @@
import base64
import json
import re
import time
from typing import Dict, List
import requests
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
# Function to encode the image
def encode_image(image_path):
with open(image_path, "rb") as image_file:
return base64.b64encode(image_file.read()).decode('utf-8')
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
actions = []
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Parse each matched JSON block into a dictionary
try:
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Parse each matched JSON block into a dictionary
try:
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
try:
action_dict = json.loads(input_string)
return [action_dict]
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
def parse_code_from_string(input_string):
# This regular expression will match both ```code``` and ```python code```
# and capture the `code` part. It uses a non-greedy match for the content inside.
pattern = r"```(?:\w+\s+)?(.*?)```"
# Find all non-overlapping matches in the string
matches = re.findall(pattern, input_string, re.DOTALL)
# The regex above captures the content inside the triple backticks.
# The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
# so the code inside backticks can span multiple lines.
# matches now contains all the captured code snippets
codes = []
for match in matches:
match = match.strip()
commands = ['WAIT', 'DONE', 'FAIL']  # FIXME: update this list when more commands are added
if match in commands:
codes.append(match.strip())
elif match.split('\n')[-1] in commands:
if len(match.split('\n')) > 1:
codes.append("\n".join(match.split('\n')[:-1]))
codes.append(match.split('\n')[-1])
else:
codes.append(match)
return codes
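
To make the special-code handling concrete, an invented reply whose code block ends in DONE is split into the code and the sentinel:

```python
reply = "```python\npyautogui.click(10, 20)\nDONE\n```"
print(parse_code_from_string(reply))
# -> ['pyautogui.click(10, 20)', 'DONE']
print(parse_code_from_string("```WAIT```"))
# -> ['WAIT']
```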
class GPT4_Agent:
def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"):
self.instruction = instruction
self.model = model
self.max_tokens = max_tokens
self.action_space = action_space
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
"text": {
"computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE
}[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
},
]
}
]
def predict(self, obs: Dict) -> List:
"""
Predict the next action(s) based on the current observation.
"""
accessibility_tree = obs["accessibility_tree"]
leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(leaf_nodes)
linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
linearized_accessibility_tree += node.attrib.get(
'{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
self.trajectory.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
}
]
})
# print(
# "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
# linearized_accessibility_tree)
# )
traj_to_show = []
for i in range(len(self.trajectory)):
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
if len(self.trajectory[i]["content"]) > 1:
traj_to_show.append("screenshot_obs")
payload = {
"model": self.model,
"messages": self.trajectory,
"max_tokens": self.max_tokens
}
while True:
try:
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers,
json=payload)
break
except Exception:
print("Failed to generate response, retrying...")
time.sleep(5)
pass
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
except Exception:
print("Failed to parse action from response:", response.json())
actions = None
return actions
def parse_actions(self, response: str):
# parse from the response
if self.action_space == "computer_13":
actions = parse_actions_from_string(response)
elif self.action_space == "pyautogui":
actions = parse_code_from_string(response)
else:
raise ValueError("Invalid action space: " + self.action_space)
# add action into the trajectory
self.trajectory.append({
"role": "assistant",
"content": [
{
"type": "text",
"text": response
},
]
})
return actions

View File

@@ -1,244 +0,0 @@
SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation of the desktop as an accessibility tree in XML format, based on the AT-SPI library. Based on this tree, you will predict the next action on the computer.
HERE is the description of the action space you must predict from; follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
First predict the class of your action, then predict its parameters:
- For MOUSE_MOVE, predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
For example, format it as:
```
{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], also specify the click_type, selected from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], meaning the left button, middle button, right button, wheel up, or wheel down of your mouse:
For example, format it as:
```
{
"action_type": "CLICK",
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP], choose one or more keys from the keyboard
For example, format it as:
```
{
"action_type": "KEY",
"key": "ctrl+c"
}
```
- For TYPE, specify the text you want to type
For example, format it as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
REMEMBER:
At every step, RETURN ONLY the action_type AND parameters I asked for. NEVER return anything else.
You MUST wrap the dict in backticks (\`).
You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will be penalized.
You MAY plan multiple actions ahead, but return only one action per step.
"""

View File

@@ -1,18 +0,0 @@
SYS_PROMPT = """
You are an agent that follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
At each step, you will receive an observation of the desktop as an accessibility tree in XML format, based on the AT-SPI library. Based on this tree, you will predict the next action on the computer.
You are required to use `pyautogui` to perform the action.
Return one or more lines of Python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you may also return one of the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL``` (do not say ```FAIL``` lightly; try your best to do the task);
When you think the task is done, return ```DONE```.
First reflect on the current observation and what we have done so far, then RETURN ONLY THE CODE OR SPECIAL CODE I ASKED FOR. NEVER return anything else.
"""

View File

@@ -1,14 +1,27 @@
import base64
import json
import os
import re
import time
import uuid
from typing import Dict, List
import backoff
import requests
from openai.error import (
APIConnectionError,
APIError,
RateLimitError,
ServiceUnavailableError,
InvalidRequestError
)
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
SYS_PROMPT_IN_A11Y_OUT_CODE, SYS_PROMPT_IN_A11Y_OUT_ACTION, \
SYS_PROMPT_IN_BOTH_OUT_CODE, SYS_PROMPT_IN_BOTH_OUT_ACTION, \
SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT
# Function to encode the image
@@ -17,6 +30,35 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode('utf-8')
def linearize_accessibility_tree(accessibility_tree):
leaf_nodes = find_leaf_nodes(accessibility_tree)
filtered_nodes = filter_nodes(leaf_nodes)
linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
# Linearize the accessibility tree nodes into a table format
for node in filtered_nodes:
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
linearized_accessibility_tree += node.attrib.get(
'{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
return linearized_accessibility_tree
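
A self-contained illustration of one linearised row on a fake node (namespace and attribute keys copied from the function above; values invented). Note that a node without a name attribute would make the real concatenation raise a TypeError, since attrib.get() returns None.

```python
import xml.etree.ElementTree as ET

node = ET.Element('push-button', {
    'name': 'Firefox',
    '{uri:deskat:component.at-spi.gnome.org}screencoord': '(65, 14)',
    '{uri:deskat:component.at-spi.gnome.org}size': '(96, 96)',
})
row = "\t".join([
    node.tag,
    node.attrib.get('name'),
    node.attrib.get('{uri:deskat:component.at-spi.gnome.org}screencoord'),
    node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size'),
])
print(row)  # push-button	Firefox	(65, 14)	(96, 96)
```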
def tag_screenshot(screenshot, accessibility_tree):
# Create a temp file with a random name to store the tagged screenshot
uuid_str = str(uuid.uuid4())
os.makedirs("tmp/images", exist_ok=True)
tagged_screenshot_file_path = os.path.join("tmp/images", uuid_str + ".png")
nodes = filter_nodes(find_leaf_nodes(accessibility_tree))
# Draw the marks onto the screenshot to produce the tagged version
marks = draw_bounding_boxes(nodes, screenshot, tagged_screenshot_file_path)
return marks, tagged_screenshot_file_path
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
actions = []
@@ -61,124 +103,295 @@ def parse_code_from_string(input_string):
# so the code inside backticks can span multiple lines.
# matches now contains all the captured code snippets
codes = []
for match in matches:
match = match.strip()
commands = ['WAIT', 'DONE', 'FAIL']  # FIXME: update this list when more commands are added
if match in commands:
codes.append(match.strip())
elif match.split('\n')[-1] in commands:
if len(match.split('\n')) > 1:
codes.append("\n".join(match.split('\n')[:-1]))
codes.append(match.split('\n')[-1])
else:
codes.append(match)
return codes
def parse_code_from_som_string(input_string, masks):
for i, mask in enumerate(masks):
x, y, w, h = mask
input_string = input_string.replace("tag#" + str(i), "{}, {}".format(int(x + w // 2), int(y + h // 2)))
return parse_code_from_string(input_string)
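
A worked example of the tag substitution with a hypothetical one-mark mask list: mark 0 has bbox (x=10, y=20, w=100, h=40), so tag#0 grounds to its centre (60, 40). One caveat of the plain str.replace loop: with ten or more marks, replacing "tag#1" also rewrites the prefix of "tag#10", so replacing indices in descending order (or matching on a word boundary) would be safer.

```python
masks = [[10, 20, 100, 40]]  # hypothetical x, y, w, h from tag_screenshot
reply = "```python\npyautogui.click(tag#0)\n```"
print(parse_code_from_som_string(reply, masks))
# -> ['pyautogui.click(60, 40)']
```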
class GPT4v_Agent:
def __init__(
self,
api_key,
instruction,
model="gpt-4-vision-preview",
max_tokens=300,
action_space="computer_13",
exp="screenshot_a11y_tree"
# exp can be in ["screenshot", "a11y_tree", "screenshot_a11y_tree", "som", "seeact"]
):
self.instruction = instruction
self.model = model
self.max_tokens = max_tokens
self.action_space = action_space
self.exp = exp
self.headers = {
"Content-Type": "application/json",
"Authorization": f"Bearer {api_key}"
}
self.actions = []
self.observations = []
if exp == "screenshot":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_SCREENSHOT_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "a11y_tree":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_A11Y_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "both":
if action_space == "computer_13":
self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_BOTH_OUT_CODE
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "som":
if action_space == "computer_13":
raise ValueError("Invalid action space: " + action_space)
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_IN_SOM_A11Y_OUT_TAG
else:
raise ValueError("Invalid action space: " + action_space)
elif exp == "seeact":
if action_space == "computer_13":
raise ValueError("Invalid action space: " + action_space)
elif action_space == "pyautogui":
self.system_message = SYS_PROMPT_SEEACT
else:
raise ValueError("Invalid action space: " + action_space)
else:
raise ValueError("Invalid experiment type: " + exp)
self.system_message = (self.system_message +
"\nHere is the instruction for the task: {}".format(self.instruction))
def predict(self, obs: Dict) -> List:
"""
Predict the next action(s) based on the current observation.
"""
# Prepare the payload for the API call
messages = []
linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
# Linearize the accessibility tree nodes into a table format
if len(self.actions) > 0:
system_message = self.system_message + "\nHere are the actions you have done so far:\n" + "\n->\n".join(
map(str, self.actions))  # str() because each entry is itself a list of actions
else:
system_message = self.system_message
messages.append({
"role": "system",
"content": [
{
"type": "text",
"text": "What's the next step that you will do to help with the task?" if not self.add_a11y_tree
else "And given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(linearized_accessibility_tree)
"text": system_message
},
]
})
masks = None
print("Trajectory:", traj_to_show)
if self.exp in ["screenshot", "both"]:
base64_image = encode_image(obs["screenshot"])
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the screenshot as below. What's the next step that you will do to help with the task?"
if self.exp == "screenshot"
else "Given the screenshot and info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
})
elif self.exp == "a11y_tree":
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the info from accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
}
]
})
elif self.exp == "som":
# Add som to the screenshot
masks, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "Given the info from the tagged screenshot as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
linearized_accessibility_tree)
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
})
elif self.exp == "seeact":
# Add som to the screenshot
masks, tagged_screenshot = tag_screenshot(obs["screenshot"], obs["accessibility_tree"])
base64_image = encode_image(tagged_screenshot)
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": ACTION_DESCRIPTION_PROMPT_SEEACT.format(linearized_accessibility_tree)
},
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"detail": "high"
}
}
]
})
else:
raise ValueError("Invalid experiment type: " + self.exp)
response = self.call_llm({
"model": self.model,
"messages": self.trajectory,
"messages": messages,
"max_tokens": self.max_tokens
}
})
if self.exp == "seeact":
messages.append({
"role": "assistant",
"content": [
{
"type": "text",
"text": response
}
]
})
messages.append({
"role": "user",
"content": [
{
"type": "text",
"text": "{}\n\nWhat's the next step that you will do to help with the task?".format(
ACTION_GROUNDING_PROMPT_SEEACT)
}
]
})
response = self.call_llm({
"model": self.model,
"messages": messages,
"max_tokens": self.max_tokens
})
try:
actions = self.parse_actions(response, masks)
except Exception as e:
print("Failed to parse action from response", e)
actions = None
return actions
@backoff.on_exception(
backoff.expo,
(APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError),
)
def call_llm(self, payload):
while True:
try:
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers,
json=payload)
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload
)
break
except Exception:
# Note: the backoff decorator above targets openai.error exceptions;
# transport errors raised by requests are retried by this simple loop.
print("Failed to generate response, retrying...")
time.sleep(5)
return response.json()['choices'][0]['message']['content']
def parse_actions(self, response: str, masks=None):
if self.exp in ["screenshot", "a11y_tree", "both"]:
# parse from the response
if self.action_space == "computer_13":
actions = parse_actions_from_string(response)
elif self.action_space == "pyautogui":
actions = parse_code_from_string(response)
else:
raise ValueError("Invalid action space: " + self.action_space)
self.actions.append(actions)
return actions
elif self.exp in ["som", "seeact"]:
# parse from the response
if self.action_space == "computer_13":
raise ValueError("Invalid action space: " + self.action_space)
elif self.action_space == "pyautogui":
actions = parse_code_from_som_string(response, masks)
else:
raise ValueError("Invalid action space: " + self.action_space)
self.actions.append(actions)
return actions
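
Putting the refactor together, a hypothetical driver for the new constructor (argument values invented; obs["screenshot"] is a file path here, since encode_image and tag_screenshot both read the image from disk):

```python
import os

agent = GPT4v_Agent(
    api_key=os.environ["OPENAI_API_KEY"],
    instruction="Open Firefox",
    action_space="pyautogui",
    exp="som",  # one of: "screenshot", "a11y_tree", "both", "som", "seeact"
)
obs = {
    "screenshot": "screenshot.png",                 # path on disk (assumed)
    "accessibility_tree": open("a11y.xml").read(),  # AT-SPI XML dump (assumed)
}
print(agent.predict(obs))  # e.g. ['pyautogui.click(60, 40)']
```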

View File

@@ -1,244 +0,0 @@
SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation in the form of an image: a screenshot of the computer screen. Based on this image, you will predict the next action on the computer.
HERE is the description of the action space you must predict from; follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
First predict the class of your action, then predict its parameters:
- For MOUSE_MOVE, predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
For example, format it as:
```
{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], also specify the click_type, selected from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], meaning the left button, middle button, right button, wheel up, or wheel down of your mouse:
For example, format it as:
```
{
"action_type": "CLICK",
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP], choose one or more keys from the keyboard
For example, format it as:
```
{
"action_type": "KEY",
"key": "ctrl+c"
}
```
- For TYPE, specify the text you want to type
For example, format it as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
REMEMBER:
At every step, RETURN ONLY the action_type AND parameters I asked for. NEVER return anything else.
You MUST wrap the dict in backticks (\`).
You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will be penalized.
You MAY plan multiple actions ahead, but return only one action per step.
"""

View File

@@ -1,18 +0,0 @@
SYS_PROMPT = """
You are an agent that follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
At each step, you will receive an observation in the form of an image: a screenshot of the computer screen. Based on this image, you will predict the next action on the computer.
You are required to use `pyautogui` to perform the action.
Return one or more lines of Python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you may also return one of the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL``` (do not say ```FAIL``` lightly; try your best to do the task);
When you think the task is done, return ```DONE```.
First reflect on the current screenshot and what we have done so far, then RETURN ONLY THE CODE OR SPECIAL CODE I ASKED FOR. NEVER return anything else.
"""

mm_agents/prompts.py (new file, 862 lines)
View File

@@ -0,0 +1,862 @@
SYS_PROMPT_IN_SCREENSHOT_OUT_CODE = """
You are an agent that follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
At each step, you will receive an observation in the form of an image: a screenshot of the computer screen. Based on this image, you will predict the next action on the computer.
You are required to use `pyautogui` to perform the action.
Return one or more lines of Python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you may also return one of the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL``` (do not say ```FAIL``` lightly; try your best to do the task);
When you think the task is done, return ```DONE```.
First reflect on the current screenshot and what we have done so far, then RETURN ONLY THE CODE OR SPECIAL CODE I ASKED FOR. NEVER return anything else.
""".strip()
SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation in the form of an image: a screenshot of the computer screen. Based on this image, you will predict the next action on the computer.
HERE is the description of the action space you must predict from; follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
First predict the class of your action, then predict its parameters:
- For MOUSE_MOVE, predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
For example, format it as:
```
{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], also specify the click_type, selected from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], meaning the left button, middle button, right button, wheel up, or wheel down of your mouse:
For example, format it as:
```
{
"action_type": "CLICK",
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP], choose one or more keys from the keyboard
For example, format it as:
```
{
"action_type": "KEY",
"key": "ctrl+c"
}
```
- For TYPE, specify the text you want to type
For example, format it as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
REMEMBER:
At every step, RETURN ONLY the action_type AND parameters I asked for. NEVER return anything else.
You MUST wrap the dict in backticks (\`).
You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will be penalized.
You MAY plan multiple actions ahead, but return only one action per step.
""".strip()
SYS_PROMPT_IN_A11Y_OUT_CODE = """
You are an agent that follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
At each step, you will receive an observation of the desktop as an accessibility tree, based on the AT-SPI library. Based on this tree, you will predict the next action on the computer.
You are required to use `pyautogui` to perform the action.
Return one or more lines of Python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you may also return one of the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL``` (do not say ```FAIL``` lightly; try your best to do the task);
When you think the task is done, return ```DONE```.
First reflect on the current observation and what we have done so far, then RETURN ONLY THE CODE OR SPECIAL CODE I ASKED FOR. NEVER return anything else.
""".strip()
SYS_PROMPT_IN_A11Y_OUT_ACTION = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation of the desktop as an accessibility tree, based on the AT-SPI library. Based on this tree, you will predict the next action on the computer.
HERE is the description of the action space you must predict from; follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
First predict the class of your action, then predict its parameters:
- For MOUSE_MOVE, predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
For example, format it as:
```
{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], also specify the click_type, selected from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], meaning the left button, middle button, right button, wheel up, or wheel down of your mouse:
For example, format it as:
```
{
"action_type": "CLICK",
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP], choose one or more keys from the keyboard
For example, format it as:
```
{
"action_type": "KEY",
"key": "ctrl+c"
}
```
- For TYPE, specify the text you want to type
For example, format it as:
```
{
"action_type": "TYPE",
"text": "hello world"
}
```
REMEMBER:
At every step, RETURN ONLY the action_type AND parameters I asked for. NEVER return anything else.
You MUST wrap the dict in backticks (\`).
You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will be penalized.
You MAY plan multiple actions ahead, but return only one action per step.
""".strip()
SYS_PROMPT_IN_BOTH_OUT_CODE = """
You are an agent that follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
At each step, you will receive an observation of the desktop as 1) a screenshot and 2) an accessibility tree based on the AT-SPI library.
Based on the screenshot and the accessibility tree, you will predict the next action on the computer.
You are required to use `pyautogui` to perform the action.
Return one or more lines of Python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you may also return one of the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL``` (do not say ```FAIL``` lightly; try your best to do the task);
When you think the task is done, return ```DONE```.
First reflect on the current screenshot and what we have done so far, then RETURN ONLY THE CODE OR SPECIAL CODE I ASKED FOR. NEVER return anything else.
""".strip()
SYS_PROMPT_IN_BOTH_OUT_ACTION = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
At each step, you will receive an observation of the desktop as 1) a screenshot and 2) an accessibility tree based on the AT-SPI library.
Based on the screenshot and the accessibility tree, you will predict the next action on the computer.
HERE is the description of the action space you must predict from; follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
},
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
},
"num_clicks": {
"type": int,
"range": [1, 2, 3],
"optional": True,
},
}
},
{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
"parameters": {
"button": {
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
}
}
},
{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": True,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": True,
}
}
},
{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
"parameters": {
"x": {
"type": float,
"range": [0, X_MAX],
"optional": False,
},
"y": {
"type": float,
"range": [0, Y_MAX],
"optional": False,
}
}
},
{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
"parameters": {
"dx": {
"type": int,
"range": None,
"optional": False,
},
"dy": {
"type": int,
"range": None,
"optional": False,
}
}
},
{
"action_type": "TYPING",
"note": "type the specified text",
"parameters": {
"text": {
"type": str,
"range": None,
"optional": False,
}
}
},
{
"action_type": "PRESS",
"note": "press the specified key and release it",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_DOWN",
"note": "press the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "KEY_UP",
"note": "release the specified key",
"parameters": {
"key": {
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
}
}
},
{
"action_type": "HOTKEY",
"note": "press the specified key combination",
"parameters": {
"keys": {
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
}
}
},
############################################################################################################
{
"action_type": "WAIT",
"note": "wait until the next action",
},
{
"action_type": "FAIL",
"note": "decide the task can not be performed",
},
{
"action_type": "DONE",
"note": "decide the task is done",
}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
- For MOVE_TO and DRAG_TO, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (X_MAX, Y_MAX), e.g. (1920, 1080)
for example, format as:
```
{
    "action_type": "MOVE_TO",
    "x": 1319.11,
    "y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you can specify the button as well, selecting from ["left", "middle", "right"]; for CLICK you can additionally specify x, y and num_clicks:
for example, format as:
```
{
    "action_type": "CLICK",
    "button": "left",
    "x": 1319.11,
    "y": 65.06,
    "num_clicks": 1
}
```
- For [PRESS, KEY_DOWN, KEY_UP], you need to choose one key from KEYBOARD_KEYS; for HOTKEY, provide the list of keys to press together
for example, format as:
```
{
    "action_type": "HOTKEY",
    "keys": ["ctrl", "c"]
}
```
- For TYPING, you need to specify the text you want to type
for example, format as:
```
{
    "action_type": "TYPING",
    "text": "hello world"
}
```
REMEMBER:
For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
You MUST wrap the dict with backticks (\`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered invalid and you will get a penalty.
You CAN plan multiple actions ahead, but you should only return one action for each step.
""".strip()
SYS_PROMPT_IN_SOM_A11Y_OUT_TAG = """
You are an agent which follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
For each step, you will get an observation of the desktop by 1) a screenshot; and 2) an accessibility tree, which is based on the AT-SPI library.
You are required to use `pyautogui` to perform the action, but replace the x, y coordinates in the code with the tag of the element you want to operate on, such as:
```python
pyautogui.moveTo(tag#3)
pyautogui.click(tag#2)
pyautogui.dragTo(tag#1, button='left')
```
Return one or multiple lines of python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you are allowed to return the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL```; do not return ```FAIL``` lightly, try your best to complete the task first;
When you think the task is done, return ```DONE```.
First reflect on the current screenshot and the previous actions we took, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
SYS_PROMPT_SEEACT = """
You are an agent which follows my instructions and performs desktop computer tasks as instructed.
You have good knowledge of computers and a good internet connection, and you can assume your code will run on a computer that controls the mouse and keyboard.
For each step, you will get an observation of an image, which is the screenshot of the computer screen, and you will predict the action to take based on the image.
""".strip()
ACTION_DESCRIPTION_PROMPT_SEEACT = """
The text and image shown below are the observation of the desktop, consisting of 1) a screenshot; and 2) an accessibility tree, which is based on the AT-SPI library.
{}
Follow the guidance below to think step by step before outlining the next action at the current stage:
(Current Screenshot Identification)
Firstly, think about what the current screenshot shows.
(Previous Action Analysis)
Secondly, in combination with the screenshot, analyze each step of the previous action history and its intention one by one. In particular, pay more attention to the last step, which is likely most related to what you should do now as the next step.
(Screenshot Details Analysis)
Closely examine the screenshot to check the status of every part of the page and understand what you can operate on and what has already been set or completed. Even though you are given the textual previous actions, you should closely inspect the screenshot details to see which steps have actually been completed, because the textual history may not clearly or sufficiently record the effects of previous actions.
(Next Action Based on Screenshot and Analysis)
Then, based on your analysis, in conjunction with human desktop usage habits and the logic of app GUI design, decide on the next action. Clearly outline which element in the screenshot the user should operate on as the next target, its detailed location, and the corresponding operation.
"""
ACTION_GROUNDING_PROMPT_SEEACT = """
You are required to use `pyautogui` to perform the action, but replace the x, y coordinates in the code with the tag of the element you want to operate on, such as:
```python
pyautogui.moveTo(tag#3)
pyautogui.click(tag#2)
pyautogui.dragTo(tag#1, button='left')
```
Return one or multiple lines of python code to perform the action each time, and be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
In addition, you are allowed to return the following special codes:
When you think you have to wait for some time, return ```WAIT```;
When you think the task cannot be done, return ```FAIL```; do not return ```FAIL``` lightly, try your best to complete the task first;
When you think the task is done, return ```DONE```.
First reflect on the current screenshot and the previous actions we took, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
"""

View File

@@ -1,124 +0,0 @@
import torch
from PIL import Image
import requests
from transformers import SamModel, SamProcessor
import numpy as np
import matplotlib.pyplot as plt
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
def show_mask(mask, ax, random_color=False):
if random_color:
color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)
else:
color = np.array([30 / 255, 144 / 255, 255 / 255, 0.6])
h, w = mask.shape[-2:]
mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)
ax.imshow(mask_image)
def show_box(box, ax):
x0, y0 = box[0], box[1]
w, h = box[2] - box[0], box[3] - box[1]
ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0, 0, 0, 0), lw=2))
def show_boxes_on_image(raw_image, boxes):
plt.figure(figsize=(10, 10))
plt.imshow(raw_image)
for box in boxes:
show_box(box, plt.gca())
plt.axis('on')
plt.show()
def show_points_on_image(raw_image, input_points, input_labels=None):
plt.figure(figsize=(10, 10))
plt.imshow(raw_image)
input_points = np.array(input_points)
if input_labels is None:
labels = np.ones_like(input_points[:, 0])
else:
labels = np.array(input_labels)
show_points(input_points, labels, plt.gca())
plt.axis('on')
plt.show()
def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
plt.figure(figsize=(10, 10))
plt.imshow(raw_image)
input_points = np.array(input_points)
if input_labels is None:
labels = np.ones_like(input_points[:, 0])
else:
labels = np.array(input_labels)
show_points(input_points, labels, plt.gca())
for box in boxes:
show_box(box, plt.gca())
plt.axis('on')
plt.show()
def show_points(coords, labels, ax, marker_size=375):
pos_points = coords[labels == 1]
neg_points = coords[labels == 0]
ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white',
linewidth=1.25)
ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white',
linewidth=1.25)
def show_masks_on_image(raw_image, masks, scores):
if len(masks.shape) == 4:
masks = masks.squeeze()
if scores.shape[0] == 1:
scores = scores.squeeze()
nb_predictions = scores.shape[-1]
    fig, axes = plt.subplots(1, nb_predictions, figsize=(15, 15))
    axes = np.atleast_1d(axes)  # with a single prediction, subplots returns a bare Axes
for i, (mask, score) in enumerate(zip(masks, scores)):
mask = mask.cpu().detach()
axes[i].imshow(np.array(raw_image))
show_mask(mask, axes[i])
axes[i].title.set_text(f"Mask {i + 1}, Score: {score.item():.3f}")
axes[i].axis("off")
plt.show()
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert("RGB")
plt.imshow(raw_image)
inputs = processor(raw_image, return_tensors="pt").to(device)
with torch.no_grad():
outputs = model(**inputs)
masks = processor.image_processor.post_process_masks(
outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
)
scores = outputs.iou_scores
show_masks_on_image(raw_image, masks[0], scores)

1405
mm_agents/visualizer.py Normal file

File diff suppressed because it is too large

View File

@@ -32,3 +32,4 @@ librosa
pymupdf
chardet
playwright
backoff