Add 'WAIT', 'FAIL', 'DONE' to the action space; Debug basic prompting-based GPT-4 and Gemini agents; Initialize experiments script;

2024-01-14 23:36:19 +08:00
parent d52b692ee5
commit f153a4c253
10 changed files with 482 additions and 144 deletions
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,12 +1,12 @@
-# fixme: Need to be rewrite on new action space
-
-import os
-import re
 import base64
-from desktop_env.envs.desktop_env import Action, MouseClick
 import json
+import re
+from typing import Dict
+
 import requests
-from mm_agents.gpt_4v_prompt import SYS_PROMPT
+
+from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
+from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE


 # Function to encode the image
@@ -47,11 +47,26 @@ def parse_actions_from_string(input_string):
                raise ValueError("Invalid response format: " + input_string)


+def parse_code_from_string(input_string):
+    # Extract every triple-backtick fenced snippet from `input_string`, e.g. ```code``` or
+    # ```python code```; the lazy `(.*?)` stops at the nearest closing fence.
+    pattern = r"```(?:\w+\s+)?(.*?)```"
+    # re.findall returns one string per fence (the single capture group), in order of appearance.
+    matches = re.findall(pattern, input_string, re.DOTALL)
+
+    # `re.DOTALL` lets `.` match newlines, so a snippet may span multiple lines.
+    # NOTE(review): the optional `(?:\w+\s+)?` is meant to skip a language tag, but it also
+    # swallows the first word of an untagged fence, e.g. "```x = 1```" yields "= 1".
+
+    # A list of the captured snippets; empty when the input contains no fenced block.
+    return matches
+
+
 class GPT4v_Agent:
-    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
-        self.instruction = instruction
+    def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
        self.model = model
        self.max_tokens = max_tokens
+        self.action_space = action_space

        self.headers = {
            "Content-Type": "application/json",
@@ -64,20 +79,27 @@ class GPT4v_Agent:
                "content": [
                    {
                        "type": "text",
-                        "text": SYS_PROMPT
+                        "text": {
+                            "computer_13": SYS_PROMPT_ACTION,
+                            "pyautogui": SYS_PROMPT_CODE
+                        }[action_space]
                    },
                ]
            }
        ]

-    def predict(self, obs):
-        base64_image = encode_image(obs)
+    def predict(self, obs: Dict):
+        """
+        Predict the next action(s) based on the current observation.
+        """
+        base64_image = encode_image(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
-                    "text": "What's the next step for instruction '{}'?".format(self.instruction)
+                    "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
+                        obs["instruction"])
                },
                {
                    "type": "image_url",
@@ -87,12 +109,15 @@ class GPT4v_Agent:
                }
            ]
        })
+
        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
            if len(self.trajectory[i]["content"]) > 1:
                traj_to_show.append("screenshot_obs")
+
        print("Trajectory:", traj_to_show)
+
        payload = {
            "model": self.model,
            "messages": self.trajectory,
@@ -103,6 +128,7 @@ class GPT4v_Agent:
        try:
            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
        except:
+            # todo: add error handling
            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
            actions = None

@@ -120,7 +146,10 @@ class GPT4v_Agent:
        """

        # parse from the response
-        actions = parse_actions_from_string(response)
+        if self.action_space == "computer_13":
+            actions = parse_actions_from_string(response)
+        elif self.action_space == "pyautogui":
+            actions = parse_code_from_string(response)

        # add action into the trajectory
        self.trajectory.append({
@@ -133,34 +162,4 @@ class GPT4v_Agent:
            ]
        })

-        # parse action
-        parsed_actions = []
-        for action in actions:
-            parsed_action = {}
-            action_type = Action[action['action_type']].value
-            parsed_action["action_type"] = action_type
-
-            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-                parsed_action["click_type"] = MouseClick[action['click_type']].value
-
-            if action_type == Action.MOUSE_MOVE.value:
-                parsed_action["x"] = action["x"]
-                parsed_action["y"] = action["y"]
-
-            if action_type == Action.KEY.value:
-                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys
-
-            if action_type == Action.TYPE.value:
-                parsed_action["text"] = action["text"]
-
-            parsed_actions.append(parsed_action)
-
-        return parsed_actions
-
-
-if __name__ == '__main__':
-    # OpenAI API Key
-    api_key = os.environ.get("OPENAI_API_KEY")
-
-    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
-    print(agent.predict(obs="stackoverflow.png"))
+        return actions