Run through gpt_4v agent pipeline

2023-11-29 20:21:57 +08:00
parent 28c6edd6b3
commit 3d0d9d7758
8 changed files with 135 additions and 47 deletions
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,9 +1,10 @@
 import os
+import re
 import base64
 from desktop_env.envs.desktop_env import Action, MouseClick
-import json5
+import json
 import requests
-
+from mm_agents.gpt_4v_prompt import SYS_PROMPT

 # Function to encode the image
 def encode_image(image_path):
@@ -11,6 +12,32 @@ def encode_image(image_path):
        return base64.b64encode(image_file.read()).decode('utf-8')


+def parse_action_from_string(input_string):
+    """Extract the JSON action from a model response string.
+
+    The payload is looked for, in order: inside a ```json fenced block,
+    inside a plain ``` fenced block, and finally as the raw string itself.
+    If several fences of the winning kind are present, the first is used.
+
+    :param input_string: raw text returned by the model
+    :return: the decoded JSON value (expected to be the action dict)
+    :raises ValueError: if the selected candidate is not valid JSON
+    """
+    candidate = input_string
+    for pattern in (r'```json\s+(.*?)\s+```', r'```\s+(.*?)\s+```'):
+        matches = re.findall(pattern, input_string, re.DOTALL)
+        if matches:
+            candidate = matches[0]
+            break
+
+    try:
+        return json.loads(candidate)
+    except json.JSONDecodeError as e:
+        # Always raise: returning an error message here would let the caller
+        # store a plain string as if it were a parsed action.
+        raise ValueError("Invalid response format: " + input_string) from e
+
 class GPT4v_Agent:
    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
        self.instruction = instruction
@@ -22,18 +49,13 @@ class GPT4v_Agent:
            "Authorization": f"Bearer {api_key}"
        }

-        # load prompt from file
-        self.prompt = ""
-        with open("gpt_4v_prompt.txt", "r") as f:
-            self.prompt = f.read()
-
        self.trajectory = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
-                        "text": self.prompt
+                        "text": SYS_PROMPT
                    },
                ]
            }
@@ -79,12 +101,7 @@ class GPT4v_Agent:
        """

        # parse from the response
-        if response.startswith("```json"):
-            action = json5.loads(response[7:-3])
-        elif response.startswith("```"):
-            action = json5.loads(response[3:-3])
-        else:
-            action = json5.loads(response)
+        action = parse_action_from_string(response)

        # add action into the trajectory
        self.trajectory.append({
--- a/mm_agents/gpt_4v_prompt.txt
+++ b/mm_agents/gpt_4v_prompt.txt
@@ -1,3 +1,4 @@
+SYS_PROMPT = """
 You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
 For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
 Here is the description of the action space:
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
 - **TYPE**: type a string on the keyboard

 Then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
+- For MOUSE_MOVE, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner of the screen is (1920, 1080)
 for example, format as:
 ```
 {
@@ -30,7 +31,7 @@ for example, format as:
  "click_type": "LEFT"
 }
 ```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
+- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose one or more keys from the keyboard
 for example, format as:
 ```
 {
@@ -49,4 +50,6 @@ for example, format as:
 }
 ```

-For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
+For every step, you should only return the action_type and the parameters of your action as a dict, without anything else. You MUST wrap the dict in a code block fenced with triple backticks (```).
+You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
+"""