Add 'WAIT', 'FAIL', 'DONE' to the action space; Debug basic prompting-based GPT-4 and Gemini agents; Initialize experiments script;
mm_agents/gemini_agent.py (new file, 84 lines)
@@ -0,0 +1,84 @@
from typing import Dict

import PIL.Image
import google.generativeai as genai

from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE


class GeminiPro_Agent:
    def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"):
        genai.configure(api_key=api_key)  # `api_key` is a keyword-only argument of genai.configure
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.action_space = action_space

        # note: google-generativeai chat contents accept "user"/"model" roles;
        # a "system" entry may be rejected by gemini-pro-vision
        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    {
                        "computer_13": SYS_PROMPT_ACTION,
                        "pyautogui": SYS_PROMPT_CODE
                    }[action_space]
                ]
            }
        ]

    def predict(self, obs: Dict):
        """
        Predict the next action(s) based on the current observation.
        """
        img = PIL.Image.open(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
                obs["instruction"]), img]
        })

        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["parts"][0])
            if len(self.trajectory[i]["parts"]) > 1:
                traj_to_show.append("screenshot_obs")

        print("Trajectory:", traj_to_show)

        # Gemini takes the token limit via generation_config, not a max_tokens kwarg
        response = self.model.generate_content(
            self.trajectory,
            generation_config=genai.types.GenerationConfig(max_output_tokens=self.max_tokens)
        )

        try:
            # Gemini responses expose the generated text as `response.text`,
            # not the OpenAI-style response.json()['choices'][0]['message']['content']
            actions = self.parse_actions(response.text)
        except Exception:
            # todo: add error handling
            print("Failed to parse action from response:", response)
            actions = None

        return actions

    def parse_actions(self, response: str):
        # response example
        """
        ```json
        {
            "action_type": "CLICK",
            "click_type": "RIGHT"
        }
        ```
        """

        # parse from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            raise ValueError("Unknown action space: " + self.action_space)

        # add action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })

        return actions
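A minimal driving sketch for the new agent (not part of the commit; the screenshot path and instruction are illustrative, and the key is read from an environment variable):

```python
import os

from mm_agents.gemini_agent import GeminiPro_Agent

agent = GeminiPro_Agent(api_key=os.environ["GOOGLE_API_KEY"], action_space="pyautogui")
obs = {"screenshot": "screenshot.png", "instruction": "Open Google Sheet"}
print(agent.predict(obs))
```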
@@ -1,19 +0,0 @@
import PIL.Image
import google.generativeai as genai

genai.configure(api_key="<REDACTED>")

# for m in genai.list_models():
#     if 'generateContent' in m.supported_generation_methods:
#         print(m.name)

model = genai.GenerativeModel('gemini-pro-vision')

img = PIL.Image.open('image.jpg')

messages = [
    {'role': 'user',
     'parts': ["Explain this image.", img]}
]

response = model.generate_content(messages)

mm_agents/gpt_4v_agent.py
@@ -1,12 +1,12 @@
# fixme: needs to be rewritten for the new action space

import os
import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import json
import re
from typing import Dict

import requests
-from mm_agents.gpt_4v_prompt import SYS_PROMPT
+from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
+from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE


# Function to encode the image
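The hunk cuts off before the body of `encode_image`. For context, a sketch of the usual form of such a helper (assumed; the implementation is not shown in this diff):

```python
import base64

def encode_image(image_path):
    # read the screenshot from disk and return it base64-encoded,
    # as the OpenAI vision endpoint expects for image_url payloads
    with open(image_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")
```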
@@ -47,11 +47,26 @@ def parse_actions_from_string(input_string):
        raise ValueError("Invalid response format: " + input_string)


+def parse_code_from_string(input_string):
+    # This regular expression will match both ```code``` and ```python code```
+    # and capture the `code` part. It uses a non-greedy match for the content inside.
+    pattern = r"```(?:\w+\s+)?(.*?)```"
+    # Find all non-overlapping matches in the string
+    matches = re.findall(pattern, input_string, re.DOTALL)
+
+    # The regex above captures the content inside the triple backticks.
+    # The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
+    # so the code inside backticks can span multiple lines.
+
+    # matches now contains all the captured code snippets
+    return matches
+
+
class GPT4v_Agent:
-    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
-        self.instruction = instruction
+    def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
        self.model = model
        self.max_tokens = max_tokens
+        self.action_space = action_space

        self.headers = {
            "Content-Type": "application/json",
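To illustrate what the new `parse_code_from_string` returns, a quick check with a made-up model reply:

```python
import re

pattern = r"```(?:\w+\s+)?(.*?)```"
response = "Sure, here you go:\n```python\nimport pyautogui\npyautogui.click(100, 200)\n```"
print(re.findall(pattern, response, re.DOTALL))
# -> ['import pyautogui\npyautogui.click(100, 200)\n']
```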
@@ -64,20 +79,27 @@ class GPT4v_Agent:
                "content": [
                    {
                        "type": "text",
-                        "text": SYS_PROMPT
+                        "text": {
+                            "computer_13": SYS_PROMPT_ACTION,
+                            "pyautogui": SYS_PROMPT_CODE
+                        }[action_space]
                    },
                ]
            }
        ]

-    def predict(self, obs):
-        base64_image = encode_image(obs)
+    def predict(self, obs: Dict):
+        """
+        Predict the next action(s) based on the current observation.
+        """
+        base64_image = encode_image(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "content": [
                {
                    "type": "text",
-                    "text": "What's the next step for instruction '{}'?".format(self.instruction)
+                    "text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
+                        obs["instruction"])
                },
                {
                    "type": "image_url",
@@ -87,12 +109,15 @@ class GPT4v_Agent:
                }
            ]
        })

        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
            if len(self.trajectory[i]["content"]) > 1:
                traj_to_show.append("screenshot_obs")

        print("Trajectory:", traj_to_show)

        payload = {
            "model": self.model,
            "messages": self.trajectory,
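The POST itself sits between these hunks and is not shown; presumably it is the standard chat-completions call, along the lines of:

```python
# assumed request step (not visible in this diff)
response = requests.post("https://api.openai.com/v1/chat/completions",
                         headers=self.headers, json=payload)
```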
@@ -103,6 +128,7 @@ class GPT4v_Agent:
        try:
            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
        except:
+            # todo: add error handling
            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
            actions = None
@@ -120,7 +146,10 @@ class GPT4v_Agent:
        """

        # parse from the response
-        actions = parse_actions_from_string(response)
+        if self.action_space == "computer_13":
+            actions = parse_actions_from_string(response)
+        elif self.action_space == "pyautogui":
+            actions = parse_code_from_string(response)

        # add action into the trajectory
        self.trajectory.append({
@@ -133,34 +162,4 @@ class GPT4v_Agent:
            ]
        })

-        # parse action
-        parsed_actions = []
-        for action in actions:
-            parsed_action = {}
-            action_type = Action[action['action_type']].value
-            parsed_action["action_type"] = action_type
-
-            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-                parsed_action["click_type"] = MouseClick[action['click_type']].value
-
-            if action_type == Action.MOUSE_MOVE.value:
-                parsed_action["x"] = action["x"]
-                parsed_action["y"] = action["y"]
-
-            if action_type == Action.KEY.value:
-                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys
-
-            if action_type == Action.TYPE.value:
-                parsed_action["text"] = action["text"]
-
-            parsed_actions.append(parsed_action)
-
-        return parsed_actions
-
-
-if __name__ == '__main__':
-    # OpenAI API Key
-    api_key = os.environ.get("OPENAI_API_KEY")
-
-    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
-    print(agent.predict(obs="stackoverflow.png"))
+        return actions
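Since the old `__main__` demo is deleted, here is a sketch of driving the class under its new signature (the observation values are illustrative, mirroring the removed demo):

```python
import os

from mm_agents.gpt_4v_agent import GPT4v_Agent

agent = GPT4v_Agent(api_key=os.environ.get("OPENAI_API_KEY"), action_space="computer_13")
obs = {"screenshot": "stackoverflow.png", "instruction": "Open Google Sheet"}
print(agent.predict(obs))
```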
@@ -1,52 +0,0 @@
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
Here is the description of the action space:

Firstly you need to predict the class of your action, select from one below:
- **MOUSE_MOVE**: move the mouse to a specific position
- **CLICK**: click on the screen
- **MOUSE_DOWN**: press the mouse button
- **MOUSE_UP**: release the mouse button
- **KEY**: press a key on the keyboard
- **KEY_DOWN**: press a key on the keyboard
- **KEY_UP**: release a key on the keyboard
- **TYPE**: type a string on the keyboard

Then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
for example, format as:
```
{
    "action_type": "MOUSE_MOVE",
    "x": 1319.11,
    "y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
{
    "action_type": "CLICK",
    "click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
for example, format as:
```
{
    "action_type": "TYPE",
    "text": [
        "w",
        "i",
        "k",
        "i",
        "p",
        "e",
        "d",
        "i",
        "a"
    ]
}
```

For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.

mm_agents/gpt_4v_prompt_action.py
@@ -1,19 +1,207 @@
SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation of an image, which is a screenshot of the computer screen, and you will predict the action of the computer based on the image.
Here is the description of the action space:

-Firstly you need to predict the class of your action, select from one below:
-- **MOUSE_MOVE**: move the mouse to a specific position
-- **CLICK**: click on the screen
-- **MOUSE_DOWN**: press the mouse button
-- **MOUSE_UP**: release the mouse button
-- **KEY**: press a key on the keyboard
-- **KEY_DOWN**: press a key on the keyboard
-- **KEY_UP**: release a key on the keyboard
-- **TYPE**: type a string on the keyboard
-
-Then you need to predict the parameters of your action:
+HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
+ACTION_SPACE = [
+    {
+        "action_type": "MOVE_TO",
+        "note": "move the cursor to the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "CLICK",
+        "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            },
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            },
+            "num_clicks": {
+                "type": int,
+                "range": [1, 2, 3],
+                "optional": True,
+            },
+        }
+    },
+    {
+        "action_type": "MOUSE_DOWN",
+        "note": "press the left button if the button not specified, otherwise press the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "MOUSE_UP",
+        "note": "release the left button if the button not specified, otherwise release the specified button",
+        "parameters": {
+            "button": {
+                "type": str,
+                "range": ["left", "right", "middle"],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "RIGHT_CLICK",
+        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DOUBLE_CLICK",
+        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": True,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": True,
+            }
+        }
+    },
+    {
+        "action_type": "DRAG_TO",
+        "note": "drag the cursor to the specified position with the left button pressed",
+        "parameters": {
+            "x": {
+                "type": float,
+                "range": [0, X_MAX],
+                "optional": False,
+            },
+            "y": {
+                "type": float,
+                "range": [0, Y_MAX],
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "SCROLL",
+        "note": "scroll the mouse wheel up or down",
+        "parameters": {
+            "dx": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            },
+            "dy": {
+                "type": int,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "TYPING",
+        "note": "type the specified text",
+        "parameters": {
+            "text": {
+                "type": str,
+                "range": None,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "PRESS",
+        "note": "press the specified key and release it",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_DOWN",
+        "note": "press the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "KEY_UP",
+        "note": "release the specified key",
+        "parameters": {
+            "key": {
+                "type": str,
+                "range": KEYBOARD_KEYS,
+                "optional": False,
+            }
+        }
+    },
+    {
+        "action_type": "HOTKEY",
+        "note": "press the specified key combination",
+        "parameters": {
+            "keys": {
+                "type": list,
+                "range": [KEYBOARD_KEYS],
+                "optional": False,
+            }
+        }
+    },
+    ############################################################################################################
+    {
+        "action_type": "WAIT",
+        "note": "wait until the next action",
+    },
+    {
+        "action_type": "FAIL",
+        "note": "decide the task can not be performed",
+    },
+    {
+        "action_type": "DONE",
+        "note": "decide the task is done",
+    }
+]
+Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
for example, format as:
```
@@ -48,7 +236,9 @@ for example, format as:
}
```

-For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
-You can predict multiple actions at one step, but you should only return one action for each step.
+REMEMBER:
+For every step, you should only return the action_type and the parameters of your action as a dict, without any other things.
+You MUST wrap the dict with backticks (\`).
+You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
+You CAN predict multiple actions at one step, but you should only return one action for each step.
"""
mm_agents/gpt_4v_prompt_code.py
@@ -4,5 +4,8 @@ For each step, you will get an observation of an image, which is the screenshot

You are required to use `pyautogui` to perform the action.
Return one line or multiple lines of python code to perform the action each time, be time efficient.
Return `None` if you cannot perform the action.

+When you think you have to wait for some time, return `WAIT`.
+When you think the task can not be done, return `FAIL`.
+When you think the task is done, return `DONE`.
"""
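Under this prompt, a well-formed reply is either runnable `pyautogui` code or one of the literal sentinels `WAIT`, `FAIL`, `DONE`. A plausible code reply (illustrative, not from the repo):

```python
import pyautogui

# right-click at (500, 300), then type a query
pyautogui.click(500, 300, button='right')
pyautogui.typewrite('hello world', interval=0.05)
```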