Add 'WAIT', 'FAIL', 'DONE' to the action space; Debug basic prompting-based GPT-4 and Gemini agents; Initialize experiments script;

2024-01-14 23:36:19 +08:00
parent d52b692ee5
commit f153a4c253
10 changed files with 482 additions and 144 deletions
--- a/mm_agents/gemini_agent.py
+++ b/mm_agents/gemini_agent.py
@@ -0,0 +1,84 @@
+from typing import Dict
+
+import PIL.Image
+import google.generativeai as genai
+
+from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
+from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
+from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
+
+
+class GeminiPro_Agent:
+    def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"):
+        genai.configure(api_key)
+        self.model = genai.GenerativeModel(model)
+        self.max_tokens = max_tokens
+        self.action_space = action_space
+
+        self.trajectory = [
+            {
+                "role": "system",
+                "parts": [
+                    {
+                        "computer_13": SYS_PROMPT_ACTION,
+                        "pyautogui": SYS_PROMPT_CODE
+                    }[action_space]
+                ]
+            }
+        ]
+
+    def predict(self, obs: Dict):
+        """
+        Predict the next action(s) based on the current observation.
+        """
+        img = PIL.Image.open(obs["screenshot"])
+        self.trajectory.append({
+            "role": "user",
+            "parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
+                obs["instruction"]), img]
+        })
+
+        traj_to_show = []
+        for i in range(len(self.trajectory)):
+            traj_to_show.append(self.trajectory[i]["parts"][0])
+            if len(self.trajectory[i]["parts"]) > 1:
+                traj_to_show.append("screenshot_obs")
+
+        print("Trajectory:", traj_to_show)
+
+        response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens)
+
+        try:
+            # fixme: change to fit the new response format from gemini pro
+            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
+        except:
+            # todo: add error handling
+            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
+            actions = None
+
+        return actions
+
+    def parse_actions(self, response: str):
+        # response example
+        """
+        ```json
+        {
+          "action_type": "CLICK",
+          "click_type": "RIGHT"
+        }
+        ```
+        """
+
+        # parse from the response
+        if self.action_space == "computer_13":
+            actions = parse_actions_from_string(response)
+        elif self.action_space == "pyautogui":
+            actions = parse_code_from_string(response)
+
+        # add action into the trajectory
+        self.trajectory.append({
+            "role": "assistant",
+            "parts": [response]
+        })
+
+        return actions