Fix the width and height of the VM so that the agent performs more accurately

2023-11-30 12:10:41 +08:00
parent ecb62d7eb4
commit e52ba2ab13
4 changed files with 134 additions and 62 deletions
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -6,20 +6,24 @@ import json
 import requests
 from mm_agents.gpt_4v_prompt import SYS_PROMPT

+
 # Function to encode the image
 def encode_image(image_path):
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')


-def parse_action_from_string(input_string):
+def parse_actions_from_string(input_string):
    # Search for a JSON string within the input string
+    actions = []
    matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
    if matches:
        # Assuming there's only one match, parse the JSON string into a dictionary
        try:
-            action_dict = json.loads(matches[0])
-            return action_dict
+            for match in matches:
+                action_dict = json.loads(match)
+                actions.append(action_dict)
+            return actions
        except json.JSONDecodeError as e:
            return f"Failed to parse JSON: {e}"
    else:
@@ -27,17 +31,20 @@ def parse_action_from_string(input_string):
        if matches:
            # Assuming there's only one match, parse the JSON string into a dictionary
            try:
-                action_dict = json.loads(matches[0])
-                return action_dict
+                for match in matches:
+                    action_dict = json.loads(match)
+                    actions.append(action_dict)
+                return actions
            except json.JSONDecodeError as e:
                return f"Failed to parse JSON: {e}"
        else:
            try:
                action_dict = json.loads(input_string)
-                return action_dict
+                return [action_dict]
            except json.JSONDecodeError as e:
                raise ValueError("Invalid response format: " + input_string)

+
 class GPT4v_Agent:
    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
        self.instruction = instruction
@@ -78,6 +85,10 @@ class GPT4v_Agent:
                }
            ]
        })
+        traj_to_show = []
+        for i in range(len(self.trajectory)):
+            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
+        print("Trajectory:", traj_to_show)
        payload = {
            "model": self.model,
            "messages": self.trajectory,
@@ -85,11 +96,15 @@ class GPT4v_Agent:
        }
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)

-        action = self.parse_action(response.json()['choices'][0]['message']['content'])
+        try:
+            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
+        except:
+            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
+            actions = None

-        return action
+        return actions

-    def parse_action(self, response: str):
+    def parse_actions(self, response: str):
        # response example
        """
        ```json
@@ -101,7 +116,7 @@ class GPT4v_Agent:
        """

        # parse from the response
-        action = parse_action_from_string(response)
+        actions = parse_actions_from_string(response)

        # add action into the trajectory
        self.trajectory.append({
@@ -115,25 +130,28 @@ class GPT4v_Agent:
        })

        # parse action
-        parsed_action = {}
-        action_type = Action[action['action_type']].value
-        parsed_action["action_type"] = action_type
+        parsed_actions = []
+        for action in actions:
+            parsed_action = {}
+            action_type = Action[action['action_type']].value
+            parsed_action["action_type"] = action_type

-        if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-            parsed_action["click_type"] = MouseClick[action['click_type']].value
+            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
+                parsed_action["click_type"] = MouseClick[action['click_type']].value

-        if action_type == Action.MOUSE_MOVE.value:
-            parsed_action["x"] = action["x"]
-            parsed_action["y"] = action["y"]
+            if action_type == Action.MOUSE_MOVE.value:
+                parsed_action["x"] = action["x"]
+                parsed_action["y"] = action["y"]

-        # fixme: could these two actions be merged??
-        if action_type == Action.KEY.value:
-            parsed_action["key"] = [ord(c) for c in action["key"]]
+            if action_type == Action.KEY.value:
+                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys

-        if action_type == Action.TYPE.value:
-            parsed_action["text"] = [ord(c) for c in action["text"]]
+            if action_type == Action.TYPE.value:
+                parsed_action["text"] = action["text"]

-        return parsed_action
+            parsed_actions.append(parsed_action)
+
+        return parsed_actions


 if __name__ == '__main__':
@@ -142,4 +160,3 @@ if __name__ == '__main__':

    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
    print(agent.predict(obs="stackoverflow.png"))
-