Fix conflicts

2023-12-16 21:32:43 +08:00
parent 7ab3799d30 fe2e5332a7
commit 30064ff816
43 changed files with 4124 additions and 631 deletions
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,8 +1,12 @@
+# fixme: Needs to be rewritten for the new action space
+
 import os
+import re
 import base64
 from desktop_env.envs.desktop_env import Action, MouseClick
-import json5
+import json
 import requests
+from mm_agents.gpt_4v_prompt import SYS_PROMPT


 # Function to encode the image
@@ -11,6 +15,38 @@ def encode_image(image_path):
        return base64.b64encode(image_file.read()).decode('utf-8')


+def parse_actions_from_string(input_string):
+    """Extract the JSON action(s) embedded in a model response.
+
+    The response is searched, in order of preference, for:
+      1. fenced blocks tagged ```json,
+      2. untagged fenced blocks,
+      3. otherwise the whole string is parsed as one JSON document.
+
+    Returns:
+        A list with one parsed JSON value per fenced block, or a
+        single-element list when the whole string is parsed.
+
+    Raises:
+        ValueError: if the selected text is not valid JSON.
+    """
+    # The first fence pattern that matches anything decides which blocks are
+    # parsed; tagged fences take priority over untagged ones.
+    for pattern in (r'```json\s+(.*?)\s+```', r'```\s+(.*?)\s+```'):
+        matches = re.findall(pattern, input_string, re.DOTALL)
+        if matches:
+            try:
+                return [json.loads(match) for match in matches]
+            except json.JSONDecodeError as e:
+                # Raise rather than return the message: a returned string
+                # would be iterated by the caller as if it were a list of
+                # actions. This also matches the unfenced branch below.
+                raise ValueError(f"Failed to parse JSON: {e}") from e
+
+    try:
+        return [json.loads(input_string)]
+    except json.JSONDecodeError as e:
+        raise ValueError("Invalid response format: " + input_string) from e
+
+
 class GPT4v_Agent:
    def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
        self.instruction = instruction
@@ -22,18 +58,13 @@ class GPT4v_Agent:
            "Authorization": f"Bearer {api_key}"
        }

-        # load prompt from file
-        self.prompt = ""
-        with open("gpt_4v_prompt.txt", "r") as f:
-            self.prompt = f.read()
-
        self.trajectory = [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
-                        "text": self.prompt
+                        "text": SYS_PROMPT
                    },
                ]
            }
@@ -56,6 +87,12 @@ class GPT4v_Agent:
                }
            ]
        })
+        traj_to_show = []
+        for i in range(len(self.trajectory)):
+            traj_to_show.append(self.trajectory[i]["content"][0]["text"])
+            if len(self.trajectory[i]["content"]) > 1:
+                traj_to_show.append("screenshot_obs")
+        print("Trajectory:", traj_to_show)
        payload = {
            "model": self.model,
            "messages": self.trajectory,
@@ -63,11 +100,15 @@ class GPT4v_Agent:
        }
        response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)

-        action = self.parse_action(response.json()['choices'][0]['message']['content'])
+        try:
+            actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
+        except:
+            print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
+            actions = None

-        return action
+        return actions

-    def parse_action(self, response: str):
+    def parse_actions(self, response: str):
        # response example
        """
        ```json
@@ -79,12 +120,7 @@ class GPT4v_Agent:
        """

        # parse from the response
-        if response.startswith("```json"):
-            action = json5.loads(response[7:-3])
-        elif response.startswith("```"):
-            action = json5.loads(response[3:-3])
-        else:
-            action = json5.loads(response)
+        actions = parse_actions_from_string(response)

        # add action into the trajectory
        self.trajectory.append({
@@ -98,25 +134,28 @@ class GPT4v_Agent:
        })

        # parse action
-        parsed_action = {}
-        action_type = Action[action['action_type']].value
-        parsed_action["action_type"] = action_type
+        parsed_actions = []
+        for action in actions:
+            parsed_action = {}
+            action_type = Action[action['action_type']].value
+            parsed_action["action_type"] = action_type

-        if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
-            parsed_action["click_type"] = MouseClick[action['click_type']].value
+            if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
+                parsed_action["click_type"] = MouseClick[action['click_type']].value

-        if action_type == Action.MOUSE_MOVE.value:
-            parsed_action["x"] = action["x"]
-            parsed_action["y"] = action["y"]
+            if action_type == Action.MOUSE_MOVE.value:
+                parsed_action["x"] = action["x"]
+                parsed_action["y"] = action["y"]

-        # fixme: could these two actions be merged??
-        if action_type == Action.KEY.value:
-            parsed_action["key"] = [ord(c) for c in action["key"]]
+            if action_type == Action.KEY.value:
+                parsed_action["key"] = action["key"]  # handle the condition of single key and multiple keys

-        if action_type == Action.TYPE.value:
-            parsed_action["text"] = [ord(c) for c in action["text"]]
+            if action_type == Action.TYPE.value:
+                parsed_action["text"] = action["text"]

-        return parsed_action
+            parsed_actions.append(parsed_action)
+
+        return parsed_actions


 if __name__ == '__main__':
@@ -125,4 +164,3 @@ if __name__ == '__main__':

    agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
    print(agent.predict(obs="stackoverflow.png"))
-
--- a/mm_agents/gpt_4v_prompt_action.py
+++ b/mm_agents/gpt_4v_prompt_action.py
@@ -0,0 +1,54 @@
+# System prompt for the JSON action-space agent: it lists the action types and
+# shows the JSON shape expected for each one.
+# NOTE(review): the closing paragraph both permits multiple actions per step
+# and asks for exactly one action per step; confirm which is intended.
+SYS_PROMPT = """
+You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
+For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
+Here is the description of the action space:
+
+Firstly you need to predict the class of your action, select from one below:
+- **MOUSE_MOVE**: move the mouse to a specific position
+- **CLICK**: click on the screen
+- **MOUSE_DOWN**: press the mouse button
+- **MOUSE_UP**: release the mouse button
+- **KEY**: press a key on the keyboard
+- **KEY_DOWN**: press a key on the keyboard
+- **KEY_UP**: release a key on the keyboard
+- **TYPE**: type a string on the keyboard
+
+Then you need to predict the parameters of your action:
+- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+for example, format as:
+```
+{
+  "action_type": "MOUSE_MOVE",
+  "x": 1319.11,
+  "y": 65.06
+}
+```
+- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
+for example, format as:
+```
+{
+  "action_type": "CLICK",
+  "click_type": "LEFT"
+}
+```
+- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
+for example, format as:
+```
+{
+  "action_type": "KEY",
+  "key": "ctrl+c"
+}
+```
+- For TYPE, you need to specify the text you want to type
+for example, format as:
+```
+{
+  "action_type": "TYPE",
+  "text": "hello world"
+}
+```
+
+For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\\`).
+You can predict multiple actions at one step, but you should only return one action for each step.
+You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
+"""
--- a/mm_agents/gpt_4v_prompt_code.py
+++ b/mm_agents/gpt_4v_prompt_code.py
@@ -0,0 +1,8 @@
+# System prompt for the code-generation variant of the agent: the model is
+# told to answer with `pyautogui` Python code, or `None` when it cannot act.
+SYS_PROMPT = """
+You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
+For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
+
+You are required to use `pyautogui` to perform the action. 
+Return one line or multiple lines of python code to perform the action each time, be time efficient.
+Return `None` if you cannot perform the action.
+"""
--- a/mm_agents/sam_test.py
+++ b/mm_agents/sam_test.py
@@ -0,0 +1,124 @@
+import torch
+from PIL import Image
+import requests
+from transformers import SamModel, SamProcessor
+import numpy as np
+import matplotlib.pyplot as plt
+import os
+os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
+
+def show_mask(mask, ax, random_color=False):
+    """Draw ``mask`` on ``ax`` as a tinted, partially transparent image.
+
+    The last two dimensions of ``mask`` are taken as height and width. The
+    tint is a fixed blue unless ``random_color`` is true, in which case the
+    RGB part is drawn at random; alpha is 0.6 either way.
+    """
+    if random_color:
+        rgb = np.random.random(3)
+    else:
+        rgb = np.array([30 / 255, 144 / 255, 255 / 255])
+    rgba = np.concatenate([rgb, np.array([0.6])], axis=0)
+
+    height, width = mask.shape[-2:]
+    # Broadcasting (h, w, 1) against (1, 1, 4) yields an (h, w, 4) RGBA image.
+    overlay = mask.reshape(height, width, 1) * rgba.reshape(1, 1, -1)
+    ax.imshow(overlay)
+
+
+def show_box(box, ax):
+    """Add a green, unfilled rectangle for ``box`` to ``ax``.
+
+    ``box`` is indexed as ``[x0, y0, x1, y1]``; width and height are derived
+    from the difference between the two corners.
+    """
+    left, top, right, bottom = box[0], box[1], box[2], box[3]
+    outline = plt.Rectangle(
+        (left, top),
+        right - left,
+        bottom - top,
+        edgecolor='green',
+        facecolor=(0, 0, 0, 0),
+        lw=2,
+    )
+    ax.add_patch(outline)
+
+
+def show_boxes_on_image(raw_image, boxes):
+    """Display ``raw_image`` in a new 10x10 figure with every box outlined."""
+    plt.figure(figsize=(10, 10))
+    plt.imshow(raw_image)
+    current_axes = plt.gca()
+    for rect in boxes:
+        show_box(rect, current_axes)
+    plt.axis('on')
+    plt.show()
+
+
+def show_points_on_image(raw_image, input_points, input_labels=None):
+    """Display ``raw_image`` in a new 10x10 figure with the points marked.
+
+    When ``input_labels`` is omitted, every point gets label 1.
+    """
+    plt.figure(figsize=(10, 10))
+    plt.imshow(raw_image)
+    input_points = np.array(input_points)
+    labels = np.ones_like(input_points[:, 0]) if input_labels is None else np.array(input_labels)
+    show_points(input_points, labels, plt.gca())
+    plt.axis('on')
+    plt.show()
+
+
+def show_points_and_boxes_on_image(raw_image, boxes, input_points, input_labels=None):
+    """Display ``raw_image`` in a new 10x10 figure with points and boxes drawn.
+
+    Points are drawn first via ``show_points``, then each box via
+    ``show_box``. When ``input_labels`` is omitted, every point gets label 1.
+
+    This function used to be defined twice in a row with identical bodies;
+    only one definition is kept.
+    """
+    plt.figure(figsize=(10, 10))
+    plt.imshow(raw_image)
+    input_points = np.array(input_points)
+    if input_labels is None:
+        labels = np.ones_like(input_points[:, 0])
+    else:
+        labels = np.array(input_labels)
+    show_points(input_points, labels, plt.gca())
+    for box in boxes:
+        show_box(box, plt.gca())
+    plt.axis('on')
+    plt.show()
+
+
+def show_points(coords, labels, ax, marker_size=375):
+    """Scatter the points in ``coords`` on ``ax`` as star markers.
+
+    Rows whose label equals 1 are drawn in green, rows whose label equals 0
+    in red; column 0 is used as x and column 1 as y.
+    """
+    for label_value, colour in ((1, 'green'), (0, 'red')):
+        selected = coords[labels == label_value]
+        ax.scatter(selected[:, 0], selected[:, 1], color=colour, marker='*', s=marker_size, edgecolor='white',
+                   linewidth=1.25)
+
+
+def show_masks_on_image(raw_image, masks, scores):
+    """Show each mask over ``raw_image`` in its own subplot, titled with its score.
+
+    ``masks`` is squeezed when it has four dimensions and ``scores`` when its
+    leading dimension is 1. One subplot column is created per entry along the
+    last dimension of ``scores``.
+    """
+    if len(masks.shape) == 4:
+        masks = masks.squeeze()
+    if scores.shape[0] == 1:
+        scores = scores.squeeze()
+
+    num_panels = scores.shape[-1]
+    _, panels = plt.subplots(1, num_panels, figsize=(15, 15))
+
+    for idx, (single_mask, single_score) in enumerate(zip(masks, scores)):
+        panel = panels[idx]
+        single_mask = single_mask.cpu().detach()
+        panel.imshow(np.array(raw_image))
+        show_mask(single_mask, panel)
+        panel.title.set_text(f"Mask {idx + 1}, Score: {single_score.item():.3f}")
+        panel.axis("off")
+    plt.show()
+
+
+# Demo script: load SAM, download a sample image, run the model once, and
+# plot the predicted masks with their scores.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
+processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
+
+img_url = "https://huggingface.co/ybelkada/segment-anything/resolve/main/assets/car.png"
+# Bound the download and check the HTTP status so a stalled connection or an
+# error page fails here instead of hanging or confusing the image decoder.
+response = requests.get(img_url, stream=True, timeout=30)
+response.raise_for_status()
+raw_image = Image.open(response.raw).convert("RGB")
+
+plt.imshow(raw_image)
+
+inputs = processor(raw_image, return_tensors="pt").to(device)
+with torch.no_grad():
+    outputs = model(**inputs)
+
+masks = processor.image_processor.post_process_masks(
+    outputs.pred_masks.cpu(), inputs["original_sizes"].cpu(), inputs["reshaped_input_sizes"].cpu()
+)
+
+
+scores = outputs.iou_scores
+show_masks_on_image(raw_image, masks[0], scores)