Run through gpt_4v agent pipeline

This commit is contained in:
Timothyxxx
2023-11-29 20:21:57 +08:00
parent 28c6edd6b3
commit 3d0d9d7758
8 changed files with 135 additions and 47 deletions

View File

@@ -2,6 +2,7 @@
## Setup guide
### For members of the team
1. Download OS image
    1. Download kubuntu from <https://kubuntu.org/getkubuntu/>
    2. Download ubuntu from <https://ubuntu.com/download/desktop>
@@ -22,7 +23,8 @@
2. `rm -rf ~/screenshot.png`
7. Set up Python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard); a minimal install sketch is shown below
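A minimal install sketch for step 7 (assuming a Debian-based guest; `mouse` and `keyboard` are the PyPI names of the libraries linked above):

```bash
# xdotool backs the XDoTool* controllers; mouse/keyboard back the Python* ones
sudo apt install -y xdotool python3-pip
pip3 install mouse keyboard
```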
### For users of the environment
todo
## Road map (Proposed)

View File

@@ -1,10 +1,11 @@
import requests
import json
class PythonController:
def __init__(self, http_server: str):
self.http_server = http_server
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
@@ -12,7 +13,7 @@ class PythonController:
headers = {
'Content-Type': 'application/json'
}
try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
if response.status_code == 200:
@@ -22,10 +23,11 @@ class PythonController:
except requests.exceptions.RequestException as e:
print("An error occurred while trying to execute the command:", e)
# example usage
if __name__ == '__main__':
# replace with the actual server URL of your VM
server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
# example commands
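# A hypothetical example (assumes pyautogui is installed inside the VM; the
# controller only relays the command string to the VM-side /execute endpoint):
controller._execute_python_command("import pyautogui; pyautogui.moveTo(150, 150)")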

View File

@@ -1,3 +1,4 @@
import os
from enum import Enum
from typing import Literal, List, Tuple
import subprocess
@@ -7,10 +8,13 @@ import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from PIL import Image
import uuid
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
PythonKeyboardController
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
class Action(Enum):
CLICK = 0
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
def __init__(
self,
path_to_vm: str,
username: str,
password: str = None,
host: str = "192.168.7.128:5000",
snapshot_path: str = "initial_state_with_env_set",
vm_os: VM_TYPE = "ubuntu"):
# The path to the vmx file of your vm
self.path_to_vm = path_to_vm
# username and password for your vm
self.username = username
self.password = password
self.host = host
self.snapshot_path = snapshot_path  # todo: handle the logic of the snapshot directory
# TODO: get the screen width and height from the vm, or standardize it
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
return mouse_controller, keyboard_controller
def _start_emulator(self):
# fixme: check if the vm is running
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
else:
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(5)
time.sleep(10)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
def _execute_command(self, command: List[str]) -> None:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error executing command: {command}")
return None
else:
return stdout.decode()
subprocess.run(command, stderr=subprocess.STDOUT, timeout=60)  # command is an argv list, so no shell=True
def _save_state(self):
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
image_path = "./screenshot.png"
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
image_path])
# todo: hash it and store it in a temporary directory
random_uuid = str(uuid.uuid4())
os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
image_path = os.path.join("tmp", random_uuid, "screenshot.png")
if self.password:
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
image_path])
else:
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
return image_path
def _get_obs(self):
screenshot_image_path = self._get_screenshot()
with Image.open(screenshot_image_path) as img:
return np.array(img)
return screenshot_image_path
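# Note: _get_obs now returns the screenshot's file path instead of a pixel
# array. A caller-side sketch for recovering the array (illustrative only):
#     from PIL import Image
#     import numpy as np
#     obs = np.array(Image.open(env._get_obs()))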
def reset(self):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
time.sleep(5)
print("Starting emulator...")
self._start_emulator()
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)
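# An illustrative call into this dispatch, assuming the Dict action space
# defined above (field names follow the accesses in this method):
#     env.step({"action_type": Action.MOUSE_MOVE, "x": 400, "y": 300})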

View File

@@ -11,13 +11,9 @@ def execute_command():
# Execute the command without any safety checks.
try:
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return jsonify({
'status': 'success',
'output': stdout.decode(),
'error': stderr.decode()
})
except Exception as e:
return jsonify({

gpt_4v_agent_exp.py Normal file
View File

@@ -0,0 +1,47 @@
import os
from pprint import pprint
from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent
def gpt_4v_agent():
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")
env = DesktopEnv(
path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # automitically load the snapshot and start the vm
# path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
username="tianbaox",
password="951753",
# host="192.168.7.128",
host="http://192.168.13.128:5000",
vm_os="windows"
)
# reset the environment to certain snapshot
observation = env.reset()
done = False
while not done:
# todo: the action interface needs to be redesigned to support multiple actions per step
action = agent.predict(obs=observation)
print("Action:", action)
# fixme: step not working
observation, reward, done, info = env.step(action)
print("Observation:", observation)
print("Reward:", reward)
print("Info:", info)
print("================================\n")
if done:
print("The episode is done.")
break
env.close()
print("Environment closed.")
if __name__ == "__main__":
gpt_4v_agent()

View File

@@ -1,9 +1,10 @@
import os
import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import json5
import json
import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
# Function to encode the image
def encode_image(image_path):
@@ -11,6 +12,32 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode('utf-8')
def parse_action_from_string(input_string):
# Search for a JSON string within the input string
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
try:
action_dict = json.loads(input_string)
return action_dict
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
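# A quick illustrative check of the parser on a fenced model reply:
#     reply = '```json\n{"action_type": "MOUSE_MOVE", "x": 100, "y": 200}\n```'
#     parse_action_from_string(reply)
#     # -> {'action_type': 'MOUSE_MOVE', 'x': 100, 'y': 200}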
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -22,18 +49,13 @@ class GPT4v_Agent:
"Authorization": f"Bearer {api_key}"
}
# load prompt from file
self.prompt = ""
with open("gpt_4v_prompt.txt", "r") as f:
self.prompt = f.read()
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
"text": self.prompt
"text": SYS_PROMPT
},
]
}
@@ -79,12 +101,7 @@ class GPT4v_Agent:
"""
# parse from the response
if response.startswith("```json"):
action = json5.loads(response[7:-3])
elif response.startswith("```"):
action = json5.loads(response[3:-3])
else:
action = json5.loads(response)
action = parse_action_from_string(response)
# add action into the trajectory
self.trajectory.append({

View File

@@ -1,3 +1,4 @@
SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation in the form of an image, which is a screenshot of the computer screen, and you will predict the next action based on that image.
Here is the description of the action space:
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
- **TYPE**: type a string on the keyboard
Then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
- For MOUSE_MOVE, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
for example, format as:
```
{
@@ -30,7 +31,7 @@ for example, format as:
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose one or more keys from the keyboard
for example, format as:
```
{
@@ -49,4 +50,6 @@ for example, format as:
}
```
For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
For every step, you should only return the action_type and the parameters of your action as a dict, and nothing else. You MUST wrap the dict with backticks (\`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
"""

Binary file not shown (image updated: 356 KiB before → 826 KiB after).