Run through gpt_4v agent pipeline

This commit is contained in:
Timothyxxx
2023-11-29 20:21:57 +08:00
parent 28c6edd6b3
commit 3d0d9d7758
8 changed files with 135 additions and 47 deletions

View File

@@ -2,6 +2,7 @@
## Setup guide
### For members of the team
1. Download OS image
    1. Download kubuntu from <https://kubuntu.org/getkubuntu/>
    2. Download ubuntu from <https://ubuntu.com/download/desktop>
@@ -22,7 +23,8 @@
2. `rm -rf ~/screenshot.png`
7. Set up Python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard); a minimal install sketch is shown below
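A minimal install sketch for step 7 (assuming a Debian-based guest; `mouse` and `keyboard` are the PyPI names of the libraries linked above):

```bash
# xdotool backs the XDoTool* controllers; mouse/keyboard back the Python* ones
sudo apt install -y xdotool python3-pip
pip3 install mouse keyboard
```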
### For users of the environment
todo
## Road map (Proposed)

View File

@@ -1,10 +1,11 @@
import requests
import json
class PythonController:
def __init__(self, http_server: str):
self.http_server = http_server
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
@@ -12,7 +13,7 @@ class PythonController:
headers = {
'Content-Type': 'application/json'
}
try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
if response.status_code == 200:
@@ -22,10 +23,11 @@ class PythonController:
except requests.exceptions.RequestException as e:
print("An error occurred while trying to execute the command:", e)
# example usage
if __name__ == '__main__':
# replace with the actual server URL of your VM
server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
# example commands
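# A hypothetical example (assumes pyautogui is installed inside the VM; the
# controller only relays the command string to the VM-side /execute endpoint):
controller._execute_python_command("import pyautogui; pyautogui.moveTo(150, 150)")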

View File

@@ -1,3 +1,4 @@
import os
from enum import Enum
from typing import Literal, List, Tuple
import subprocess
@@ -7,10 +8,13 @@ import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from PIL import Image
import uuid
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
PythonKeyboardController
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
class Action(Enum):
CLICK = 0
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
def __init__(
self,
path_to_vm: str,
username: str,
password: str = None,
host: str = "192.168.7.128:5000",
snapshot_path: str = "initial_state_with_env_set",
vm_os: VM_TYPE = "ubuntu"):
# The path to the vmx file of your vm
self.path_to_vm = path_to_vm
# username and password for your vm
self.username = username
self.password = password
self.host = host
self.snapshot_path = snapshot_path  # todo: handle the logic of the snapshot directory
# TODO: get the screen width and height from the vm, or standardize it
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
return mouse_controller, keyboard_controller
def _start_emulator(self):
# fixme: check if the vm is running
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
else:
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(5)
time.sleep(10)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
def _execute_command(self, command: List[str]) -> None:
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
stdout, stderr = process.communicate()
if process.returncode != 0:
print(f"Error executing command: {command}")
return None
else:
return stdout.decode()
subprocess.run(command, stderr=subprocess.STDOUT, timeout=60)  # command is an argv list, so no shell=True
def _save_state(self):
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
image_path = "./screenshot.png"
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
image_path])
# todo: hash it and store it in a temporary directory
random_uuid = str(uuid.uuid4())
os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
image_path = os.path.join("tmp", random_uuid, "screenshot.png")
if self.password:
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
image_path])
else:
self._execute_command(
["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
return image_path
def _get_obs(self):
screenshot_image_path = self._get_screenshot()
with Image.open(screenshot_image_path) as img:
return np.array(img)
return screenshot_image_path
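# Note: _get_obs now returns the screenshot's file path instead of a pixel
# array. A caller-side sketch for recovering the array (illustrative only):
#     from PIL import Image
#     import numpy as np
#     obs = np.array(Image.open(env._get_obs()))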
def reset(self):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
time.sleep(5)
print("Starting emulator...")
self._start_emulator()
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)
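# An illustrative call into this dispatch, assuming the Dict action space
# defined above (field names follow the accesses in this method):
#     env.step({"action_type": Action.MOUSE_MOVE, "x": 400, "y": 300})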

View File

@@ -11,13 +11,9 @@ def execute_command():
# Execute the command without any safety checks.
try:
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return jsonify({
'status': 'success',
'output': stdout.decode(),
'error': stderr.decode()
})
except Exception as e:
return jsonify({

gpt_4v_agent_exp.py Normal file
View File

@@ -0,0 +1,47 @@
import os
from pprint import pprint
from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent
def gpt_4v_agent():
api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")
env = DesktopEnv(
path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # automitically load the snapshot and start the vm
# path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
username="tianbaox",
password="951753",
# host="192.168.7.128",
host="http://192.168.13.128:5000",
vm_os="windows"
)
# reset the environment to certain snapshot
observation = env.reset()
done = False
while not done:
# todo: the action interface needs to be redesigned to support multiple actions per step
action = agent.predict(obs=observation)
print("Action:", action)
# fixme: step not working
observation, reward, done, info = env.step(action)
print("Observation:", observation)
print("Reward:", reward)
print("Info:", info)
print("================================\n")
if done:
print("The episode is done.")
break
env.close()
print("Environment closed.")
if __name__ == "__main__":
gpt_4v_agent()

View File

@@ -1,9 +1,10 @@
import os
import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
import json5
import json
import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
# Function to encode the image
def encode_image(image_path):
@@ -11,6 +12,32 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode('utf-8')
def parse_action_from_string(input_string):
# Search for a JSON string within the input string
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
matches = re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
try:
action_dict = json.loads(input_string)
return action_dict
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
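# A quick illustrative check of the parser on a fenced model reply:
#     reply = '```json\n{"action_type": "MOUSE_MOVE", "x": 100, "y": 200}\n```'
#     parse_action_from_string(reply)
#     # -> {'action_type': 'MOUSE_MOVE', 'x': 100, 'y': 200}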
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -22,18 +49,13 @@ class GPT4v_Agent:
"Authorization": f"Bearer {api_key}"
}
# load prompt from file
self.prompt = ""
with open("gpt_4v_prompt.txt", "r") as f:
self.prompt = f.read()
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
"text": self.prompt
"text": SYS_PROMPT
},
]
}
@@ -79,12 +101,7 @@ class GPT4v_Agent:
"""
# parse from the response
if response.startswith("```json"):
action = json5.loads(response[7:-3])
elif response.startswith("```"):
action = json5.loads(response[3:-3])
else:
action = json5.loads(response)
action = parse_action_from_string(response)
# add action into the trajectory
self.trajectory.append({

View File

@@ -1,3 +1,4 @@
SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation in the form of an image, which is a screenshot of the computer screen, and you will predict the next action based on that image.
Here is the description of the action space:
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
- **TYPE**: type a string on the keyboard
Then you need to predict the parameters of your action:
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
- For MOUSE_MOVE, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
for example, format as:
```
{
@@ -30,7 +31,7 @@ for example, format as:
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose one or more keys from the keyboard
for example, format as:
```
{
@@ -49,4 +50,6 @@ for example, format as:
}
```
For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
For every step, you should only return the action_type and the parameters of your action as a dict, and nothing else. You MUST wrap the dict with backticks (\`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
"""

Binary file not shown (image updated: 356 KiB before → 826 KiB after).