Run through gpt_4v agent pipeline
This commit is contained in:
@@ -2,6 +2,7 @@
|
||||
|
||||
## Setup guide
|
||||
|
||||
### For members of the team
|
||||
1. Download OS image
|
||||
1. Download kubuntu from <https://kubuntu.org/getkubuntu/>
|
||||
2. Download ubuntu from <https://ubuntu.com/download/desktop>
|
||||
@@ -22,7 +23,8 @@
|
||||
2. `rm -rf ~/screenshot.png`
|
||||
7. Set up python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard)
|
||||
|
||||
|
||||
### For users of the environment
|
||||
todo
|
||||
|
||||
## Road map (Proposed)
|
||||
|
||||
|
||||
@@ -1,10 +1,11 @@
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
||||
class PythonController:
|
||||
def __init__(self, http_server: str):
|
||||
self.http_server = http_server
|
||||
|
||||
|
||||
def _execute_python_command(self, command: str) -> None:
|
||||
payload = json.dumps({
|
||||
"command": command
|
||||
@@ -12,7 +13,7 @@ class PythonController:
|
||||
headers = {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
|
||||
if response.status_code == 200:
|
||||
@@ -22,10 +23,11 @@ class PythonController:
|
||||
except requests.exceptions.RequestException as e:
|
||||
print("An error occurred while trying to execute the command:", e)
|
||||
|
||||
|
||||
# example usage
|
||||
if __name__ == '__main__':
|
||||
# replace with your actual server URL of the vm
|
||||
server_url = "http://192.168.7.129:5000"
|
||||
server_url = "http://192.168.7.129:5000"
|
||||
controller = PythonController(server_url)
|
||||
|
||||
# example commands
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
from enum import Enum
|
||||
from typing import Literal, List, Tuple
|
||||
import subprocess
|
||||
@@ -7,10 +8,13 @@ import time
|
||||
import gymnasium as gym
|
||||
from gymnasium import spaces
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import uuid
|
||||
|
||||
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
|
||||
PythonMouseController
|
||||
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
|
||||
PythonKeyboardController
|
||||
|
||||
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
|
||||
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
|
||||
|
||||
class Action(Enum):
|
||||
CLICK = 0
|
||||
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
|
||||
class DesktopEnv(gym.Env):
|
||||
"""DesktopEnv with OpenAI Gym interface."""
|
||||
|
||||
def __init__(self, path_to_vm: str, username: str, password: str,
|
||||
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
|
||||
def __init__(
|
||||
self,
|
||||
path_to_vm: str,
|
||||
username: str,
|
||||
password: str = None,
|
||||
host: str = "192.168.7.128:5000",
|
||||
snapshot_path: str = "initial_state_with_env_set",
|
||||
vm_os: VM_TYPE = "ubuntu"):
|
||||
# The path to the vmx file of your vm
|
||||
self.path_to_vm = path_to_vm
|
||||
|
||||
# username and password for your vm
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
self.host = host
|
||||
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
|
||||
|
||||
# TODO: get the screen width and height from the vm, or standardize it
|
||||
self.screen_width = 800
|
||||
self.screen_height = 800
|
||||
# Define the action and observation space
|
||||
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
|
||||
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
|
||||
})
|
||||
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
|
||||
dtype=np.uint8)
|
||||
|
||||
# Additional setup
|
||||
self.metadata = {'render.modes': ['rgb_array']}
|
||||
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
|
||||
return mouse_controller, keyboard_controller
|
||||
|
||||
def _start_emulator(self):
|
||||
# fixme: check if the vm is running
|
||||
while True:
|
||||
try:
|
||||
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
|
||||
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
|
||||
else:
|
||||
print("Starting VM...")
|
||||
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
|
||||
time.sleep(5)
|
||||
time.sleep(10)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error executing command: {e.output.decode().strip()}")
|
||||
|
||||
def _execute_command(self, command: List[str]) -> None:
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
stdout, stderr = process.communicate()
|
||||
if process.returncode != 0:
|
||||
print(f"Error executing command: {command}")
|
||||
return None
|
||||
else:
|
||||
return stdout.decode()
|
||||
subprocess.run(command, shell=True, stderr=subprocess.STDOUT, timeout=60)
|
||||
|
||||
def _save_state(self):
|
||||
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
|
||||
|
||||
def _get_screenshot(self):
|
||||
image_path = "./screenshot.png"
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
|
||||
image_path])
|
||||
# todo: hash it and store it in a temporary directory
|
||||
|
||||
random_uuid = str(uuid.uuid4())
|
||||
os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
|
||||
image_path = os.path.join("tmp", random_uuid, "screenshot.png")
|
||||
|
||||
if self.password:
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
|
||||
image_path])
|
||||
else:
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
|
||||
|
||||
return image_path
|
||||
|
||||
def _get_obs(self):
|
||||
screenshot_image_path = self._get_screenshot()
|
||||
with Image.open(screenshot_image_path) as img:
|
||||
return np.array(img)
|
||||
return screenshot_image_path
|
||||
|
||||
def reset(self):
|
||||
print("Resetting environment...")
|
||||
|
||||
print("Reverting to snapshot to {}...".format(self.snapshot_path))
|
||||
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
|
||||
time.sleep(5)
|
||||
|
||||
print("Starting emulator...")
|
||||
self._start_emulator()
|
||||
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
|
||||
elif click == MouseClick.WHEEL_DOWN:
|
||||
self.mouse_controller.scroll_down()
|
||||
elif action_type == Action.MOUSE_MOVE:
|
||||
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
|
||||
self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
|
||||
elif action_type == Action.KEY:
|
||||
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
|
||||
self.keyboard_controller.key(key_sequence)
|
||||
|
||||
@@ -11,13 +11,9 @@ def execute_command():
|
||||
|
||||
# Execute the command without any safety checks.
|
||||
try:
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
return jsonify({
|
||||
'status': 'success',
|
||||
'output': stdout.decode(),
|
||||
'error': stderr.decode()
|
||||
})
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
|
||||
47
gpt_4v_agent_exp.py
Normal file
47
gpt_4v_agent_exp.py
Normal file
@@ -0,0 +1,47 @@
|
||||
import os
|
||||
from pprint import pprint
|
||||
from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
|
||||
from mm_agents.gpt_4v_agent import GPT4v_Agent
|
||||
|
||||
|
||||
def gpt_4v_agent():
    """Run one GPT-4V agent episode against a VM-backed desktop environment.

    Creates a ``GPT4v_Agent`` with the instruction "Clear the recycle bin.",
    resets a ``DesktopEnv`` and then repeatedly feeds the latest observation
    to the agent and steps the environment with the predicted action until
    the environment reports ``done``.

    The environment is always closed on exit, including when the agent or
    the environment raises part-way through the episode.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")

    # NOTE(review): the VM path, guest credentials and host are hard-coded for
    # one developer machine; consider moving them to configuration or
    # environment variables before sharing this script.
    env = DesktopEnv(
        # automatically load the snapshot and start the vm
        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
        # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        username="tianbaox",
        password="951753",
        # host="192.168.7.128",
        host="http://192.168.13.128:5000",
        vm_os="windows"
    )

    try:
        # reset the environment to certain snapshot
        observation = env.reset()
        done = False

        while not done:
            # todo: action needs to be redesigned, need to support multiple actions at one step
            action = agent.predict(obs=observation)
            print("Action:", action)

            # fixme: step not working
            observation, reward, done, info = env.step(action)
            print("Observation:", observation)
            print("Reward:", reward)
            print("Info:", info)

            print("================================\n")

            if done:
                # The loop condition ends the episode; no explicit break needed.
                print("The episode is done.")
    finally:
        # Release the environment even if predict()/step() raised.
        env.close()
        print("Environment closed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
gpt_4v_agent()
|
||||
@@ -1,9 +1,10 @@
|
||||
import os
|
||||
import re
|
||||
import base64
|
||||
from desktop_env.envs.desktop_env import Action, MouseClick
|
||||
import json5
|
||||
import json
|
||||
import requests
|
||||
|
||||
from mm_agents.gpt_4v_prompt import SYS_PROMPT
|
||||
|
||||
# Function to encode the image
|
||||
def encode_image(image_path):
|
||||
@@ -11,6 +12,32 @@ def encode_image(image_path):
|
||||
return base64.b64encode(image_file.read()).decode('utf-8')
|
||||
|
||||
|
||||
def parse_action_from_string(input_string):
    """Extract and parse the JSON action contained in a model response.

    The response is searched, in order, for:

    1. a fenced block tagged ``json`` (three backticks followed by ``json``),
    2. an untagged fenced block (three backticks),
    3. otherwise the whole string is treated as JSON.

    Only the first fenced block of the first matching kind is parsed.

    Args:
        input_string: Raw text returned by the model.

    Returns:
        The value produced by ``json.loads`` for the extracted JSON text
        (a dict for a JSON object).

    Raises:
        ValueError: If the selected text is not valid JSON. Every failure
            path raises, so callers never receive an error message in place
            of a parsed action.
    """
    # Tagged fences are tried first so that a ```json block is not captured
    # by the generic pattern together with its "json" tag.
    fence_patterns = (
        r'```json\s+(.*?)\s+```',
        r'```\s+(.*?)\s+```',
    )

    for pattern in fence_patterns:
        matches = re.findall(pattern, input_string, re.DOTALL)
        if not matches:
            continue
        # Assuming there's only one match, parse the JSON string
        try:
            return json.loads(matches[0])
        except json.JSONDecodeError as e:
            raise ValueError(f"Failed to parse JSON: {e}") from e

    # No fenced block found: the response itself must be the JSON payload.
    try:
        return json.loads(input_string)
    except json.JSONDecodeError as e:
        raise ValueError("Invalid response format: " + input_string) from e
|
||||
|
||||
class GPT4v_Agent:
|
||||
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
|
||||
self.instruction = instruction
|
||||
@@ -22,18 +49,13 @@ class GPT4v_Agent:
|
||||
"Authorization": f"Bearer {api_key}"
|
||||
}
|
||||
|
||||
# load prompt from file
|
||||
self.prompt = ""
|
||||
with open("gpt_4v_prompt.txt", "r") as f:
|
||||
self.prompt = f.read()
|
||||
|
||||
self.trajectory = [
|
||||
{
|
||||
"role": "system",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": self.prompt
|
||||
"text": SYS_PROMPT
|
||||
},
|
||||
]
|
||||
}
|
||||
@@ -79,12 +101,7 @@ class GPT4v_Agent:
|
||||
"""
|
||||
|
||||
# parse from the response
|
||||
if response.startswith("```json"):
|
||||
action = json5.loads(response[7:-3])
|
||||
elif response.startswith("```"):
|
||||
action = json5.loads(response[3:-3])
|
||||
else:
|
||||
action = json5.loads(response)
|
||||
action = parse_action_from_string(response)
|
||||
|
||||
# add action into the trajectory
|
||||
self.trajectory.append({
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
SYS_PROMPT = """
|
||||
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
|
||||
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image.
|
||||
Here is the description of the action space:
|
||||
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
|
||||
- **TYPE**: type a string on the keyboard
|
||||
|
||||
Then you need to predict the parameters of your action:
|
||||
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
|
||||
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
@@ -30,7 +31,7 @@ for example, format as:
|
||||
"click_type": "LEFT"
|
||||
}
|
||||
```
|
||||
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
|
||||
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
@@ -49,4 +50,6 @@ for example, format as:
|
||||
}
|
||||
```
|
||||
|
||||
For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
|
||||
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
|
||||
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
|
||||
"""
|
||||
BIN
screenshot.png
BIN
screenshot.png
Binary file not shown.
|
Before Width: | Height: | Size: 356 KiB After Width: | Height: | Size: 826 KiB |
Reference in New Issue
Block a user