Run through gpt_4v agent pipeline

This commit is contained in:
Timothyxxx
2023-11-29 20:21:57 +08:00
parent 28c6edd6b3
commit 3d0d9d7758
8 changed files with 135 additions and 47 deletions

View File

@@ -1,10 +1,11 @@
import requests
import json
class PythonController:
def __init__(self, http_server: str):
self.http_server = http_server
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
@@ -12,7 +13,7 @@ class PythonController:
headers = {
'Content-Type': 'application/json'
}
try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
if response.status_code == 200:
@@ -22,10 +23,11 @@ class PythonController:
except requests.exceptions.RequestException as e:
print("An error occurred while trying to execute the command:", e)
# example usage
if __name__ == '__main__':
# replace with your actual server URL of the vm
server_url = "http://192.168.7.129:5000"
server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
# example commands

View File

@@ -1,3 +1,4 @@
import os
from enum import Enum
from typing import Literal, List, Tuple
import subprocess
@@ -7,10 +8,13 @@ import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
from PIL import Image
import uuid
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
PythonKeyboardController
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
class Action(Enum):
CLICK = 0
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
def __init__(self, path_to_vm: str, username: str, password: str,
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
def __init__(
self,
path_to_vm: str,
username: str,
password: str = None,
host: str = "192.168.7.128:5000",
snapshot_path: str = "initial_state_with_env_set",
vm_os: VM_TYPE = "ubuntu"):
# The path to the vmx file of your vm
self.path_to_vm = path_to_vm
# username and password for your vm
self.username = username
self.password = password
self.host = host
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
# TODO: get the screen width and height from the vm, or standardize it
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
return mouse_controller, keyboard_controller
def _start_emulator(self):
# fixme: check if the vm is running
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
else:
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
time.sleep(5)
time.sleep(10)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
def _execute_command(self, command: List[str]) -> "str | None":
    """Run a host-side command and return its decoded standard output.

    Args:
        command: Program name followed by its arguments, one list item each.

    Returns:
        The decoded stdout of the command, or ``None`` when the command
        could not be launched or exited with a non-zero status.

    Raises:
        subprocess.TimeoutExpired: If the command runs longer than 60 seconds.
    """
    try:
        # The argument list is handed to the OS directly instead of through
        # a shell: with shell=True on POSIX only the first list item is run
        # as the command and the remaining items never reach it.
        completed = subprocess.run(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60)
    except OSError as e:
        # Without a shell a missing executable raises here; report it the
        # same way as a failed command instead of crashing the caller.
        print(f"Error executing command: {command}")
        print(e)
        return None

    if completed.returncode != 0:
        print(f"Error executing command: {command}")
        error_text = completed.stderr.decode(errors="replace").strip()
        if error_text:
            print(error_text)
        return None
    return completed.stdout.decode(errors="replace")
def _save_state(self):
    """Ask vmrun to take a snapshot of the VM named ``self.snapshot_path``."""
    # "ws" and "snapshot" must be separate list items; without the comma the
    # adjacent string literals are concatenated into the single argument "wssnapshot".
    self._execute_command(["vmrun", "-T", "ws", "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
    """Request a screenshot of the guest via ``vmrun captureScreen``.

    Every call targets a fresh ``tmp/<uuid4>/screenshot.png`` path (relative
    to the current working directory), so earlier screenshots are never
    overwritten.

    Returns:
        The path the screenshot was requested to be written to.
    """
    # todo: hash it and store it in a temporary directory
    screenshot_dir = os.path.join("tmp", str(uuid.uuid4()))
    os.makedirs(screenshot_dir, exist_ok=True)
    image_path = os.path.join(screenshot_dir, "screenshot.png")

    # Pass -gp only when a password is configured: self.password may be None,
    # and None is not a valid command-line argument.
    credentials = ["-gu", self.username]
    if self.password:
        credentials += ["-gp", self.password]

    self._execute_command(
        ["vmrun", "-T", "ws", *credentials, "captureScreen", self.path_to_vm, image_path])
    return image_path
# Build the observation from a fresh screenshot of the VM.
# NOTE(review): this span comes from a diff view with indentation and +/- markers
# stripped, so it carries two competing returns: one yields the decoded image as a
# NumPy array, the other yields the screenshot file path. Which one is live cannot
# be determined from here - confirm against the checked-in file.
def _get_obs(self):
# Path of the image file that _get_screenshot() asked vmrun to write.
screenshot_image_path = self._get_screenshot()
# Array variant: open the image with PIL and convert it to a NumPy array.
with Image.open(screenshot_image_path) as img:
return np.array(img)
# Path variant: return the screenshot location itself.
# NOTE(review): presumably the newer behavior given the commit title - TODO confirm.
return screenshot_image_path
def reset(self):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
time.sleep(5)
print("Starting emulator...")
self._start_emulator()
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)

View File

@@ -11,13 +11,9 @@ def execute_command():
# Execute the command without any safety checks.
try:
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
return jsonify({
'status': 'success',
'output': stdout.decode(),
'error': stderr.decode()
})
except Exception as e:
return jsonify({