Run through gpt_4v agent pipeline
This commit is contained in:
@@ -1,10 +1,11 @@
|
||||
import requests
|
||||
import json
|
||||
|
||||
|
||||
class PythonController:
|
||||
def __init__(self, http_server: str):
|
||||
self.http_server = http_server
|
||||
|
||||
|
||||
def _execute_python_command(self, command: str) -> None:
|
||||
payload = json.dumps({
|
||||
"command": command
|
||||
@@ -12,7 +13,7 @@ class PythonController:
|
||||
headers = {
|
||||
'Content-Type': 'application/json'
|
||||
}
|
||||
|
||||
|
||||
try:
|
||||
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
|
||||
if response.status_code == 200:
|
||||
@@ -22,10 +23,11 @@ class PythonController:
|
||||
except requests.exceptions.RequestException as e:
|
||||
print("An error occurred while trying to execute the command:", e)
|
||||
|
||||
|
||||
# example usage
|
||||
if __name__ == '__main__':
|
||||
# Replace with the actual URL of the server on your VM.
|
||||
server_url = "http://192.168.7.129:5000"
|
||||
server_url = "http://192.168.7.129:5000"
|
||||
controller = PythonController(server_url)
|
||||
|
||||
# example commands
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
import os
|
||||
from enum import Enum
|
||||
from typing import Literal, List, Tuple
|
||||
import subprocess
|
||||
@@ -7,10 +8,13 @@ import time
|
||||
import gymnasium as gym
|
||||
from gymnasium import spaces
|
||||
import numpy as np
|
||||
from PIL import Image
|
||||
import uuid
|
||||
|
||||
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
|
||||
PythonMouseController
|
||||
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
|
||||
PythonKeyboardController
|
||||
|
||||
from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
|
||||
from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
|
||||
|
||||
class Action(Enum):
|
||||
CLICK = 0
|
||||
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
|
||||
class DesktopEnv(gym.Env):
|
||||
"""DesktopEnv with OpenAI Gym interface."""
|
||||
|
||||
def __init__(self, path_to_vm: str, username: str, password: str,
|
||||
host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
|
||||
def __init__(
|
||||
self,
|
||||
path_to_vm: str,
|
||||
username: str,
|
||||
password: str = None,
|
||||
host: str = "192.168.7.128:5000",
|
||||
snapshot_path: str = "initial_state_with_env_set",
|
||||
vm_os: VM_TYPE = "ubuntu"):
|
||||
# The path to the vmx file of your vm
|
||||
self.path_to_vm = path_to_vm
|
||||
|
||||
# username and password for your vm
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
self.host = host
|
||||
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
|
||||
|
||||
# TODO: get the screen width and height from the vm, or standardize it
|
||||
self.screen_width = 800
|
||||
self.screen_height = 800
|
||||
# Define the action and observation space
|
||||
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
|
||||
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
|
||||
})
|
||||
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
|
||||
self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
|
||||
dtype=np.uint8)
|
||||
|
||||
# Additional setup
|
||||
self.metadata = {'render.modes': ['rgb_array']}
|
||||
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
|
||||
return mouse_controller, keyboard_controller
|
||||
|
||||
def _start_emulator(self):
|
||||
# fixme: check if the vm is running
|
||||
while True:
|
||||
try:
|
||||
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
|
||||
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
|
||||
else:
|
||||
print("Starting VM...")
|
||||
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
|
||||
time.sleep(5)
|
||||
time.sleep(10)
|
||||
except subprocess.CalledProcessError as e:
|
||||
print(f"Error executing command: {e.output.decode().strip()}")
|
||||
|
||||
def _execute_command(self, command: List[str]) -> None:
|
||||
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
|
||||
stdout, stderr = process.communicate()
|
||||
if process.returncode != 0:
|
||||
print(f"Error executing command: {command}")
|
||||
return None
|
||||
else:
|
||||
return stdout.decode()
|
||||
subprocess.run(command, shell=True, stderr=subprocess.STDOUT, timeout=60)
|
||||
|
||||
def _save_state(self):
|
||||
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
|
||||
|
||||
def _get_screenshot(self):
|
||||
image_path = "./screenshot.png"
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
|
||||
image_path])
|
||||
# todo: hash it and store it in a temporary directory
|
||||
|
||||
random_uuid = str(uuid.uuid4())
|
||||
os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
|
||||
image_path = os.path.join("tmp", random_uuid, "screenshot.png")
|
||||
|
||||
if self.password:
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
|
||||
image_path])
|
||||
else:
|
||||
self._execute_command(
|
||||
["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
|
||||
|
||||
return image_path
|
||||
|
||||
def _get_obs(self):
|
||||
screenshot_image_path = self._get_screenshot()
|
||||
with Image.open(screenshot_image_path) as img:
|
||||
return np.array(img)
|
||||
return screenshot_image_path
|
||||
|
||||
def reset(self):
|
||||
print("Resetting environment...")
|
||||
|
||||
print("Reverting to snapshot to {}...".format(self.snapshot_path))
|
||||
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
|
||||
time.sleep(5)
|
||||
|
||||
print("Starting emulator...")
|
||||
self._start_emulator()
|
||||
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
|
||||
elif click == MouseClick.WHEEL_DOWN:
|
||||
self.mouse_controller.scroll_down()
|
||||
elif action_type == Action.MOUSE_MOVE:
|
||||
self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
|
||||
self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
|
||||
elif action_type == Action.KEY:
|
||||
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
|
||||
self.keyboard_controller.key(key_sequence)
|
||||
|
||||
@@ -11,13 +11,9 @@ def execute_command():
|
||||
|
||||
# Execute the command without any safety checks.
|
||||
try:
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
stdout, stderr = process.communicate()
|
||||
|
||||
subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
return jsonify({
|
||||
'status': 'success',
|
||||
'output': stdout.decode(),
|
||||
'error': stderr.decode()
|
||||
})
|
||||
except Exception as e:
|
||||
return jsonify({
|
||||
|
||||
Reference in New Issue
Block a user