diff --git a/README.md b/README.md
index 815c35d..e699a2e 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
## Setup guide
+### For members of the team
1. Download OS image
1. Download kubuntu from
2. Download ubuntu from
@@ -22,7 +23,8 @@
2. `rm -rf ~/screenshot.png`
7. Set up Python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard/)
-
+### For users of the environment
+todo
## Road map (Proposed)
diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 6b5e627..735bd44 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -1,10 +1,11 @@
import requests
import json
+
class PythonController:
def __init__(self, http_server: str):
self.http_server = http_server
-
+
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
@@ -12,7 +13,7 @@ class PythonController:
headers = {
'Content-Type': 'application/json'
}
-
+
try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
if response.status_code == 200:
@@ -22,10 +23,11 @@ class PythonController:
except requests.exceptions.RequestException as e:
print("An error occurred while trying to execute the command:", e)
+
# example usage
if __name__ == '__main__':
# replace with your actual server URL of the vm
- server_url = "http://192.168.7.129:5000"
+ server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
# example commands
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 962d176..d7b1b98 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,3 +1,4 @@
+import os
from enum import Enum
from typing import Literal, List, Tuple, Optional
import subprocess
@@ -7,10 +8,13 @@ import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
-from PIL import Image
+import uuid
+
+from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
+ PythonMouseController
+from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
+ PythonKeyboardController
-from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
-from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
class Action(Enum):
CLICK = 0
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
- def __init__(self, path_to_vm: str, username: str, password: str,
- host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
+ def __init__(
+ self,
+ path_to_vm: str,
+ username: str,
+            password: Optional[str] = None,
+ host: str = "192.168.7.128:5000",
+ snapshot_path: str = "initial_state_with_env_set",
+ vm_os: VM_TYPE = "ubuntu"):
+ # The path to the vmx file of your vm
self.path_to_vm = path_to_vm
+
+ # username and password for your vm
self.username = username
self.password = password
+
self.host = host
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
+ # TODO: get the screen width and height from the vm, or standardize it
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
- self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
+ self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
+ dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
return mouse_controller, keyboard_controller
def _start_emulator(self):
+ # fixme: check if the vm is running
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
else:
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
- time.sleep(5)
+ time.sleep(10)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
def _execute_command(self, command: List[str]) -> None:
- process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
- stdout, stderr = process.communicate()
- if process.returncode != 0:
- print(f"Error executing command: {command}")
- return None
- else:
- return stdout.decode()
+        # pass the argument list without shell=True (a list with shell=True only runs the first element on POSIX)
+        subprocess.run(command, stderr=subprocess.STDOUT, timeout=60)
def _save_state(self):
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
- image_path = "./screenshot.png"
- self._execute_command(
- ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
- image_path])
+ # todo: hash it and store it in a temporary directory
+
+ random_uuid = str(uuid.uuid4())
+ os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
+ image_path = os.path.join("tmp", random_uuid, "screenshot.png")
+
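+        # vmrun's captureScreen writes the guest's screen to this path on the host machine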
+ if self.password:
+ self._execute_command(
+ ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
+ image_path])
+ else:
+ self._execute_command(
+ ["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
+
return image_path
def _get_obs(self):
screenshot_image_path = self._get_screenshot()
- with Image.open(screenshot_image_path) as img:
- return np.array(img)
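+        # fixme: the observation is now the screenshot's file path, not a pixel array matching observation_space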
+ return screenshot_image_path
def reset(self):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
+ time.sleep(5)
print("Starting emulator...")
self._start_emulator()
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
- self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
+ self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)
diff --git a/desktop_env/windows_server/main.py b/desktop_env/windows_server/main.py
index 56b7fc2..467b40e 100644
--- a/desktop_env/windows_server/main.py
+++ b/desktop_env/windows_server/main.py
@@ -11,13 +11,9 @@ def execute_command():
# Execute the command without any safety checks.
try:
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout, stderr = process.communicate()
-
+ subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
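+        # the command's output is captured but not returned; only a success/failure status goes back to the caller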
return jsonify({
'status': 'success',
- 'output': stdout.decode(),
- 'error': stderr.decode()
})
except Exception as e:
return jsonify({
diff --git a/gpt_4v_agent_exp.py b/gpt_4v_agent_exp.py
new file mode 100644
index 0000000..fe78970
--- /dev/null
+++ b/gpt_4v_agent_exp.py
@@ -0,0 +1,47 @@
+import os
+from pprint import pprint
+from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
+from mm_agents.gpt_4v_agent import GPT4v_Agent
+
+
+def gpt_4v_agent():
+ api_key = os.environ.get("OPENAI_API_KEY")
+ agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")
+ env = DesktopEnv(
+ path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # automitically load the snapshot and start the vm
+ # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
+ username="tianbaox",
+ password="951753",
+ # host="192.168.7.128",
+ host="http://192.168.13.128:5000",
+ vm_os="windows"
+ )
+
+    # reset the environment to a certain snapshot
+ observation = env.reset()
+ done = False
+
+ while not done:
+        # todo: the action format needs to be redesigned to support multiple actions in one step
+ action = agent.predict(obs=observation)
+ print("Action:", action)
+
+ # fixme: step not working
+ observation, reward, done, info = env.step(action)
+ print("Observation:", observation)
+ print("Reward:", reward)
+ print("Info:", info)
+
+ print("================================\n")
+
+ if done:
+ print("The episode is done.")
+ break
+
+ env.close()
+ print("Environment closed.")
+
+
+if __name__ == "__main__":
+ gpt_4v_agent()
diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py
index 663bf1e..fdbf4c7 100644
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,9 +1,10 @@
import os
+import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
-import json5
+import json
import requests
-
+from mm_agents.gpt_4v_prompt import SYS_PROMPT
# Function to encode the image
def encode_image(image_path):
@@ -11,6 +12,32 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode('utf-8')
+def parse_action_from_string(input_string):
+    # Look for a fenced ```json block first, then any fenced block, then fall back to the raw string
+    candidates = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
+    candidates += re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
+    candidates.append(input_string)
+
+    for candidate in candidates:
+        try:
+            # Return the first candidate that parses as JSON (the action dict)
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+
+    raise ValueError("Invalid response format: " + input_string)
+
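+# Example (hypothetical model reply): parse_action_from_string('```json\n{"action_type": "CLICK", "click_type": "LEFT"}\n```')
+# is expected to return {"action_type": "CLICK", "click_type": "LEFT"}; a bare JSON object without
+# code fences also parses, and a reply with no valid JSON raises ValueError.
+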
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -22,18 +49,13 @@ class GPT4v_Agent:
"Authorization": f"Bearer {api_key}"
}
- # load prompt from file
- self.prompt = ""
- with open("gpt_4v_prompt.txt", "r") as f:
- self.prompt = f.read()
-
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
- "text": self.prompt
+ "text": SYS_PROMPT
},
]
}
@@ -79,12 +101,7 @@ class GPT4v_Agent:
"""
# parse from the response
- if response.startswith("```json"):
- action = json5.loads(response[7:-3])
- elif response.startswith("```"):
- action = json5.loads(response[3:-3])
- else:
- action = json5.loads(response)
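+        # parse_action_from_string raises ValueError when the reply contains no parseable JSON action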
+ action = parse_action_from_string(response)
# add action into the trajectory
self.trajectory.append({
diff --git a/mm_agents/gpt_4v_prompt.txt b/mm_agents/gpt_4v_prompt.py
similarity index 76%
rename from mm_agents/gpt_4v_prompt.txt
rename to mm_agents/gpt_4v_prompt.py
index 5fe9c7c..bfe5430 100644
--- a/mm_agents/gpt_4v_prompt.txt
+++ b/mm_agents/gpt_4v_prompt.py
@@ -1,3 +1,4 @@
+SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen, and you will predict the action to take on the computer based on the image.
Here is the description of the action space:
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
- **TYPE**: type a string on the keyboard
Then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
+- For MOUSE_MOVE, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
for example, format as:
```
{
@@ -30,7 +31,7 @@ for example, format as:
"click_type": "LEFT"
}
```
-- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
+- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose one or more keys from the keyboard
for example, format as:
```
{
@@ -49,4 +50,6 @@ for example, format as:
}
```
-For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
\ No newline at end of file
+For every step, you should only return the action_type and the parameters of your action as a dict, and nothing else. You MUST wrap the dict in backticks (`).
+You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will get a penalty.
+"""
\ No newline at end of file
diff --git a/screenshot.png b/screenshot.png
index 0ea0c0f..a0b20d0 100644
Binary files a/screenshot.png and b/screenshot.png differ