diff --git a/README.md b/README.md
index 815c35d..e699a2e 100644
--- a/README.md
+++ b/README.md
@@ -2,6 +2,7 @@
## Setup guide
+### For members of the team
1. Download OS image
1. Download kubuntu from
2. Download ubuntu from
@@ -22,7 +23,8 @@
2. `rm -rf ~/screenshot.png`
7. Set up Python and install [mouse](https://github.com/boppreh/mouse/) and [keyboard](https://github.com/boppreh/keyboard/)
-
+### For users of the environment
+todo
## Road map (Proposed)
diff --git a/desktop_env/controllers/python.py b/desktop_env/controllers/python.py
index 6b5e627..735bd44 100644
--- a/desktop_env/controllers/python.py
+++ b/desktop_env/controllers/python.py
@@ -1,10 +1,11 @@
import requests
import json
+
class PythonController:
def __init__(self, http_server: str):
self.http_server = http_server
-
+
def _execute_python_command(self, command: str) -> None:
payload = json.dumps({
"command": command
@@ -12,7 +13,7 @@ class PythonController:
headers = {
'Content-Type': 'application/json'
}
-
+
try:
response = requests.post(self.http_server + "/execute", headers=headers, data=payload)
if response.status_code == 200:
@@ -22,10 +23,11 @@ class PythonController:
except requests.exceptions.RequestException as e:
print("An error occurred while trying to execute the command:", e)
+
# example usage
if __name__ == '__main__':
# replace with your actual server URL of the vm
- server_url = "http://192.168.7.129:5000"
+ server_url = "http://192.168.7.129:5000"
controller = PythonController(server_url)
# example commands
diff --git a/desktop_env/envs/desktop_env.py b/desktop_env/envs/desktop_env.py
index 962d176..d7b1b98 100644
--- a/desktop_env/envs/desktop_env.py
+++ b/desktop_env/envs/desktop_env.py
@@ -1,3 +1,4 @@
+import os
from enum import Enum
from typing import Literal, List, Tuple, Optional
import subprocess
@@ -7,10 +8,13 @@ import time
import gymnasium as gym
from gymnasium import spaces
import numpy as np
-from PIL import Image
+import uuid
+
+from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, \
+ PythonMouseController
+from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, \
+ PythonKeyboardController
-from desktop_env.controllers.mouse import MouseClick, AbstractMouseController, XDoToolMouseController, PythonMouseController
-from desktop_env.controllers.keyboard import AbstractKeyboardController, XDoToolKeyboardController, PythonKeyboardController
class Action(Enum):
CLICK = 0
@@ -29,14 +33,25 @@ VM_TYPE = Literal['ubuntu', 'windows']
class DesktopEnv(gym.Env):
"""DesktopEnv with OpenAI Gym interface."""
- def __init__(self, path_to_vm: str, username: str, password: str,
- host: str, snapshot_path: str = "some_point_browser", vm_os: VM_TYPE = "ubuntu"):
+ def __init__(
+ self,
+ path_to_vm: str,
+ username: str,
+            password: Optional[str] = None,
+ host: str = "192.168.7.128:5000",
+ snapshot_path: str = "initial_state_with_env_set",
+ vm_os: VM_TYPE = "ubuntu"):
+ # The path to the vmx file of your vm
self.path_to_vm = path_to_vm
+
+ # username and password for your vm
self.username = username
self.password = password
+
self.host = host
self.snapshot_path = snapshot_path # todo: handling the logic of snapshot directory
+ # TODO: get the screen width and height from the vm, or standardize it
self.screen_width = 800
self.screen_height = 800
# Define the action and observation space
@@ -49,7 +64,8 @@ class DesktopEnv(gym.Env):
"text": spaces.MultiDiscrete([128] * 10) # max 10 characters, ASCII
})
- self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3), dtype=np.uint8)
+ self.observation_space = spaces.Box(low=0, high=255, shape=(self.screen_width, self.screen_height, 3),
+ dtype=np.uint8)
# Additional setup
self.metadata = {'render.modes': ['rgb_array']}
@@ -75,6 +91,7 @@ class DesktopEnv(gym.Env):
return mouse_controller, keyboard_controller
def _start_emulator(self):
+ # fixme: check if the vm is running
while True:
try:
output = subprocess.check_output(f"vmrun -T ws list", shell=True, stderr=subprocess.STDOUT)
@@ -85,39 +102,43 @@ class DesktopEnv(gym.Env):
else:
print("Starting VM...")
self._execute_command(["vmrun", "-T", "ws", "start", self.path_to_vm])
- time.sleep(5)
+ time.sleep(10)
except subprocess.CalledProcessError as e:
print(f"Error executing command: {e.output.decode().strip()}")
def _execute_command(self, command: List[str]) -> None:
- process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
- stdout, stderr = process.communicate()
- if process.returncode != 0:
- print(f"Error executing command: {command}")
- return None
- else:
- return stdout.decode()
+        # pass the argument list without shell=True (a list with shell=True only runs the first element on POSIX)
+        subprocess.run(command, stderr=subprocess.STDOUT, timeout=60)
def _save_state(self):
self._execute_command(["vmrun", "-T", "ws" "snapshot", self.path_to_vm, self.snapshot_path])
def _get_screenshot(self):
- image_path = "./screenshot.png"
- self._execute_command(
- ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
- image_path])
+ # todo: hash it and store it in a temporary directory
+
+ random_uuid = str(uuid.uuid4())
+ os.makedirs(os.path.join("tmp", random_uuid), exist_ok=True)
+ image_path = os.path.join("tmp", random_uuid, "screenshot.png")
+
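+        # vmrun's captureScreen writes the guest's screen to this path on the host machine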
+ if self.password:
+ self._execute_command(
+ ["vmrun", "-T", "ws", "-gu", self.username, "-gp", self.password, "captureScreen", self.path_to_vm,
+ image_path])
+ else:
+ self._execute_command(
+ ["vmrun", "-T", "ws", "-gu", self.username, "captureScreen", self.path_to_vm, image_path])
+
return image_path
def _get_obs(self):
screenshot_image_path = self._get_screenshot()
- with Image.open(screenshot_image_path) as img:
- return np.array(img)
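+        # fixme: the observation is now the screenshot's file path, not a pixel array matching observation_space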
+ return screenshot_image_path
def reset(self):
print("Resetting environment...")
print("Reverting to snapshot to {}...".format(self.snapshot_path))
self._execute_command(["vmrun", "-T", "ws", "revertToSnapshot", self.path_to_vm, self.snapshot_path])
+ time.sleep(5)
print("Starting emulator...")
self._start_emulator()
@@ -165,7 +186,7 @@ class DesktopEnv(gym.Env):
elif click == MouseClick.WHEEL_DOWN:
self.mouse_controller.scroll_down()
elif action_type == Action.MOUSE_MOVE:
- self.mouse_controller.mouse_move(x = action['x'], y = action['y'])
+ self.mouse_controller.mouse_move(x=action['x'], y=action['y'])
elif action_type == Action.KEY:
key_sequence = ''.join(map(chr, action['key'])) # Convert integer array to string
self.keyboard_controller.key(key_sequence)
diff --git a/desktop_env/windows_server/main.py b/desktop_env/windows_server/main.py
index 56b7fc2..467b40e 100644
--- a/desktop_env/windows_server/main.py
+++ b/desktop_env/windows_server/main.py
@@ -11,13 +11,9 @@ def execute_command():
# Execute the command without any safety checks.
try:
- process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
- stdout, stderr = process.communicate()
-
+ subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
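+        # the command's output is captured but not returned; only a success/failure status goes back to the caller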
return jsonify({
'status': 'success',
- 'output': stdout.decode(),
- 'error': stderr.decode()
})
except Exception as e:
return jsonify({
diff --git a/gpt_4v_agent_exp.py b/gpt_4v_agent_exp.py
new file mode 100644
index 0000000..fe78970
--- /dev/null
+++ b/gpt_4v_agent_exp.py
@@ -0,0 +1,47 @@
+import os
+from pprint import pprint
+from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
+from mm_agents.gpt_4v_agent import GPT4v_Agent
+
+
+def gpt_4v_agent():
+ api_key = os.environ.get("OPENAI_API_KEY")
+ agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")
+ env = DesktopEnv(
+ path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # automitically load the snapshot and start the vm
+ # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
+ username="tianbaox",
+ password="951753",
+ # host="192.168.7.128",
+ host="http://192.168.13.128:5000",
+ vm_os="windows"
+ )
+
+    # reset the environment to a certain snapshot
+ observation = env.reset()
+ done = False
+
+ while not done:
+        # todo: the action format needs to be redesigned to support multiple actions in one step
+ action = agent.predict(obs=observation)
+ print("Action:", action)
+
+ # fixme: step not working
+ observation, reward, done, info = env.step(action)
+ print("Observation:", observation)
+ print("Reward:", reward)
+ print("Info:", info)
+
+ print("================================\n")
+
+ if done:
+ print("The episode is done.")
+ break
+
+ env.close()
+ print("Environment closed.")
+
+
+if __name__ == "__main__":
+ gpt_4v_agent()
diff --git a/mm_agents/gpt_4v_agent.py b/mm_agents/gpt_4v_agent.py
index 663bf1e..fdbf4c7 100644
--- a/mm_agents/gpt_4v_agent.py
+++ b/mm_agents/gpt_4v_agent.py
@@ -1,9 +1,10 @@
import os
+import re
import base64
from desktop_env.envs.desktop_env import Action, MouseClick
-import json5
+import json
import requests
-
+from mm_agents.gpt_4v_prompt import SYS_PROMPT
# Function to encode the image
def encode_image(image_path):
@@ -11,6 +12,32 @@ def encode_image(image_path):
return base64.b64encode(image_file.read()).decode('utf-8')
+def parse_action_from_string(input_string):
+    # Look for a fenced ```json block first, then any fenced block, then fall back to the raw string
+    candidates = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
+    candidates += re.findall(r'```\s+(.*?)\s+```', input_string, re.DOTALL)
+    candidates.append(input_string)
+
+    for candidate in candidates:
+        try:
+            # Return the first candidate that parses as JSON (the action dict)
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            continue
+
+    raise ValueError("Invalid response format: " + input_string)
+
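+# Example (hypothetical model reply): parse_action_from_string('```json\n{"action_type": "CLICK", "click_type": "LEFT"}\n```')
+# is expected to return {"action_type": "CLICK", "click_type": "LEFT"}; a bare JSON object without
+# code fences also parses, and a reply with no valid JSON raises ValueError.
+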
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -22,18 +49,13 @@ class GPT4v_Agent:
"Authorization": f"Bearer {api_key}"
}
- # load prompt from file
- self.prompt = ""
- with open("gpt_4v_prompt.txt", "r") as f:
- self.prompt = f.read()
-
self.trajectory = [
{
"role": "system",
"content": [
{
"type": "text",
- "text": self.prompt
+ "text": SYS_PROMPT
},
]
}
@@ -79,12 +101,7 @@ class GPT4v_Agent:
"""
# parse from the response
- if response.startswith("```json"):
- action = json5.loads(response[7:-3])
- elif response.startswith("```"):
- action = json5.loads(response[3:-3])
- else:
- action = json5.loads(response)
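+        # parse_action_from_string raises ValueError when the reply contains no parseable JSON action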
+ action = parse_action_from_string(response)
# add action into the trajectory
self.trajectory.append({
diff --git a/mm_agents/gpt_4v_prompt.txt b/mm_agents/gpt_4v_prompt.py
similarity index 76%
rename from mm_agents/gpt_4v_prompt.txt
rename to mm_agents/gpt_4v_prompt.py
index 5fe9c7c..bfe5430 100644
--- a/mm_agents/gpt_4v_prompt.txt
+++ b/mm_agents/gpt_4v_prompt.py
@@ -1,3 +1,4 @@
+SYS_PROMPT = """
You will act as an agent that follows my instructions and performs desktop computer tasks as instructed. You must have good knowledge of computers and a good internet connection.
For each step, you will get an observation of an image, which is the screenshot of the computer screen, and you will predict the action to take on the computer based on the image.
Here is the description of the action space:
@@ -13,7 +14,7 @@ Firstly you need to predict the class of your action, select from one below:
- **TYPE**: type a string on the keyboard
Then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor
+- For MOUSE_MOVE, you need to predict the x and y coordinates of the mouse cursor; the top-left corner of the screen is (0, 0) and the bottom-right corner is (1920, 1080)
for example, format as:
```
{
@@ -30,7 +31,7 @@ for example, format as:
"click_type": "LEFT"
}
```
-- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard, select from [A-Z, 0-9, F1-F12, ESC, TAB, ENTER, SPACE, BACKSPACE, SHIFT, CTRL, ALT, UP, DOWN, LEFT, RIGHT, CAPSLOCK, NUMLOCK, SCROLLLOCK, INSERT, DELETE, HOME, END, PAGEUP, PAGEDOWN]:
+- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose one or more keys from the keyboard
for example, format as:
```
{
@@ -49,4 +50,6 @@ for example, format as:
}
```
-For every setup, you should only return the action_type and the parameters of your action as a dict, without any other things.
\ No newline at end of file
+For every step, you should only return the action_type and the parameters of your action as a dict, and nothing else. You MUST wrap the dict in backticks (`).
+You MUST choose, and ONLY choose, from the action space above; otherwise your action will be considered invalid and you will get a penalty.
+"""
\ No newline at end of file
diff --git a/screenshot.png b/screenshot.png
index 0ea0c0f..a0b20d0 100644
Binary files a/screenshot.png and b/screenshot.png differ