Fix the width and height of the VM so the agent performs more accurately

2023-11-30 12:10:41 +08:00
parent ecb62d7eb4
commit e52ba2ab13
4 changed files with 134 additions and 62 deletions
--- a/gpt_4v_agent_exp.py
+++ b/gpt_4v_agent_exp.py
@@ -2,14 +2,30 @@ import os
 from pprint import pprint
 from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
 from mm_agents.gpt_4v_agent import GPT4v_Agent
+import uuid


 def gpt_4v_agent():
    api_key = os.environ.get("OPENAI_API_KEY")
-    agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")
+
+    # meta_info = {
+    #     "instruction": "Open WSJ website to get latest news",
+    #     "task_name": "open_wsj",
+    #     "snapshot_path": "base",
+    # }
+
+    meta_info = {
+        "instruction": "Clear the recycle bin",
+        "task_name": "clean_recycle_bin",
+        "snapshot_path": "base",
+    }
+
+    agent = GPT4v_Agent(api_key=api_key, instruction=meta_info["instruction"])
    env = DesktopEnv(
-        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""", # automitically load the snapshot and start the vm
+        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
+        # automatically load the snapshot and start the vm
        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
+        snapshot_path="base",
        username="tianbaox",
        password="951753",
        #  host="192.168.7.128",
@@ -20,15 +36,34 @@ def gpt_4v_agent():
    # reset the environment to certain snapshot
    observation = env.reset()
    done = False
+    time_idx = 0
+
+    # create a file_dir for this agent
+    file_dir = os.path.join("observations", str(uuid.uuid4()))
+    os.makedirs(file_dir, exist_ok=True)
+
+    # save the meta_info
+    with open(os.path.join(file_dir, "meta_info.json"), "w") as f:
+        f.write(str(meta_info))
+        f.write("\n")

    while not done:
-        # todo: action needs to be redesigned, need to support multiple actions at one step
-        action = agent.predict(obs=observation)
-        print("Action:", action)
+        actions = agent.predict(obs=observation)
+        print("Actions:", actions)

+        with open(os.path.join(file_dir, "obs_{}.png".format(time_idx)), "wb") as f:
+            # copy the image in the path of observation to the file
+            with open(observation, "rb") as image_file:
+                f.write(image_file.read())
+
+        # save the actions
+        with open(os.path.join(file_dir, "actions_{}.json".format(time_idx)), "w") as f:
+            f.write(str(actions))
+            f.write("\n")
+
+        time_idx += 1
+        observation, reward, done, info = env.step(actions)

-        # fixme: step not working
-        observation, reward, done, info = env.step(action)
        print("Observation:", observation)
        print("Reward:", reward)
        print("Info:", info)