# sci-gui-agent-benchmark/gpt_4v_agent_exp.py

import json
import os
import shutil
import uuid
from pprint import pprint

from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent


def gpt_4v_agent():
    """Run one GPT-4V agent episode in the desktop VM and log every step.

    The OpenAI key is read from the ``OPENAI_API_KEY`` environment variable.
    The agent and the ``DesktopEnv`` are built from ``meta_info``, then the
    function loops ``agent.predict`` -> ``env.step`` until the environment
    reports ``done``.

    A fresh ``observations/<uuid4>/`` directory receives:

    * ``meta_info.json`` -- the task description, written as JSON;
    * ``obs_<i>.png`` -- a copy of the observation file the agent saw at
      step ``i``;
    * ``actions_<i>.json`` -- ``str()`` of the actions predicted at step ``i``.

    The environment is always closed, even if the episode aborts with an
    exception part-way through.
    """
    api_key = os.environ.get("OPENAI_API_KEY")

    meta_info = {
        "instruction": "Open WSJ website to get latest news",
        "task_name": "open_wsj",
        "snapshot_path": "base",
    }

    # Alternative task, kept for quick switching:
    # meta_info = {
    #     "instruction": "Clear the recycle bin",
    #     "task_name": "clean_recycle_bin",
    #     "snapshot_path": "base",
    # }

    agent = GPT4v_Agent(api_key=api_key, instruction=meta_info["instruction"])
    env = DesktopEnv(
        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
        # automatically load the snapshot and start the vm
        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        # Take the snapshot name from meta_info so the logged metadata and the
        # environment cannot drift apart.
        snapshot_path=meta_info["snapshot_path"],
        #  host="192.168.7.128",
        host="http://192.168.13.128:5000",
    )

    # Everything after the environment exists runs under try/finally so the
    # environment is released even when prediction, stepping or logging fails.
    try:
        # reset the environment to certain snapshot
        observation = env.reset()
        done = False
        time_idx = 0

        # create a file_dir for this agent
        file_dir = os.path.join("observations", str(uuid.uuid4()))
        os.makedirs(file_dir, exist_ok=True)

        # save the meta_info as real JSON (str(dict) is a Python repr, not JSON)
        meta_path = os.path.join(file_dir, "meta_info.json")
        with open(meta_path, "w", encoding="utf-8") as f:
            json.dump(meta_info, f)
            f.write("\n")

        while not done:
            actions = agent.predict(obs=observation)
            print("Actions:", actions)

            # `observation` is used as a path to an image file; keep a copy of
            # what the agent saw at this step.
            shutil.copyfile(
                observation,
                os.path.join(file_dir, "obs_{}.png".format(time_idx)),
            )

            # save the actions
            # NOTE(review): written via str() rather than json.dump because the
            # element types of `actions` are not visible here and may not be
            # JSON-serialisable -- confirm before switching to strict JSON.
            actions_path = os.path.join(file_dir, "actions_{}.json".format(time_idx))
            with open(actions_path, "w", encoding="utf-8") as f:
                f.write(str(actions))
                f.write("\n")

            time_idx += 1
            observation, reward, done, info = env.step(actions)

            print("Observation:", observation)
            print("Reward:", reward)
            print("Info:", info)

            print("================================\n")

        # The loop only exits normally once `done` is true.
        print("The episode is done.")
    finally:
        env.close()
        print("Environment closed.")


# Run a single agent episode only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    gpt_4v_agent()