# sci-gui-agent-benchmark/gpt_4v_agent_exp.py

import os
from pprint import pprint
from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent


def gpt_4v_agent():
    """Run one GPT-4V agent episode against the desktop environment.

    Builds a ``GPT4v_Agent`` with the instruction "Clear the recycle bin." and
    a ``DesktopEnv`` pointing at a local Windows VM, resets the environment,
    then repeatedly asks the agent for an action and applies it with
    ``env.step`` until the environment reports ``done``. Each step's action,
    observation, reward and info are printed.

    The environment is always closed on exit, including when ``reset``,
    ``predict`` or ``step`` raises; the exception is then propagated to the
    caller.
    """
    # The key is read from the environment; it is None when OPENAI_API_KEY is unset.
    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")

    # NOTE(review): VM path, credentials and host are hard-coded for one
    # specific machine — consider moving them to configuration/env variables.
    env = DesktopEnv(
        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",  # automatically load the snapshot and start the vm
        #  path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        username="tianbaox",
        password="951753",
        #  host="192.168.7.128",
        host="http://192.168.13.128:5000",
        vm_os="windows"
    )

    # try/finally guarantees env.close() runs even if a step fails part-way
    # through the episode, so the environment is not left open.
    try:
        # reset the environment to certain snapshot
        observation = env.reset()
        done = False

        while not done:
            # todo: action needs to be redesigned, need to support multiple actions at one step
            action = agent.predict(obs=observation)
            print("Action:", action)

            # fixme: step not working
            observation, reward, done, info = env.step(action)
            print("Observation:", observation)
            print("Reward:", reward)
            print("Info:", info)

            print("================================\n")

        # The loop only exits normally once `done` is true.
        print("The episode is done.")
    finally:
        env.close()
        print("Environment closed.")


# Run the experiment only when this file is executed as a script, not on import.
if __name__ == "__main__":
    gpt_4v_agent()