"""Run a GPT-4V agent against a desktop VM environment for a single episode."""

import os
from pprint import pprint

from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent


def gpt_4v_agent():
    """Drive one episode of the "Clear the recycle bin." task with GPT4v_Agent.

    The OpenAI API key is read from the ``OPENAI_API_KEY`` environment
    variable and handed to the agent as-is (``None`` when the variable is
    unset). A ``DesktopEnv`` is created for the configured VM, reset, and then
    stepped with the agent's predicted actions until the environment reports
    ``done``. Each step's action, observation, reward and info are printed.

    The environment is always closed on exit, even when resetting, predicting
    or stepping raises; the exception itself still propagates to the caller.
    """
    api_key = os.environ.get("OPENAI_API_KEY")
    agent = GPT4v_Agent(api_key=api_key, instruction="Clear the recycle bin.")

    # NOTE(review): the VM path, credentials and host below are hard-coded for
    # one developer machine; consider moving them to configuration.
    env = DesktopEnv(
        # automatically load the snapshot and start the vm
        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
        # Alternative (Ubuntu VM):
        # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        username="tianbaox",
        password="951753",
        # Alternative host:
        # host="192.168.7.128",
        host="http://192.168.13.128:5000",
        vm_os="windows",
    )

    try:
        # reset the environment to certain snapshot
        observation = env.reset()
        done = False

        while not done:
            # todo: action needs to be redesigned, need to support multiple
            # actions at one step
            action = agent.predict(obs=observation)
            print("Action:", action)

            # fixme: step not working
            observation, reward, done, info = env.step(action)
            print("Observation:", observation)
            print("Reward:", reward)
            print("Info:", info)

            print("================================\n")

            # The loop condition ends the episode; this only reports it.
            if done:
                print("The episode is done.")
    finally:
        # Close the environment even if the episode aborted with an error.
        env.close()
        print("Environment closed.")


if __name__ == "__main__":
    gpt_4v_agent()