"""Run a GPT-4V agent against a DesktopEnv VM and record every step to disk."""
import json
import os
import shutil
import uuid
from pprint import pprint

from desktop_env.envs.desktop_env import DesktopEnv, Action, MouseClick
from mm_agents.gpt_4v_agent import GPT4v_Agent


def _to_json_text(payload):
    """Return *payload* serialized as JSON text.

    Values that are not JSON-native are stringified via ``default=str``.
    If the payload still cannot be encoded (e.g. unsupported dict keys),
    fall back to ``str(payload)`` so that logging never aborts the episode.
    """
    try:
        return json.dumps(payload, default=str)
    except (TypeError, ValueError):
        return str(payload)


def gpt_4v_agent():
    """Run one episode of the GPT-4V agent and log observations and actions.

    The API key is read from the ``OPENAI_API_KEY`` environment variable.
    A fresh ``observations/<uuid4>/`` directory is created, into which the
    task metadata (``meta_info.json``), a copy of each observation image
    (``obs_<i>.png``) and the predicted actions (``actions_<i>.json``) are
    written.  The environment is always closed, even if a step raises.
    """
    api_key = os.environ.get("OPENAI_API_KEY")

    # Alternative task, kept for reference:
    # meta_info = {
    #     "instruction": "Open WSJ website to get latest news",
    #     "task_name": "open_wsj",
    #     "snapshot_path": "base",
    # }

    meta_info = {
        "instruction": "Clear the recycle bin",
        "task_name": "clean_recycle_bin",
        "snapshot_path": "base",
    }

    agent = GPT4v_Agent(api_key=api_key, instruction=meta_info["instruction"])

    # NOTE(review): machine-specific paths and credentials are hard-coded
    # below; consider moving them to configuration or environment variables.
    env = DesktopEnv(
        # automatically load the snapshot and start the vm
        path_to_vm=r"""C:\Users\tianbaox\Documents\Virtual Machines\Win10\Win10.vmx""",
        # path_to_vm="/home/yuri/vmware/Ubuntu 64-bit/Ubuntu 64-bit.vmx",
        # Use the snapshot named in meta_info so the two cannot diverge.
        snapshot_path=meta_info["snapshot_path"],
        username="tianbaox",
        password="951753",
        # host="192.168.7.128",
        host="http://192.168.13.128:5000",
        vm_os="windows",
    )

    try:
        # reset the environment to certain snapshot
        observation = env.reset()
        done = False
        time_idx = 0

        # create a file_dir for this agent
        file_dir = os.path.join("observations", str(uuid.uuid4()))
        os.makedirs(file_dir, exist_ok=True)

        # save the meta_info as real JSON (the file carries a .json extension)
        meta_path = os.path.join(file_dir, "meta_info.json")
        with open(meta_path, "w", encoding="utf-8") as f:
            f.write(json.dumps(meta_info))
            f.write("\n")

        while not done:
            actions = agent.predict(obs=observation)
            print("Actions:", actions)

            # The original code treats the observation as a path to an image
            # file; copy that file next to the other records of this step.
            obs_path = os.path.join(file_dir, "obs_{}.png".format(time_idx))
            shutil.copyfile(observation, obs_path)

            # save the actions
            actions_path = os.path.join(
                file_dir, "actions_{}.json".format(time_idx)
            )
            with open(actions_path, "w", encoding="utf-8") as f:
                f.write(_to_json_text(actions))
                f.write("\n")

            time_idx += 1
            observation, reward, done, info = env.step(actions)
            print("Observation:", observation)
            print("Reward:", reward)
            print("Info:", info)

            print("================================\n")

            if done:
                print("The episode is done.")
                break
    finally:
        # Release the environment even when prediction, stepping or logging
        # fails part-way through the episode.
        env.close()
        print("Environment closed.")


if __name__ == "__main__":
    gpt_4v_agent()