From 48a86d36cf84556933452dced412dc4b08261d65 Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 12:15:21 +0800 Subject: [PATCH] Minor updates --- README.md | 9 +- experiment_pure_text.py | 131 +++++++++++++++++ mm_agents/gpt_4_prompt_action.py | 244 +++++++++++++++++++++++++++++++ mm_agents/gpt_4_prompt_code.py | 18 +++ 4 files changed, 401 insertions(+), 1 deletion(-) create mode 100644 experiment_pure_text.py create mode 100644 mm_agents/gpt_4_prompt_action.py create mode 100644 mm_agents/gpt_4_prompt_code.py diff --git a/README.md b/README.md index b7d56df..5fd8aa6 100644 --- a/README.md +++ b/README.md @@ -23,4 +23,11 @@ todo - [x] Error handling during file passing and file opening, etc. - [x] Add accessibility tree from the OS into the observation space - [ ] Add pre-process and post-process action support for benchmarking setup and evaluation -- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient \ No newline at end of file +- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient + +## Road map of benchmark, tools and resources (Proposed) +- [ ] Improve the annotation tool base on DuckTrack, make it more robust which align on accessibility tree +- [ ] Annotate the steps of doing the task +- [ ] Build a website for the project +- [ ] Crawl all resources we explored from the internet, and make it easy to access +- [ ] Set up ways for community to contribute new examples diff --git a/experiment_pure_text.py b/experiment_pure_text.py new file mode 100644 index 0000000..011c7bf --- /dev/null +++ b/experiment_pure_text.py @@ -0,0 +1,131 @@ +import datetime +import json +import logging +import os +import sys + +from desktop_env.envs.desktop_env import DesktopEnv +from mm_agents.gpt_4_agent import GPT4_Agent +from mm_agents.gemini_pro_agent import GeminiPro_Agent + +# Logger Configs {{{ # +logger = logging.getLogger() +logger.setLevel(logging.DEBUG) + +datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + +file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8") +debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8") +stdout_handler = logging.StreamHandler(sys.stdout) +sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8") + +file_handler.setLevel(logging.INFO) +debug_handler.setLevel(logging.DEBUG) +stdout_handler.setLevel(logging.INFO) +sdebug_handler.setLevel(logging.DEBUG) + +formatter = logging.Formatter( + fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s") +file_handler.setFormatter(formatter) +debug_handler.setFormatter(formatter) +stdout_handler.setFormatter(formatter) +sdebug_handler.setFormatter(formatter) + +stdout_handler.addFilter(logging.Filter("desktopenv")) +sdebug_handler.addFilter(logging.Filter("desktopenv")) + +logger.addHandler(file_handler) +logger.addHandler(debug_handler) +logger.addHandler(stdout_handler) +logger.addHandler(sdebug_handler) +# }}} Logger Configs # + +logger = logging.getLogger("desktopenv.experiment") + +PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" + + +def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): + trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") + env = DesktopEnv( + path_to_vm=PATH_TO_VM, + action_space=agent.action_space, + task_config=example + ) + # reset the environment to certain snapshot + observation = env.reset() + done = False + step_num = 0 + + if recording: + # send a request to the server to start recording + env.controller.start_recording() + + while not done and step_num < max_steps: + actions = agent.predict(observation) + step_num += 1 + for action in actions: + # Capture the timestamp before executing the action + action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") + logger.info("Step %d: %s", step_num, action) + + observation, reward, done, info = env.step(action) + + logger.info("Reward: %.2f", reward) + logger.info("Done: %s", done) + logger.info("Info: %s", info) + + # Save screenshot and trajectory information + with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f: + with open(observation['screenshot'], "rb") as __f: + screenshot = __f.read() + _f.write(screenshot) + + with open(trajectory_recording_path, "a") as f: + f.write(json.dumps({ + "step_num": step_num, + "action_timestamp": action_timestamp, + "action": action, + "reward": reward, + "done": done, + "info": info, + "screenshot_file": f"step_{step_num}_{action_timestamp}.png" + })) + f.write("\n") + + if done: + logger.info("The episode is done.") + break + + if recording: + # send a request to the server to stop recording + env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4")) + + result = env.evaluate() + logger.info("Result: %.2f", result) + + # env.close() + logger.info("Environment closed.") + + +if __name__ == "__main__": + action_space = "pyautogui" + example_class = "chrome" + example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4" + + with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: + example = json.load(f) + example["snapshot"] = "exp_setup2" + + # api_key = os.environ.get("OPENAI_API_KEY") + # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + api_key = os.environ.get("GENAI_API_KEY") + agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + + root_trajectory_dir = "exp_trajectory" + + example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) + + run_one_example(example, agent, 10, example_trajectory_dir) diff --git a/mm_agents/gpt_4_prompt_action.py b/mm_agents/gpt_4_prompt_action.py new file mode 100644 index 0000000..3019074 --- /dev/null +++ b/mm_agents/gpt_4_prompt_action.py @@ -0,0 +1,244 @@ +SYS_PROMPT = """ +You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters: +ACTION_SPACE = [ + { + "action_type": "MOVE_TO", + "note": "move the cursor to the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "CLICK", + "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + }, + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + }, + "num_clicks": { + "type": int, + "range": [1, 2, 3], + "optional": True, + }, + } + }, + { + "action_type": "MOUSE_DOWN", + "note": "press the left button if the button not specified, otherwise press the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "MOUSE_UP", + "note": "release the left button if the button not specified, otherwise release the specified button", + "parameters": { + "button": { + "type": str, + "range": ["left", "right", "middle"], + "optional": True, + } + } + }, + { + "action_type": "RIGHT_CLICK", + "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DOUBLE_CLICK", + "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": True, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": True, + } + } + }, + { + "action_type": "DRAG_TO", + "note": "drag the cursor to the specified position with the left button pressed", + "parameters": { + "x": { + "type": float, + "range": [0, X_MAX], + "optional": False, + }, + "y": { + "type": float, + "range": [0, Y_MAX], + "optional": False, + } + } + }, + { + "action_type": "SCROLL", + "note": "scroll the mouse wheel up or down", + "parameters": { + "dx": { + "type": int, + "range": None, + "optional": False, + }, + "dy": { + "type": int, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "TYPING", + "note": "type the specified text", + "parameters": { + "text": { + "type": str, + "range": None, + "optional": False, + } + } + }, + { + "action_type": "PRESS", + "note": "press the specified key and release it", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_DOWN", + "note": "press the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "KEY_UP", + "note": "release the specified key", + "parameters": { + "key": { + "type": str, + "range": KEYBOARD_KEYS, + "optional": False, + } + } + }, + { + "action_type": "HOTKEY", + "note": "press the specified key combination", + "parameters": { + "keys": { + "type": list, + "range": [KEYBOARD_KEYS], + "optional": False, + } + } + }, + ############################################################################################################ + { + "action_type": "WAIT", + "note": "wait until the next action", + }, + { + "action_type": "FAIL", + "note": "decide the task can not be performed", + }, + { + "action_type": "DONE", + "note": "decide the task is done", + } +] +Firstly you need to predict the class of your action, then you need to predict the parameters of your action: +- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080) +for example, format as: +``` +{ + "action_type": "MOUSE_MOVE", + "x": 1319.11, + "y": 65.06 +} +``` +- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse: +for example, format as: +``` +{ + "action_type": "CLICK", + "click_type": "LEFT" +} +``` +- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard +for example, format as: +``` +{ + "action_type": "KEY", + "key": "ctrl+c" +} +``` +- For TYPE, you need to specify the text you want to type +for example, format as: +``` +{ + "action_type": "TYPE", + "text": "hello world" +} +``` + +REMEMBER: +For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +You MUST wrap the dict with backticks (\`). +You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. +You CAN predict multiple actions at one step, but you should only return one action for each step. +""" \ No newline at end of file diff --git a/mm_agents/gpt_4_prompt_code.py b/mm_agents/gpt_4_prompt_code.py new file mode 100644 index 0000000..b057da6 --- /dev/null +++ b/mm_agents/gpt_4_prompt_code.py @@ -0,0 +1,18 @@ +SYS_PROMPT = """ +You are an agent which follow my instruction and perform desktop computer tasks as instructed. +You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard. +For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree. + +You are required to use `pyautogui` to perform the action. +Return one line or multiple lines of python code to perform the action each time, be time efficient. +You ONLY need to return the code inside a code block, like this: +```python +# your code here +``` +Specially, it is also allowed to return the following special code: +When you think you have to wait for some time, return ```WAIT```; +When you think the task can not be done, return ```FAIL```; +When you think the task is done, return ```DONE```. + +First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE. +""" \ No newline at end of file