Minor updates
This commit is contained in:
@@ -24,3 +24,10 @@ todo
- [x] Add accessibility tree from the OS into the observation space
- [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient

## Road map of benchmark, tools and resources (Proposed)

- [ ] Improve the annotation tool based on DuckTrack; make it more robust and align it with the accessibility tree
- [ ] Annotate the steps taken to complete each task
- [ ] Build a website for the project
- [ ] Crawl all resources we explored from the internet, and make them easy to access
- [ ] Set up ways for the community to contribute new examples
131
experiment_pure_text.py
Normal file
131
experiment_pure_text.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from desktop_env.envs.desktop_env import DesktopEnv
|
||||||
|
from mm_agents.gpt_4_agent import GPT4_Agent
|
||||||
|
from mm_agents.gemini_pro_agent import GeminiPro_Agent
|
||||||
|
|
||||||
|
# Logger Configs {{{ #
# Root logger captures everything; individual handlers filter by level/name.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Timestamp shared by all log files of this run.
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

# The log directory must exist before FileHandler is instantiated,
# otherwise logging.FileHandler raises FileNotFoundError on a fresh checkout.
os.makedirs("logs", exist_ok=True)

file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")

file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)

# ANSI colour codes make the console output easier to scan; they also end up
# in the log files, which is a known trade-off of sharing one formatter.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)

# Console and the "sdebug" file only show records from the desktopenv.* loggers.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))

logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Module logger used by the experiment code below.
logger = logging.getLogger("desktopenv.experiment")

# Path to the VMware virtual machine image used as the desktop environment.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
|
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    """Run a single benchmark example inside the desktop environment.

    Args:
        example: Task configuration dict loaded from the evaluation examples.
        agent: Agent exposing ``predict(observation)`` and an ``action_space``
            attribute compatible with ``DesktopEnv``.
        max_steps: Maximum number of prediction steps before giving up.
        example_trajectory_dir: Directory (assumed to exist) where the
            JSON-lines trajectory, per-step screenshots and the screen
            recording are written.
        recording: When True, ask the in-VM controller to record the screen.
    """
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # Reset the environment to the snapshot named in the task config.
    observation = env.reset()
    done = False
    step_num = 0

    if recording:
        # Send a request to the server to start recording.
        env.controller.start_recording()

    try:
        while not done and step_num < max_steps:
            actions = agent.predict(observation)
            step_num += 1
            for action in actions:
                # Capture the timestamp before executing the action so the
                # screenshot file name reflects when the action was issued.
                action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
                logger.info("Step %d: %s", step_num, action)

                observation, reward, done, info = env.step(action)

                logger.info("Reward: %.2f", reward)
                logger.info("Done: %s", done)
                logger.info("Info: %s", info)

                # Save the screenshot next to the trajectory record.
                with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
                    with open(observation['screenshot'], "rb") as __f:
                        _f.write(__f.read())

                # Append one JSON line per executed action.
                with open(trajectory_recording_path, "a") as f:
                    f.write(json.dumps({
                        "step_num": step_num,
                        "action_timestamp": action_timestamp,
                        "action": action,
                        "reward": reward,
                        "done": done,
                        "info": info,
                        "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
                    }))
                    f.write("\n")

                if done:
                    logger.info("The episode is done.")
                    break
    finally:
        if recording:
            # Stop the recording even if the episode raised, so the VM-side
            # recorder is not left running.
            env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # Actually close the environment before claiming it is closed.
    env.close()
    logger.info("Environment closed.")
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Experiment configuration: which agent action space and which example to run.
    action_space = "pyautogui"
    example_class = "chrome"
    example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"

    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
        example = json.load(f)
    # Override the snapshot the environment resets to for this experiment.
    example["snapshot"] = "exp_setup2"

    # api_key = os.environ.get("OPENAI_API_KEY")
    # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)

    api_key = os.environ.get("GENAI_API_KEY")
    if not api_key:
        # Fail fast with a clear message instead of an opaque auth error later.
        raise EnvironmentError("GENAI_API_KEY environment variable is not set.")
    agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)

    root_trajectory_dir = "exp_trajectory"
    example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, 10, example_trajectory_dir)
||||||
244
mm_agents/gpt_4_prompt_action.py
Normal file
244
mm_agents/gpt_4_prompt_action.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# System prompt for the pure-text (accessibility-tree) agent that emits
# structured dict actions. The example section below is kept consistent with
# the ACTION_SPACE declaration: action names and parameter names must match,
# otherwise the model is taught to emit invalid actions.
SYS_PROMPT = """
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.

HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
    {
        "action_type": "MOVE_TO",
        "note": "move the cursor to the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": False,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "CLICK",
        "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            },
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            },
            "num_clicks": {
                "type": int,
                "range": [1, 2, 3],
                "optional": True,
            },
        }
    },
    {
        "action_type": "MOUSE_DOWN",
        "note": "press the left button if the button not specified, otherwise press the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "MOUSE_UP",
        "note": "release the left button if the button not specified, otherwise release the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "RIGHT_CLICK",
        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DOUBLE_CLICK",
        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DRAG_TO",
        "note": "drag the cursor to the specified position with the left button pressed",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": False,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "SCROLL",
        "note": "scroll the mouse wheel up or down",
        "parameters": {
            "dx": {
                "type": int,
                "range": None,
                "optional": False,
            },
            "dy": {
                "type": int,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "TYPING",
        "note": "type the specified text",
        "parameters": {
            "text": {
                "type": str,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "PRESS",
        "note": "press the specified key and release it",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_DOWN",
        "note": "press the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_UP",
        "note": "release the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "HOTKEY",
        "note": "press the specified key combination",
        "parameters": {
            "keys": {
                "type": list,
                "range": [KEYBOARD_KEYS],
                "optional": False,
            }
        }
    },
    ############################################################################################################
    {
        "action_type": "WAIT",
        "note": "wait until the next action",
    },
    {
        "action_type": "FAIL",
        "note": "decide the task can not be performed",
    },
    {
        "action_type": "DONE",
        "note": "decide the task is done",
    }
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
- For MOVE_TO, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
for example, format as:
```
{
  "action_type": "MOVE_TO",
  "x": 1319.11,
  "y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you can specify the "button" parameter, select from ["left", "middle", "right"], which means you use the left button, middle button or right button of your mouse:
for example, format as:
```
{
  "action_type": "CLICK",
  "button": "left"
}
```
- For [PRESS, KEY_DOWN, KEY_UP], you need to choose a key from the keyboard, and for HOTKEY you need to give a list of keys
for example, format as:
```
{
  "action_type": "HOTKEY",
  "keys": ["ctrl", "c"]
}
```
- For TYPING, you need to specify the text you want to type
for example, format as:
```
{
  "action_type": "TYPING",
  "text": "hello world"
}
```

REMEMBER:
For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
You MUST wrap the dict with backticks (`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
You CAN predict multiple actions at one step, but you should only return one action for each step.
"""
|
||||||
18
mm_agents/gpt_4_prompt_code.py
Normal file
18
mm_agents/gpt_4_prompt_code.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# System prompt for the pure-text (accessibility-tree) agent that emits
# executable pyautogui code. Unlike the screenshot-based variant, the
# observation here is the accessibility tree XML only, so the reflection
# instruction must not mention screenshots.
SYS_PROMPT = """
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.

You are required to use `pyautogui` to perform the action.
Return one line or multiple lines of python code to perform the action each time, be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
Specially, it is also allowed to return the following special code:
When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```;
When you think the task is done, return ```DONE```.

First give the current accessibility tree observation and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
"""
|
||||||
Reference in New Issue
Block a user