Minor updates
This commit is contained in:
@@ -24,3 +24,10 @@ todo
- [x] Add accessibility tree from the OS into the observation space
- [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient

## Road map of benchmark, tools and resources (Proposed)

- [ ] Improve the annotation tool based on DuckTrack; make it more robust and align it with the accessibility tree
- [ ] Annotate the steps taken to complete each task
- [ ] Build a website for the project
- [ ] Crawl all resources we explored from the internet, and make them easy to access
- [ ] Set up ways for the community to contribute new examples
131
experiment_pure_text.py
Normal file
131
experiment_pure_text.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
import datetime
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from desktop_env.envs.desktop_env import DesktopEnv
|
||||||
|
from mm_agents.gpt_4_agent import GPT4_Agent
|
||||||
|
from mm_agents.gemini_pro_agent import GeminiPro_Agent
|
||||||
|
|
||||||
|
# Logger Configs {{{ #
# Root logger captures everything; individual handlers filter by level/name.
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)

# Timestamp shared by all log files of this run.
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")

# The log directory must exist before FileHandler is instantiated,
# otherwise logging.FileHandler raises FileNotFoundError on a fresh checkout.
os.makedirs("logs", exist_ok=True)

file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
stdout_handler = logging.StreamHandler(sys.stdout)
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")

file_handler.setLevel(logging.INFO)
debug_handler.setLevel(logging.DEBUG)
stdout_handler.setLevel(logging.INFO)
sdebug_handler.setLevel(logging.DEBUG)

# ANSI colour codes make the console output easier to scan; they also end up
# in the log files, which is a known trade-off of sharing one formatter.
formatter = logging.Formatter(
    fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
file_handler.setFormatter(formatter)
debug_handler.setFormatter(formatter)
stdout_handler.setFormatter(formatter)
sdebug_handler.setFormatter(formatter)

# Console and the "sdebug" file only show records from the desktopenv.* loggers.
stdout_handler.addFilter(logging.Filter("desktopenv"))
sdebug_handler.addFilter(logging.Filter("desktopenv"))

logger.addHandler(file_handler)
logger.addHandler(debug_handler)
logger.addHandler(stdout_handler)
logger.addHandler(sdebug_handler)
# }}} Logger Configs #

# Module logger used by the experiment code below.
logger = logging.getLogger("desktopenv.experiment")

# Path to the VMware virtual machine image used as the desktop environment.
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
|
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    """Run a single benchmark example inside the desktop environment.

    Args:
        example: Task configuration dict loaded from the evaluation examples.
        agent: Agent exposing ``predict(observation)`` and an ``action_space``
            attribute compatible with ``DesktopEnv``.
        max_steps: Maximum number of prediction steps before giving up.
        example_trajectory_dir: Directory (assumed to exist) where the
            JSON-lines trajectory, per-step screenshots and the screen
            recording are written.
        recording: When True, ask the in-VM controller to record the screen.
    """
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
        path_to_vm=PATH_TO_VM,
        action_space=agent.action_space,
        task_config=example
    )
    # Reset the environment to the snapshot named in the task config.
    observation = env.reset()
    done = False
    step_num = 0

    if recording:
        # Send a request to the server to start recording.
        env.controller.start_recording()

    try:
        while not done and step_num < max_steps:
            actions = agent.predict(observation)
            step_num += 1
            for action in actions:
                # Capture the timestamp before executing the action so the
                # screenshot file name reflects when the action was issued.
                action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
                logger.info("Step %d: %s", step_num, action)

                observation, reward, done, info = env.step(action)

                logger.info("Reward: %.2f", reward)
                logger.info("Done: %s", done)
                logger.info("Info: %s", info)

                # Save the screenshot next to the trajectory record.
                with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
                    with open(observation['screenshot'], "rb") as __f:
                        _f.write(__f.read())

                # Append one JSON line per executed action.
                with open(trajectory_recording_path, "a") as f:
                    f.write(json.dumps({
                        "step_num": step_num,
                        "action_timestamp": action_timestamp,
                        "action": action,
                        "reward": reward,
                        "done": done,
                        "info": info,
                        "screenshot_file": f"step_{step_num}_{action_timestamp}.png"
                    }))
                    f.write("\n")

                if done:
                    logger.info("The episode is done.")
                    break
    finally:
        if recording:
            # Stop the recording even if the episode raised, so the VM-side
            # recorder is not left running.
            env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))

    result = env.evaluate()
    logger.info("Result: %.2f", result)

    # Actually close the environment before claiming it is closed.
    env.close()
    logger.info("Environment closed.")
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Experiment configuration: which agent action space and which example to run.
    action_space = "pyautogui"
    example_class = "chrome"
    example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"

    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
        example = json.load(f)
    # Override the snapshot the environment resets to for this experiment.
    example["snapshot"] = "exp_setup2"

    # api_key = os.environ.get("OPENAI_API_KEY")
    # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)

    api_key = os.environ.get("GENAI_API_KEY")
    if not api_key:
        # Fail fast with a clear message instead of an opaque auth error later.
        raise EnvironmentError("GENAI_API_KEY environment variable is not set.")
    agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)

    root_trajectory_dir = "exp_trajectory"
    example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
    os.makedirs(example_trajectory_dir, exist_ok=True)

    run_one_example(example, agent, 10, example_trajectory_dir)
||||||
244
mm_agents/gpt_4_prompt_action.py
Normal file
244
mm_agents/gpt_4_prompt_action.py
Normal file
@@ -0,0 +1,244 @@
|
|||||||
|
# System prompt for the pure-text (accessibility-tree) agent that emits
# structured dict actions. The example section below is kept consistent with
# the ACTION_SPACE declaration: action names and parameter names must match,
# otherwise the model is taught to emit invalid actions.
SYS_PROMPT = """
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.

HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
    {
        "action_type": "MOVE_TO",
        "note": "move the cursor to the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": False,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "CLICK",
        "note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            },
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            },
            "num_clicks": {
                "type": int,
                "range": [1, 2, 3],
                "optional": True,
            },
        }
    },
    {
        "action_type": "MOUSE_DOWN",
        "note": "press the left button if the button not specified, otherwise press the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "MOUSE_UP",
        "note": "release the left button if the button not specified, otherwise release the specified button",
        "parameters": {
            "button": {
                "type": str,
                "range": ["left", "right", "middle"],
                "optional": True,
            }
        }
    },
    {
        "action_type": "RIGHT_CLICK",
        "note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DOUBLE_CLICK",
        "note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": True,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": True,
            }
        }
    },
    {
        "action_type": "DRAG_TO",
        "note": "drag the cursor to the specified position with the left button pressed",
        "parameters": {
            "x": {
                "type": float,
                "range": [0, X_MAX],
                "optional": False,
            },
            "y": {
                "type": float,
                "range": [0, Y_MAX],
                "optional": False,
            }
        }
    },
    {
        "action_type": "SCROLL",
        "note": "scroll the mouse wheel up or down",
        "parameters": {
            "dx": {
                "type": int,
                "range": None,
                "optional": False,
            },
            "dy": {
                "type": int,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "TYPING",
        "note": "type the specified text",
        "parameters": {
            "text": {
                "type": str,
                "range": None,
                "optional": False,
            }
        }
    },
    {
        "action_type": "PRESS",
        "note": "press the specified key and release it",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_DOWN",
        "note": "press the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "KEY_UP",
        "note": "release the specified key",
        "parameters": {
            "key": {
                "type": str,
                "range": KEYBOARD_KEYS,
                "optional": False,
            }
        }
    },
    {
        "action_type": "HOTKEY",
        "note": "press the specified key combination",
        "parameters": {
            "keys": {
                "type": list,
                "range": [KEYBOARD_KEYS],
                "optional": False,
            }
        }
    },
    ############################################################################################################
    {
        "action_type": "WAIT",
        "note": "wait until the next action",
    },
    {
        "action_type": "FAIL",
        "note": "decide the task can not be performed",
    },
    {
        "action_type": "DONE",
        "note": "decide the task is done",
    }
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
- For MOVE_TO, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
for example, format as:
```
{
  "action_type": "MOVE_TO",
  "x": 1319.11,
  "y": 65.06
}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you can specify the "button" parameter, select from ["left", "middle", "right"], which means you use the left button, middle button or right button of your mouse:
for example, format as:
```
{
  "action_type": "CLICK",
  "button": "left"
}
```
- For [PRESS, KEY_DOWN, KEY_UP], you need to choose a key from the keyboard, and for HOTKEY you need to give a list of keys
for example, format as:
```
{
  "action_type": "HOTKEY",
  "keys": ["ctrl", "c"]
}
```
- For TYPING, you need to specify the text you want to type
for example, format as:
```
{
  "action_type": "TYPING",
  "text": "hello world"
}
```

REMEMBER:
For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
You MUST wrap the dict with backticks (`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
You CAN predict multiple actions at one step, but you should only return one action for each step.
"""
|
||||||
18
mm_agents/gpt_4_prompt_code.py
Normal file
18
mm_agents/gpt_4_prompt_code.py
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
# System prompt for the pure-text (accessibility-tree) agent that emits
# executable pyautogui code. Unlike the screenshot-based variant, the
# observation here is the accessibility tree XML only, so the reflection
# instruction must not mention screenshots.
SYS_PROMPT = """
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.

You are required to use `pyautogui` to perform the action.
Return one line or multiple lines of python code to perform the action each time, be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
Specially, it is also allowed to return the following special code:
When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```;
When you think the task is done, return ```DONE```.

First give the current accessibility tree observation and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
"""
|
||||||
Reference in New Issue
Block a user