Minor updates
This commit is contained in:
@@ -23,4 +23,11 @@ todo
|
||||
- [x] Error handling during file passing and file opening, etc.
|
||||
- [x] Add accessibility tree from the OS into the observation space
|
||||
- [ ] Add pre-process and post-process action support for benchmarking setup and evaluation
|
||||
- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
|
||||
- [ ] Multiprocess support, this can enable the reinforcement learning to be more efficient
|
||||
|
||||
## Road map of benchmark, tools and resources (Proposed)
|
||||
- [ ] Improve the annotation tool base on DuckTrack, make it more robust which align on accessibility tree
|
||||
- [ ] Annotate the steps of doing the task
|
||||
- [ ] Build a website for the project
|
||||
- [ ] Crawl all resources we explored from the internet, and make it easy to access
|
||||
- [ ] Set up ways for community to contribute new examples
|
||||
|
||||
131
experiment_pure_text.py
Normal file
131
experiment_pure_text.py
Normal file
@@ -0,0 +1,131 @@
|
||||
import datetime
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
|
||||
from desktop_env.envs.desktop_env import DesktopEnv
|
||||
from mm_agents.gpt_4_agent import GPT4_Agent
|
||||
from mm_agents.gemini_pro_agent import GeminiPro_Agent
|
||||
|
||||
# Logger Configs {{{ #
|
||||
logger = logging.getLogger()
|
||||
logger.setLevel(logging.DEBUG)
|
||||
|
||||
datetime_str: str = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
|
||||
file_handler = logging.FileHandler(os.path.join("logs", "normal-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
debug_handler = logging.FileHandler(os.path.join("logs", "debug-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
stdout_handler = logging.StreamHandler(sys.stdout)
|
||||
sdebug_handler = logging.FileHandler(os.path.join("logs", "sdebug-{:}.log".format(datetime_str)), encoding="utf-8")
|
||||
|
||||
file_handler.setLevel(logging.INFO)
|
||||
debug_handler.setLevel(logging.DEBUG)
|
||||
stdout_handler.setLevel(logging.INFO)
|
||||
sdebug_handler.setLevel(logging.DEBUG)
|
||||
|
||||
formatter = logging.Formatter(
|
||||
fmt="\x1b[1;33m[%(asctime)s \x1b[31m%(levelname)s \x1b[32m%(module)s/%(lineno)d-%(processName)s\x1b[1;33m] \x1b[0m%(message)s")
|
||||
file_handler.setFormatter(formatter)
|
||||
debug_handler.setFormatter(formatter)
|
||||
stdout_handler.setFormatter(formatter)
|
||||
sdebug_handler.setFormatter(formatter)
|
||||
|
||||
stdout_handler.addFilter(logging.Filter("desktopenv"))
|
||||
sdebug_handler.addFilter(logging.Filter("desktopenv"))
|
||||
|
||||
logger.addHandler(file_handler)
|
||||
logger.addHandler(debug_handler)
|
||||
logger.addHandler(stdout_handler)
|
||||
logger.addHandler(sdebug_handler)
|
||||
# }}} Logger Configs #
|
||||
|
||||
logger = logging.getLogger("desktopenv.experiment")
|
||||
|
||||
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
|
||||
|
||||
|
||||
def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
|
||||
trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
|
||||
env = DesktopEnv(
|
||||
path_to_vm=PATH_TO_VM,
|
||||
action_space=agent.action_space,
|
||||
task_config=example
|
||||
)
|
||||
# reset the environment to certain snapshot
|
||||
observation = env.reset()
|
||||
done = False
|
||||
step_num = 0
|
||||
|
||||
if recording:
|
||||
# send a request to the server to start recording
|
||||
env.controller.start_recording()
|
||||
|
||||
while not done and step_num < max_steps:
|
||||
actions = agent.predict(observation)
|
||||
step_num += 1
|
||||
for action in actions:
|
||||
# Capture the timestamp before executing the action
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_num, action)
|
||||
|
||||
observation, reward, done, info = env.step(action)
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
logger.info("Info: %s", info)
|
||||
|
||||
# Save screenshot and trajectory information
|
||||
with open(os.path.join(example_trajectory_dir, f"step_{step_num}_{action_timestamp}.png"), "wb") as _f:
|
||||
with open(observation['screenshot'], "rb") as __f:
|
||||
screenshot = __f.read()
|
||||
_f.write(screenshot)
|
||||
|
||||
with open(trajectory_recording_path, "a") as f:
|
||||
f.write(json.dumps({
|
||||
"step_num": step_num,
|
||||
"action_timestamp": action_timestamp,
|
||||
"action": action,
|
||||
"reward": reward,
|
||||
"done": done,
|
||||
"info": info,
|
||||
"screenshot_file": f"step_{step_num}_{action_timestamp}.png"
|
||||
}))
|
||||
f.write("\n")
|
||||
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
|
||||
if recording:
|
||||
# send a request to the server to stop recording
|
||||
env.controller.end_recording(os.path.join(example_trajectory_dir, "recording.mp4"))
|
||||
|
||||
result = env.evaluate()
|
||||
logger.info("Result: %.2f", result)
|
||||
|
||||
# env.close()
|
||||
logger.info("Environment closed.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
action_space = "pyautogui"
|
||||
example_class = "chrome"
|
||||
example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
|
||||
|
||||
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
|
||||
example = json.load(f)
|
||||
example["snapshot"] = "exp_setup2"
|
||||
|
||||
# api_key = os.environ.get("OPENAI_API_KEY")
|
||||
# agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
|
||||
|
||||
api_key = os.environ.get("GENAI_API_KEY")
|
||||
agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
|
||||
|
||||
root_trajectory_dir = "exp_trajectory"
|
||||
|
||||
example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
|
||||
os.makedirs(example_trajectory_dir, exist_ok=True)
|
||||
|
||||
run_one_example(example, agent, 10, example_trajectory_dir)
|
||||
244
mm_agents/gpt_4_prompt_action.py
Normal file
244
mm_agents/gpt_4_prompt_action.py
Normal file
@@ -0,0 +1,244 @@
|
||||
SYS_PROMPT = """
|
||||
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection.
|
||||
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.
|
||||
|
||||
HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
|
||||
ACTION_SPACE = [
|
||||
{
|
||||
"action_type": "MOVE_TO",
|
||||
"note": "move the cursor to the specified position",
|
||||
"parameters": {
|
||||
"x": {
|
||||
"type": float,
|
||||
"range": [0, X_MAX],
|
||||
"optional": False,
|
||||
},
|
||||
"y": {
|
||||
"type": float,
|
||||
"range": [0, Y_MAX],
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "CLICK",
|
||||
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
|
||||
"parameters": {
|
||||
"button": {
|
||||
"type": str,
|
||||
"range": ["left", "right", "middle"],
|
||||
"optional": True,
|
||||
},
|
||||
"x": {
|
||||
"type": float,
|
||||
"range": [0, X_MAX],
|
||||
"optional": True,
|
||||
},
|
||||
"y": {
|
||||
"type": float,
|
||||
"range": [0, Y_MAX],
|
||||
"optional": True,
|
||||
},
|
||||
"num_clicks": {
|
||||
"type": int,
|
||||
"range": [1, 2, 3],
|
||||
"optional": True,
|
||||
},
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "MOUSE_DOWN",
|
||||
"note": "press the left button if the button not specified, otherwise press the specified button",
|
||||
"parameters": {
|
||||
"button": {
|
||||
"type": str,
|
||||
"range": ["left", "right", "middle"],
|
||||
"optional": True,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "MOUSE_UP",
|
||||
"note": "release the left button if the button not specified, otherwise release the specified button",
|
||||
"parameters": {
|
||||
"button": {
|
||||
"type": str,
|
||||
"range": ["left", "right", "middle"],
|
||||
"optional": True,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "RIGHT_CLICK",
|
||||
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
|
||||
"parameters": {
|
||||
"x": {
|
||||
"type": float,
|
||||
"range": [0, X_MAX],
|
||||
"optional": True,
|
||||
},
|
||||
"y": {
|
||||
"type": float,
|
||||
"range": [0, Y_MAX],
|
||||
"optional": True,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "DOUBLE_CLICK",
|
||||
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
|
||||
"parameters": {
|
||||
"x": {
|
||||
"type": float,
|
||||
"range": [0, X_MAX],
|
||||
"optional": True,
|
||||
},
|
||||
"y": {
|
||||
"type": float,
|
||||
"range": [0, Y_MAX],
|
||||
"optional": True,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "DRAG_TO",
|
||||
"note": "drag the cursor to the specified position with the left button pressed",
|
||||
"parameters": {
|
||||
"x": {
|
||||
"type": float,
|
||||
"range": [0, X_MAX],
|
||||
"optional": False,
|
||||
},
|
||||
"y": {
|
||||
"type": float,
|
||||
"range": [0, Y_MAX],
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "SCROLL",
|
||||
"note": "scroll the mouse wheel up or down",
|
||||
"parameters": {
|
||||
"dx": {
|
||||
"type": int,
|
||||
"range": None,
|
||||
"optional": False,
|
||||
},
|
||||
"dy": {
|
||||
"type": int,
|
||||
"range": None,
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "TYPING",
|
||||
"note": "type the specified text",
|
||||
"parameters": {
|
||||
"text": {
|
||||
"type": str,
|
||||
"range": None,
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "PRESS",
|
||||
"note": "press the specified key and release it",
|
||||
"parameters": {
|
||||
"key": {
|
||||
"type": str,
|
||||
"range": KEYBOARD_KEYS,
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "KEY_DOWN",
|
||||
"note": "press the specified key",
|
||||
"parameters": {
|
||||
"key": {
|
||||
"type": str,
|
||||
"range": KEYBOARD_KEYS,
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "KEY_UP",
|
||||
"note": "release the specified key",
|
||||
"parameters": {
|
||||
"key": {
|
||||
"type": str,
|
||||
"range": KEYBOARD_KEYS,
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"action_type": "HOTKEY",
|
||||
"note": "press the specified key combination",
|
||||
"parameters": {
|
||||
"keys": {
|
||||
"type": list,
|
||||
"range": [KEYBOARD_KEYS],
|
||||
"optional": False,
|
||||
}
|
||||
}
|
||||
},
|
||||
############################################################################################################
|
||||
{
|
||||
"action_type": "WAIT",
|
||||
"note": "wait until the next action",
|
||||
},
|
||||
{
|
||||
"action_type": "FAIL",
|
||||
"note": "decide the task can not be performed",
|
||||
},
|
||||
{
|
||||
"action_type": "DONE",
|
||||
"note": "decide the task is done",
|
||||
}
|
||||
]
|
||||
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
|
||||
- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
"action_type": "MOUSE_MOVE",
|
||||
"x": 1319.11,
|
||||
"y": 65.06
|
||||
}
|
||||
```
|
||||
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
"action_type": "CLICK",
|
||||
"click_type": "LEFT"
|
||||
}
|
||||
```
|
||||
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
"action_type": "KEY",
|
||||
"key": "ctrl+c"
|
||||
}
|
||||
```
|
||||
- For TYPE, you need to specify the text you want to type
|
||||
for example, format as:
|
||||
```
|
||||
{
|
||||
"action_type": "TYPE",
|
||||
"text": "hello world"
|
||||
}
|
||||
```
|
||||
|
||||
REMEMBER:
|
||||
For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
You MUST wrap the dict with backticks (\`).
|
||||
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
|
||||
You CAN predict multiple actions at one step, but you should only return one action for each step.
|
||||
"""
|
||||
18
mm_agents/gpt_4_prompt_code.py
Normal file
18
mm_agents/gpt_4_prompt_code.py
Normal file
@@ -0,0 +1,18 @@
|
||||
SYS_PROMPT = """
|
||||
You are an agent which follow my instruction and perform desktop computer tasks as instructed.
|
||||
You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
|
||||
For each step, you will get an observation of the desktop by the XML format of accessibility tree, which is based on AT-SPI library. And you will predict the action of the computer based on the accessibility tree.
|
||||
|
||||
You are required to use `pyautogui` to perform the action.
|
||||
Return one line or multiple lines of python code to perform the action each time, be time efficient.
|
||||
You ONLY need to return the code inside a code block, like this:
|
||||
```python
|
||||
# your code here
|
||||
```
|
||||
Specially, it is also allowed to return the following special code:
|
||||
When you think you have to wait for some time, return ```WAIT```;
|
||||
When you think the task can not be done, return ```FAIL```;
|
||||
When you think the task is done, return ```DONE```.
|
||||
|
||||
First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
|
||||
"""
|
||||
Reference in New Issue
Block a user