diff --git a/mm_agents/prompts.py b/mm_agents/prompts.py
index f5c4b93..3969043 100644
--- a/mm_agents/prompts.py
+++ b/mm_agents/prompts.py
@@ -15,7 +15,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -36,7 +36,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
Our past communication is great, and what you have done is very helpful. I will now give you another task to complete.
First take a deep breath, think step by step, give the current screenshot a thinking, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -47,236 +47,236 @@ For each step, you will get an observation of an image, which is the screenshot
HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
- {
+ {{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- },
- "x": {
+ }},
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- },
- "num_clicks": {
+ }},
+ "num_clicks": {{
"type": int,
"range": [1, 2, 3],
"optional": True,
- },
- }
- },
- {
+ }},
+ }}
+ }},
+ {{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
- "parameters": {
- "dx": {
+ "parameters": {{
+ "dx": {{
"type": int,
"range": None,
"optional": False,
- },
- "dy": {
+ }},
+ "dy": {{
"type": int,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "TYPING",
"note": "type the specified text",
- "parameters": {
- "text": {
+ "parameters": {{
+ "text": {{
"type": str,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "PRESS",
"note": "press the specified key and release it",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_DOWN",
"note": "press the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_UP",
"note": "release the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "HOTKEY",
"note": "press the specified key combination",
- "parameters": {
- "keys": {
+ "parameters": {{
+ "keys": {{
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
- }
- }
- },
+ }}
+ }}
+ }},
############################################################################################################
- {
+ {{
"action_type": "WAIT",
"note": "wait until the next action",
- },
- {
+ }},
+ {{
"action_type": "FAIL",
"note": "decide the task can not be performed",
- },
- {
+ }},
+ {{
"action_type": "DONE",
"note": "decide the task is done",
- }
+ }}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is ({SCREEN_WIDTH}, {SCREEN_HEIGHT})
for example, format as:
```
-{
+{{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
-}
+}}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
-{
+{{
"action_type": "CLICK",
"click_type": "LEFT"
-}
+}}
```
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
for example, format as:
```
-{
+{{
"action_type": "KEY",
"key": "ctrl+c"
-}
+}}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
-{
+{{
"action_type": "TYPE",
"text": "hello world"
-}
+}}
```
REMEMBER:
@@ -292,236 +292,236 @@ For each step, you will get an observation of an image, which is the screenshot
HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
- {
+ {{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- },
- "x": {
+ }},
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- },
- "num_clicks": {
+ }},
+ "num_clicks": {{
"type": int,
"range": [1, 2, 3],
"optional": True,
- },
- }
- },
- {
+ }},
+ }}
+ }},
+ {{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
- "parameters": {
- "dx": {
+ "parameters": {{
+ "dx": {{
"type": int,
"range": None,
"optional": False,
- },
- "dy": {
+ }},
+ "dy": {{
"type": int,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "TYPING",
"note": "type the specified text",
- "parameters": {
- "text": {
+ "parameters": {{
+ "text": {{
"type": str,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "PRESS",
"note": "press the specified key and release it",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_DOWN",
"note": "press the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_UP",
"note": "release the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "HOTKEY",
"note": "press the specified key combination",
- "parameters": {
- "keys": {
+ "parameters": {{
+ "keys": {{
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
- }
- }
- },
+ }}
+ }}
+ }},
############################################################################################################
- {
+ {{
"action_type": "WAIT",
"note": "wait until the next action",
- },
- {
+ }},
+ {{
"action_type": "FAIL",
"note": "decide the task can not be performed",
- },
- {
+ }},
+ {{
"action_type": "DONE",
"note": "decide the task is done",
- }
+ }}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is ({SCREEN_WIDTH}, {SCREEN_HEIGHT})
for example, format as:
```
-{
+{{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
-}
+}}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
-{
+{{
"action_type": "CLICK",
"click_type": "LEFT"
-}
+}}
```
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
for example, format as:
```
-{
+{{
"action_type": "KEY",
"key": "ctrl+c"
-}
+}}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
-{
+{{
"action_type": "TYPE",
"text": "hello world"
-}
+}}
```
REMEMBER:
@@ -550,7 +550,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -560,236 +560,236 @@ For each step, you will get an observation of the desktop by accessibility tree,
HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
- {
+ {{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- },
- "x": {
+ }},
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- },
- "num_clicks": {
+ }},
+ "num_clicks": {{
"type": int,
"range": [1, 2, 3],
"optional": True,
- },
- }
- },
- {
+ }},
+ }}
+ }},
+ {{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
- "parameters": {
- "dx": {
+ "parameters": {{
+ "dx": {{
"type": int,
"range": None,
"optional": False,
- },
- "dy": {
+ }},
+ "dy": {{
"type": int,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "TYPING",
"note": "type the specified text",
- "parameters": {
- "text": {
+ "parameters": {{
+ "text": {{
"type": str,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "PRESS",
"note": "press the specified key and release it",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_DOWN",
"note": "press the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_UP",
"note": "release the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "HOTKEY",
"note": "press the specified key combination",
- "parameters": {
- "keys": {
+ "parameters": {{
+ "keys": {{
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
- }
- }
- },
+ }}
+ }}
+ }},
############################################################################################################
- {
+ {{
"action_type": "WAIT",
"note": "wait until the next action",
- },
- {
+ }},
+ {{
"action_type": "FAIL",
"note": "decide the task can not be performed",
- },
- {
+ }},
+ {{
"action_type": "DONE",
"note": "decide the task is done",
- }
+ }}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is ({SCREEN_WIDTH}, {SCREEN_HEIGHT})
for example, format as:
```
-{
+{{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
-}
+}}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
-{
+{{
"action_type": "CLICK",
"click_type": "LEFT"
-}
+}}
```
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
for example, format as:
```
-{
+{{
"action_type": "KEY",
"key": "ctrl+c"
-}
+}}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
-{
+{{
"action_type": "TYPE",
"text": "hello world"
-}
+}}
```
REMEMBER:
@@ -817,7 +817,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -828,236 +828,236 @@ And you will predict the action of the computer based on the screenshot and acce
HERE is the description of the action space you need to predict, follow the format and choose the correct action type and parameters:
ACTION_SPACE = [
- {
+ {{
"action_type": "MOVE_TO",
"note": "move the cursor to the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "CLICK",
"note": "click the left button if the button not specified, otherwise click the specified button; click at the current position if x and y are not specified, otherwise click at the specified position",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- },
- "x": {
+ }},
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- },
- "num_clicks": {
+ }},
+ "num_clicks": {{
"type": int,
"range": [1, 2, 3],
"optional": True,
- },
- }
- },
- {
+ }},
+ }}
+ }},
+ {{
"action_type": "MOUSE_DOWN",
"note": "press the left button if the button not specified, otherwise press the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "MOUSE_UP",
"note": "release the left button if the button not specified, otherwise release the specified button",
- "parameters": {
- "button": {
+ "parameters": {{
+ "button": {{
"type": str,
"range": ["left", "right", "middle"],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "RIGHT_CLICK",
"note": "right click at the current position if x and y are not specified, otherwise right click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DOUBLE_CLICK",
"note": "double click at the current position if x and y are not specified, otherwise double click at the specified position",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": True,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": True,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "DRAG_TO",
"note": "drag the cursor to the specified position with the left button pressed",
- "parameters": {
- "x": {
+ "parameters": {{
+ "x": {{
"type": float,
"range": [0, X_MAX],
"optional": False,
- },
- "y": {
+ }},
+ "y": {{
"type": float,
"range": [0, Y_MAX],
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "SCROLL",
"note": "scroll the mouse wheel up or down",
- "parameters": {
- "dx": {
+ "parameters": {{
+ "dx": {{
"type": int,
"range": None,
"optional": False,
- },
- "dy": {
+ }},
+ "dy": {{
"type": int,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "TYPING",
"note": "type the specified text",
- "parameters": {
- "text": {
+ "parameters": {{
+ "text": {{
"type": str,
"range": None,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "PRESS",
"note": "press the specified key and release it",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_DOWN",
"note": "press the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "KEY_UP",
"note": "release the specified key",
- "parameters": {
- "key": {
+ "parameters": {{
+ "key": {{
"type": str,
"range": KEYBOARD_KEYS,
"optional": False,
- }
- }
- },
- {
+ }}
+ }}
+ }},
+ {{
"action_type": "HOTKEY",
"note": "press the specified key combination",
- "parameters": {
- "keys": {
+ "parameters": {{
+ "keys": {{
"type": list,
"range": [KEYBOARD_KEYS],
"optional": False,
- }
- }
- },
+ }}
+ }}
+ }},
############################################################################################################
- {
+ {{
"action_type": "WAIT",
"note": "wait until the next action",
- },
- {
+ }},
+ {{
"action_type": "FAIL",
"note": "decide the task can not be performed",
- },
- {
+ }},
+ {{
"action_type": "DONE",
"note": "decide the task is done",
- }
+ }}
]
Firstly you need to predict the class of your action, then you need to predict the parameters of your action:
-- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is (1920, 1080)
+- For MOUSE_MOVE, you need to predict the x and y coordinate of the mouse cursor, the left top corner of the screen is (0, 0), the right bottom corner of the screen is ({SCREEN_WIDTH}, {SCREEN_HEIGHT})
for example, format as:
```
-{
+{{
"action_type": "MOUSE_MOVE",
"x": 1319.11,
"y": 65.06
-}
+}}
```
- For [CLICK, MOUSE_DOWN, MOUSE_UP], you need to specify the click_type as well, select from [LEFT, MIDDLE, RIGHT, WHEEL_UP, WHEEL_DOWN], which means you click the left button, middle button, right button, wheel up or wheel down of your mouse:
for example, format as:
```
-{
+{{
"action_type": "CLICK",
"click_type": "LEFT"
-}
+}}
```
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
for example, format as:
```
-{
+{{
"action_type": "KEY",
"key": "ctrl+c"
-}
+}}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
-{
+{{
"action_type": "TYPE",
"text": "hello world"
-}
+}}
```
REMEMBER:
@@ -1092,7 +1092,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -1104,7 +1104,7 @@ For each step, you will get an observation of an image, which is the screenshot
ACTION_DESCRIPTION_PROMPT_SEEACT = """
The text and image shown below is the observation of the desktop by 1) a screenshot; and 2) accessibility tree, which is based on AT-SPI library.
-{}
+{{}}
Follow the following guidance to think step by step before outlining the next action step at the current stage:
@@ -1142,7 +1142,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
"""
@@ -1168,7 +1168,7 @@ Here are some guidelines for you:
2. If a click action is needed, use only the following functions: pyautogui.click, pyautogui.rightClick or pyautogui.doubleClick.
3. Return ```Done``` when you think the task is done. Return ```Fail``` when you think the task can not be done.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
""".strip()
@@ -1177,10 +1177,10 @@ AGUVIS_SYS_PROMPT = """You are a GUI agent. You are given a task and a screensho
AGUVIS_PLANNING_PROMPT = """Please generate the next move according to the UI screenshot, instruction and previous actions.
-Instruction: {instruction}.
+Instruction: {{instruction}}.
Previous actions:
-{previous_actions}
+{{previous_actions}}
"""
AGUVIS_INNER_MONOLOGUE_APPEND_PROMPT = """<|recipient|>all
@@ -1188,11 +1188,11 @@ Action: """
AGUVIS_GROUNDING_PROMPT = """Please generate the next move according to the UI screenshot, instruction and previous actions.
-Instruction: {instruction}
+Instruction: {{instruction}}
"""
AGUVIS_GROUNDING_APPEND_PROMPT = """<|recipient|>os
-pyautogui.{function_name}"""
+pyautogui.{{function_name}}"""
UITARS_ACTION_SPACE = """
click(start_box='<|box_start|>(x1,y1)<|box_end|>')
@@ -1248,7 +1248,7 @@ wait() #Sleep for 5s and take a screenshot to check for any changes.
finished()
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
## User Instruction
-{instruction}
+{{instruction}}
"""
UITARS_USR_PROMPT_THOUGHT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
@@ -1260,14 +1260,14 @@ Action: ...
```
## Action Space
-{action_space}
+{{action_space}}
## Note
-- Use {language} in `Thought` part.
+- Use {{language}} in `Thought` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in `Thought` part.
## User Instruction
-{instruction}
+{{instruction}}
"""
JEDI_GROUNDER_SYS_PROMPT = """You are a helpful assistant.
@@ -1278,12 +1278,12 @@ You may call one or more functions to assist with the user query.
You are provided with function signatures within XML tags:
-{{"type": "function", "function": {{"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {width}x{height}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {{"properties": {{"action": {{"description": "The action to perform. The available actions are:\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* `type`: Type a string of text on the keyboard.\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* `left_click`: Click the left mouse button.\n* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* `right_click`: Click the right mouse button.\n* `middle_click`: Click the middle mouse button.\n* `double_click`: Double-click the left mouse button.\n* `scroll`: Performs a scroll of the mouse scroll wheel.\n* `wait`: Wait specified seconds for the change to happen.\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}}, "keys": {{"description": "Required only by `action=key`.", "type": "array"}}, "text": {{"description": "Required only by `action=type`.", "type": "string"}}, "coordinate": {{"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move`, `action=left_click_drag`, `action=left_click`, `action=right_click`, `action=double_click`.", "type": "array"}}, "pixels": {{"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}}, "time": {{"description": "The seconds to wait. Required only by `action=wait`.", "type": "number"}}, "status": {{"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}}}, "required": ["action"], "type": "object"}}}}}}
+{{{{"type": "function", "function": {{{{"name": "computer_use", "description": "Use a mouse and keyboard to interact with a computer, and take screenshots.\n* This is an interface to a desktop GUI. You do not have access to a terminal or applications menu. You must click on desktop icons to start applications.\n* Some applications may take time to start or process actions, so you may need to wait and take successive screenshots to see the results of your actions. E.g. if you click on Firefox and a window doesn't open, try wait and taking another screenshot.\n* The screen's resolution is {{width}}x{{height}}.\n* Whenever you intend to move the cursor to click on an element like an icon, you should consult a screenshot to determine the coordinates of the element before moving the cursor.\n* If you tried clicking on a program or link but it failed to load, even after waiting, try adjusting your cursor position so that the tip of the cursor visually falls on the element that you want to click.\n* Make sure to click any buttons, links, icons, etc with the cursor tip in the center of the element. Don't click boxes on their edges unless asked.", "parameters": {{{{"properties": {{{{"action": {{{{"description": "The action to perform. The available actions are:\n* `key`: Performs key down presses on the arguments passed in order, then performs key releases in reverse order.\n* `type`: Type a string of text on the keyboard.\n* `mouse_move`: Move the cursor to a specified (x, y) pixel coordinate on the screen.\n* `left_click`: Click the left mouse button.\n* `left_click_drag`: Click and drag the cursor to a specified (x, y) pixel coordinate on the screen.\n* `right_click`: Click the right mouse button.\n* `middle_click`: Click the middle mouse button.\n* `double_click`: Double-click the left mouse button.\n* `scroll`: Performs a scroll of the mouse scroll wheel.\n* `wait`: Wait specified seconds for the change to happen.\n* `terminate`: Terminate the current task and report its completion status.", "enum": ["key", "type", "mouse_move", "left_click", "left_click_drag", "right_click", "middle_click", "double_click", "scroll", "wait", "terminate"], "type": "string"}}}}, "keys": {{{{"description": "Required only by `action=key`.", "type": "array"}}}}, "text": {{{{"description": "Required only by `action=type`.", "type": "string"}}}}, "coordinate": {{{{"description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates to move the mouse to. Required only by `action=mouse_move`, `action=left_click_drag`, `action=left_click`, `action=right_click`, `action=double_click`.", "type": "array"}}}}, "pixels": {{{{"description": "The amount of scrolling to perform. Positive values scroll up, negative values scroll down. Required only by `action=scroll`.", "type": "number"}}}}, "time": {{{{"description": "The seconds to wait. Required only by `action=wait`.", "type": "number"}}}}, "status": {{{{"description": "The status of the task. Required only by `action=terminate`.", "type": "string", "enum": ["success", "failure"]}}}}}}}}, "required": ["action"], "type": "object"}}}}}}}}}}}}
For each function call, return a json object with function name and arguments within XML tags:
-{{"name": , "arguments": }}
+{{{{"name": , "arguments": }}}}
"""
JEDI_PLANNER_SYS_PROMPT = """
@@ -1327,7 +1327,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-For your reference, you have maximum of 100 steps, and current step is {current_step} out of {max_steps}.
+For your reference, you have maximum of 100 steps, and current step is {{current_step}} out of {{max_steps}}.
If you are in the last step, you should return ```DONE``` or ```FAIL``` according to the result.
Here are some guidelines for you:
@@ -1335,7 +1335,7 @@ Here are some guidelines for you:
2. If a click action is needed, use only the following functions: pyautogui.click, pyautogui.rightClick or pyautogui.doubleClick.
3. Return ```Done``` when you think the task is done. Return ```Fail``` when you think the task can not be done.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR NEVER EVER RETURN ME ANYTHING ELSE.
"""
@@ -1405,7 +1405,7 @@ class Agent:
'''
def set_cell_values(self, cell_values: Dict[str, Any], app_name: str, sheet_name: str):
- '''Use this to set individual cell values in a spreadsheet. For example, setting A2 to "hello" would be done by passing {"A2": "hello"} as cell_values. The sheet must be opened before this command can be used.
+ '''Use this to set individual cell values in a spreadsheet. For example, setting A2 to "hello" would be done by passing {{"A2": "hello"}} as cell_values. The sheet must be opened before this command can be used.
Args:
cell_values: Dict[str, Any], A dictionary of cell values to set in the spreadsheet. The keys are the cell coordinates in the format "A1", "B2", etc.
Supported value types include: float, int, string, bool, formulas.
@@ -1464,7 +1464,7 @@ Remember you should only return ONE line of code, DO NOT RETURN more. You should
agent.click('Click \"Yes, I trust the authors\" button', 1, "left")
```
-For your reference, you have maximum of 100 steps, and current step is {current_step} out of {max_steps}.
+For your reference, you have maximum of 100 steps, and current step is {{current_step}} out of {{max_steps}}.
If you are in the last step, you should return ```agent.done()``` or ```agent.fail()``` according to the result.
Here are some guidelines for you:
@@ -1475,11 +1475,11 @@ Here are some guidelines for you:
5. Save modified files before returning ```agent.done()```. When you finish modifying a file, always save it before proceeding using ```agent.hotkey(['ctrl', 's'])``` or equivalent. Tasks may involve multiple files. Save each after finishing modification.
6. If you meet "Authentication required" prompt, you can continue to click "Cancel" to close it.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE I ASKED FOR NEVER EVER RETURN ME ANYTHING ELSE."""
GTA1_GROUNDING_SYSTEM_PROMPT = '''
-You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {height} and width {width}. For elements with area, return the center point.
+You are an expert UI element locator. Given a GUI image and a user's element description, provide the coordinates of the specified element as a single (x,y) point. The image resolution is height {{height}} and width {{width}}. For elements with area, return the center point.
Output the coordinate pair exactly:
(x,y)
@@ -1488,13 +1488,13 @@ Output the coordinate pair exactly:
GTA1_JUDGE_SYSTEM_PROMPT='''
You are an expert at evaluating the planning and reasoning of UI agents working toward achieving a goal.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights or login.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights or login.
Each time, I will provide you with:
-- The current screenshot of the UI of width {width} and height {height}
+- The current screenshot of the UI of width {{width}} and height {{height}}
- The goal of the task
- Past histories of planning and actions that have been taken
-- A list of {N_PLANNING} different planning approaches toward achieving the goal in the current state in this form:
+- A list of {{N_PLANNING}} different planning approaches toward achieving the goal in the current state in this form:
Observation:
Thought:
Action:
@@ -1511,10 +1511,10 @@ Note that some planning approaches may be similar - do not let the number of sim
Respond **only** with valid JSON (no extra keys or comments):
```json
-{{
+{{{{
"explaining": "Your explanation of why this planning is best using the evaluation criteria",
- "index": The index of the best planning (0, 1, ..., {N_INDEX})
-}}
+ "index": The index of the best planning (0, 1, ..., {{N_INDEX}})
+}}}}
```
'''.strip()
@@ -1559,7 +1559,7 @@ When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```, don't easily say ```FAIL```, try your best to do the task;
When you think the task is done, return ```DONE```.
-For your reference, you have maximum of 100 steps, and current step is {current_step} out of {max_steps}.
+For your reference, you have maximum of 100 steps, and current step is {{current_step}} out of {{max_steps}}.
If you are in the last step, you should return ```DONE``` or ```FAIL``` according to the result.
Here are some guidelines for you:
@@ -1567,6 +1567,6 @@ Here are some guidelines for you:
2. If a click action is needed, use only the following functions: pyautogui.click, pyautogui.rightClick or pyautogui.doubleClick.
3. Return ```Done``` when you think the task is done. Return ```Fail``` when you think the task can not be done.
-My computer's password is '{CLIENT_PASSWORD}', feel free to use it when you need sudo rights.
+My computer's password is '{{CLIENT_PASSWORD}}', feel free to use it when you need sudo rights.
First give the current screenshot and previous things we did a short reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR NEVER EVER RETURN ME ANYTHING ELSE.
"""
\ No newline at end of file