Fix the width and height of the VM so that the agent performs more accurately

This commit is contained in:
Timothyxxx
2023-11-30 12:10:41 +08:00
parent ecb62d7eb4
commit e52ba2ab13
4 changed files with 134 additions and 62 deletions

View File

@@ -6,20 +6,24 @@ import json
import requests
from mm_agents.gpt_4v_prompt import SYS_PROMPT
# Function to encode the image
def encode_image(image_path) -> str:
    """Return the contents of the file at *image_path* as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded and the
    resulting bytes are decoded as UTF-8 text.

    Args:
        image_path: Path of the file to read; passed straight to ``open``.

    Returns:
        The base64 representation of the file's bytes as ``str``. An empty
        file yields an empty string.

    Raises:
        OSError: Propagated from ``open``/``read`` if the file cannot be
            opened or read (for example, when it does not exist).
    """
    # Read inside the context manager so the handle is closed before encoding.
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    return base64.b64encode(raw_bytes).decode('utf-8')
def parse_action_from_string(input_string):
def parse_actions_from_string(input_string):
# Search for a JSON string within the input string
actions = []
matches = re.findall(r'```json\s+(.*?)\s+```', input_string, re.DOTALL)
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
@@ -27,17 +31,20 @@ def parse_action_from_string(input_string):
if matches:
# Assuming there's only one match, parse the JSON string into a dictionary
try:
action_dict = json.loads(matches[0])
return action_dict
for match in matches:
action_dict = json.loads(match)
actions.append(action_dict)
return actions
except json.JSONDecodeError as e:
return f"Failed to parse JSON: {e}"
else:
try:
action_dict = json.loads(input_string)
return action_dict
return [action_dict]
except json.JSONDecodeError as e:
raise ValueError("Invalid response format: " + input_string)
class GPT4v_Agent:
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
self.instruction = instruction
@@ -78,6 +85,10 @@ class GPT4v_Agent:
}
]
})
traj_to_show = []
for i in range(len(self.trajectory)):
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
print("Trajectory:", traj_to_show)
payload = {
"model": self.model,
"messages": self.trajectory,
@@ -85,11 +96,15 @@ class GPT4v_Agent:
}
response = requests.post("https://api.openai.com/v1/chat/completions", headers=self.headers, json=payload)
action = self.parse_action(response.json()['choices'][0]['message']['content'])
try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
except:
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None
return action
return actions
def parse_action(self, response: str):
def parse_actions(self, response: str):
# response example
"""
```json
@@ -101,7 +116,7 @@ class GPT4v_Agent:
"""
# parse from the response
action = parse_action_from_string(response)
actions = parse_actions_from_string(response)
# add action into the trajectory
self.trajectory.append({
@@ -115,25 +130,28 @@ class GPT4v_Agent:
})
# parse action
parsed_action = {}
action_type = Action[action['action_type']].value
parsed_action["action_type"] = action_type
parsed_actions = []
for action in actions:
parsed_action = {}
action_type = Action[action['action_type']].value
parsed_action["action_type"] = action_type
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
parsed_action["click_type"] = MouseClick[action['click_type']].value
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
parsed_action["click_type"] = MouseClick[action['click_type']].value
if action_type == Action.MOUSE_MOVE.value:
parsed_action["x"] = action["x"]
parsed_action["y"] = action["y"]
if action_type == Action.MOUSE_MOVE.value:
parsed_action["x"] = action["x"]
parsed_action["y"] = action["y"]
# fixme: could these two actions be merged??
if action_type == Action.KEY.value:
parsed_action["key"] = [ord(c) for c in action["key"]]
if action_type == Action.KEY.value:
parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys
if action_type == Action.TYPE.value:
parsed_action["text"] = [ord(c) for c in action["text"]]
if action_type == Action.TYPE.value:
parsed_action["text"] = action["text"]
return parsed_action
parsed_actions.append(parsed_action)
return parsed_actions
if __name__ == '__main__':
@@ -142,4 +160,3 @@ if __name__ == '__main__':
agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
print(agent.predict(obs="stackoverflow.png"))

View File

@@ -31,25 +31,24 @@ for example, format as:
"click_type": "LEFT"
}
```
- For [KEY, KEY_DOWN, KEY_UP, TYPE], you need to choose a(multiple) key(s) from the keyboard
- For [KEY, KEY_DOWN, KEY_UP], you need to choose a(multiple) key(s) from the keyboard
for example, format as:
```
{
"action_type": "KEY",
"key": "ctrl+c"
}
```
- For TYPE, you need to specify the text you want to type
for example, format as:
```
{
"action_type": "TYPE",
"text": [
"w",
"i",
"k",
"i",
"p",
"e",
"d",
"i",
"a"
]
"text": "hello world"
}
```
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. You MUST wrap the dict with backticks (\`).
You can predict multiple actions at one step, but you should only return one action for each step.
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
"""