Add 'WAIT', 'FAIL', 'DONE' to the action space; Debug basic prompting-based GPT-4 and Gemini agents; Initialize experiments script;
This commit is contained in:
@@ -1,12 +1,12 @@
|
||||
# fixme: Need to be rewrite on new action space
|
||||
|
||||
import os
|
||||
import re
|
||||
import base64
|
||||
from desktop_env.envs.desktop_env import Action, MouseClick
|
||||
import json
|
||||
import re
|
||||
from typing import Dict
|
||||
|
||||
import requests
|
||||
from mm_agents.gpt_4v_prompt import SYS_PROMPT
|
||||
|
||||
from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
|
||||
from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
|
||||
|
||||
|
||||
# Function to encode the image
|
||||
@@ -47,11 +47,26 @@ def parse_actions_from_string(input_string):
|
||||
raise ValueError("Invalid response format: " + input_string)
|
||||
|
||||
|
||||
def parse_code_from_string(input_string):
|
||||
# This regular expression will match both ```code``` and ```python code```
|
||||
# and capture the `code` part. It uses a non-greedy match for the content inside.
|
||||
pattern = r"```(?:\w+\s+)?(.*?)```"
|
||||
# Find all non-overlapping matches in the string
|
||||
matches = re.findall(pattern, input_string, re.DOTALL)
|
||||
|
||||
# The regex above captures the content inside the triple backticks.
|
||||
# The `re.DOTALL` flag allows the dot `.` to match newline characters as well,
|
||||
# so the code inside backticks can span multiple lines.
|
||||
|
||||
# matches now contains all the captured code snippets
|
||||
return matches
|
||||
|
||||
|
||||
class GPT4v_Agent:
|
||||
def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300):
|
||||
self.instruction = instruction
|
||||
def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
|
||||
self.model = model
|
||||
self.max_tokens = max_tokens
|
||||
self.action_space = action_space
|
||||
|
||||
self.headers = {
|
||||
"Content-Type": "application/json",
|
||||
@@ -64,20 +79,27 @@ class GPT4v_Agent:
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": SYS_PROMPT
|
||||
"text": {
|
||||
"computer_13": SYS_PROMPT_ACTION,
|
||||
"pyautogui": SYS_PROMPT_CODE
|
||||
}[action_space]
|
||||
},
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
def predict(self, obs):
|
||||
base64_image = encode_image(obs)
|
||||
def predict(self, obs: Dict):
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
"""
|
||||
base64_image = encode_image(obs["screenshot"])
|
||||
self.trajectory.append({
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "What's the next step for instruction '{}'?".format(self.instruction)
|
||||
"text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format(
|
||||
obs["instruction"])
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
@@ -87,12 +109,15 @@ class GPT4v_Agent:
|
||||
}
|
||||
]
|
||||
})
|
||||
|
||||
traj_to_show = []
|
||||
for i in range(len(self.trajectory)):
|
||||
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
|
||||
if len(self.trajectory[i]["content"]) > 1:
|
||||
traj_to_show.append("screenshot_obs")
|
||||
|
||||
print("Trajectory:", traj_to_show)
|
||||
|
||||
payload = {
|
||||
"model": self.model,
|
||||
"messages": self.trajectory,
|
||||
@@ -103,6 +128,7 @@ class GPT4v_Agent:
|
||||
try:
|
||||
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
|
||||
except:
|
||||
# todo: add error handling
|
||||
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
|
||||
actions = None
|
||||
|
||||
@@ -120,7 +146,10 @@ class GPT4v_Agent:
|
||||
"""
|
||||
|
||||
# parse from the response
|
||||
actions = parse_actions_from_string(response)
|
||||
if self.action_space == "computer_13":
|
||||
actions = parse_actions_from_string(response)
|
||||
elif self.action_space == "pyautogui":
|
||||
actions = parse_code_from_string(response)
|
||||
|
||||
# add action into the trajectory
|
||||
self.trajectory.append({
|
||||
@@ -133,34 +162,4 @@ class GPT4v_Agent:
|
||||
]
|
||||
})
|
||||
|
||||
# parse action
|
||||
parsed_actions = []
|
||||
for action in actions:
|
||||
parsed_action = {}
|
||||
action_type = Action[action['action_type']].value
|
||||
parsed_action["action_type"] = action_type
|
||||
|
||||
if action_type == Action.CLICK.value or action_type == Action.MOUSE_DOWN.value or action_type == Action.MOUSE_UP.value:
|
||||
parsed_action["click_type"] = MouseClick[action['click_type']].value
|
||||
|
||||
if action_type == Action.MOUSE_MOVE.value:
|
||||
parsed_action["x"] = action["x"]
|
||||
parsed_action["y"] = action["y"]
|
||||
|
||||
if action_type == Action.KEY.value:
|
||||
parsed_action["key"] = action["key"] # handle the condition of single key and multiple keys
|
||||
|
||||
if action_type == Action.TYPE.value:
|
||||
parsed_action["text"] = action["text"]
|
||||
|
||||
parsed_actions.append(parsed_action)
|
||||
|
||||
return parsed_actions
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
# OpenAI API Key
|
||||
api_key = os.environ.get("OPENAI_API_KEY")
|
||||
|
||||
agent = GPT4v_Agent(api_key=api_key, instruction="Open Google Sheet")
|
||||
print(agent.predict(obs="stackoverflow.png"))
|
||||
return actions
|
||||
|
||||
Reference in New Issue
Block a user