Add Gemini agent implementation; add missing requirements; fix some minor bugs

This commit is contained in:
Timothyxxx
2024-01-15 21:58:33 +08:00
parent c68796e842
commit 493b719821
10 changed files with 82 additions and 83 deletions

View File

@@ -231,8 +231,11 @@ class DesktopEnv(gym.Env):
# the set of all possible actions defined in the action representation # the set of all possible actions defined in the action representation
self.controller.execute_action(action) self.controller.execute_action(action)
elif self.action_space == "pyautogui": elif self.action_space == "pyautogui":
# the set of all possible python commands insides `pyautogui` if action in ['WAIT', 'FAIL', 'DONE']:
self.controller.execute_python_command(action) self.controller.execute_action(action)
else:
# the set of all possible python commands insides `pyautogui`
self.controller.execute_python_command(action)
observation = { observation = {
"screenshot": self._get_obs(), "screenshot": self._get_obs(),

View File

@@ -6,6 +6,7 @@ import sys
from desktop_env.envs.desktop_env import DesktopEnv from desktop_env.envs.desktop_env import DesktopEnv
from mm_agents.gpt_4v_agent import GPT4v_Agent from mm_agents.gpt_4v_agent import GPT4v_Agent
from mm_agents.gemini_agent import GeminiPro_Agent
# Logger Configs {{{ # # Logger Configs {{{ #
logger = logging.getLogger() logger = logging.getLogger()
@@ -44,7 +45,7 @@ logger = logging.getLogger("desktopenv.experiment")
PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_trajectory", recording=True): def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
env = DesktopEnv( env = DesktopEnv(
path_to_vm=PATH_TO_VM, path_to_vm=PATH_TO_VM,
@@ -53,7 +54,6 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra
) )
# reset the environment to certain snapshot # reset the environment to certain snapshot
observation = env.reset() observation = env.reset()
observation['instruction'] = example['instruction']
done = False done = False
step_num = 0 step_num = 0
@@ -63,17 +63,14 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra
while not done and step_num < max_steps: while not done and step_num < max_steps:
actions = agent.predict(observation) actions = agent.predict(observation)
step_num += 1
for action in actions: for action in actions:
step_num += 1
# Capture the timestamp before executing the action # Capture the timestamp before executing the action
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S") action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
logger.info("Step %d: %s", step_num, action)
observation, reward, done, info = env.step(action) observation, reward, done, info = env.step(action)
observation['instruction'] = example['instruction']
# Logging
logger.info("Step %d: %s", step_num, action)
logger.info("Reward: %.2f", reward) logger.info("Reward: %.2f", reward)
logger.info("Done: %s", done) logger.info("Done: %s", done)
logger.info("Info: %s", info) logger.info("Info: %s", info)
@@ -114,19 +111,22 @@ def run_one_example(example, agent, max_steps=2, example_trajectory_dir="exp_tra
if __name__ == "__main__": if __name__ == "__main__":
action_space = "pyautogui" action_space = "pyautogui"
example_class = "vlc" example_class = "thunderbird"
example_id = "8f080098-ddb1-424c-b438-4e96e5e4786e" example_id = "bb5e4c0d-f964-439c-97b6-bdb9747de3f4"
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
example = json.load(f) example = json.load(f)
example["snapshot"] = "exp_setup" example["snapshot"] = "exp_setup2"
api_key = os.environ.get("OPENAI_API_KEY") # api_key = os.environ.get("OPENAI_API_KEY")
agent = GPT4v_Agent(api_key=api_key, action_space=action_space) # agent = GPT4v_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
api_key = os.environ.get("GENAI_API_KEY")
agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
root_trajectory_dir = "exp_trajectory" root_trajectory_dir = "exp_trajectory"
example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
os.makedirs(example_trajectory_dir, exist_ok=True) os.makedirs(example_trajectory_dir, exist_ok=True)
run_one_example(example, agent, 2, example_trajectory_dir) run_one_example(example, agent, 10, example_trajectory_dir)

View File

@@ -1,20 +0,0 @@
# Standalone demo script: loads the "adept/fuyu-8b" checkpoint, feeds it one
# local image plus a short text prompt, and prints the decoded generation.
# NOTE(review): presumably used to produce a textual description of a
# screenshot — confirm intended use; the script hard-codes both the image path
# and the "cuda:0" device.
from transformers import FuyuProcessor, FuyuForCausalLM
from PIL import Image
# Open the input image and convert it to RGB mode before handing it to the processor.
image = Image.open("stackoverflow.png").convert("RGB")
# load model and processor
model_id = "adept/fuyu-8b"
processor = FuyuProcessor.from_pretrained(model_id)
model = FuyuForCausalLM.from_pretrained(model_id, device_map="cuda:0")
# prepare inputs for the model
# The processor combines the text prompt and the image; the result is moved to
# the same device the model was placed on.
text_prompt = "Description:\n"
inputs = processor(text=text_prompt, images=image, return_tensors="pt").to("cuda:0")
# autoregressively generate text
generation_output = model.generate(**inputs, max_new_tokens=100)
# Decode only the last 100 positions of each output row, matching
# max_new_tokens above.
# NOTE(review): presumably this is meant to drop the prompt/image tokens and
# keep just the newly generated text — confirm against the shape of the output.
generation_text = processor.batch_decode(generation_output[:, -100:], skip_special_tokens=True)
print(generation_text)

View File

@@ -1,4 +1,4 @@
from typing import Dict from typing import Dict, List
import PIL.Image import PIL.Image
import google.generativeai as genai import google.generativeai as genai
@@ -9,10 +9,13 @@ from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
class GeminiPro_Agent: class GeminiPro_Agent:
def __init__(self, api_key, model='gemini-pro-vision', max_tokens=300, action_space="computer_13"): def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0,
genai.configure(api_key) action_space="computer_13"):
genai.configure(api_key=api_key)
self.instruction = instruction
self.model = genai.GenerativeModel(model) self.model = genai.GenerativeModel(model)
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.temperature = temperature
self.action_space = action_space self.action_space = action_space
self.trajectory = [ self.trajectory = [
@@ -22,22 +25,39 @@ class GeminiPro_Agent:
{ {
"computer_13": SYS_PROMPT_ACTION, "computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE "pyautogui": SYS_PROMPT_CODE
}[action_space] }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
] ]
} }
] ]
def predict(self, obs: Dict): def predict(self, obs: Dict) -> List:
""" """
Predict the next action(s) based on the current observation. Predict the next action(s) based on the current observation.
Only support single-round conversation, only fill-in the last desktop screenshot.
""" """
img = PIL.Image.open(obs["screenshot"]) img = PIL.Image.open(obs["screenshot"])
self.trajectory.append({ self.trajectory.append({
"role": "user", "role": "user",
"parts": ["To accomplish the task '{}' and given the current screenshot, what's the next step?".format( "parts": ["What's the next step that you will do to help with the task?", img]
obs["instruction"]), img]
}) })
# todo: Remove this step once the Gemini supports multi-round conversation
all_message_str = ""
for i in range(len(self.trajectory)):
if i == 0:
all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
elif i % 2 == 1:
all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
else:
all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])
message_for_gemini = {
"role": "user",
"parts": [all_message_str, img]
}
traj_to_show = [] traj_to_show = []
for i in range(len(self.trajectory)): for i in range(len(self.trajectory)):
traj_to_show.append(self.trajectory[i]["parts"][0]) traj_to_show.append(self.trajectory[i]["parts"][0])
@@ -46,29 +66,28 @@ class GeminiPro_Agent:
print("Trajectory:", traj_to_show) print("Trajectory:", traj_to_show)
response = self.model.generate_content(self.trajectory, max_tokens=self.max_tokens) response = self.model.generate_content(
message_for_gemini,
generation_config={
"max_output_tokens": self.max_tokens,
"temperature": self.temperature
}
)
try: try:
# fixme: change to fit the new response format from gemini pro response_text = response.text
actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
except: except:
# todo: add error handling return []
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None try:
actions = self.parse_actions(response_text)
except:
print("Failed to parse action from response:", response_text)
actions = []
return actions return actions
def parse_actions(self, response: str): def parse_actions(self, response: str):
# response example
"""
```json
{
"action_type": "CLICK",
"click_type": "RIGHT"
}
```
"""
# parse from the response # parse from the response
if self.action_space == "computer_13": if self.action_space == "computer_13":
actions = parse_actions_from_string(response) actions = parse_actions_from_string(response)

View File

View File

@@ -1,7 +1,7 @@
import base64 import base64
import json import json
import re import re
from typing import Dict from typing import Dict, List
import requests import requests
@@ -63,7 +63,8 @@ def parse_code_from_string(input_string):
class GPT4v_Agent: class GPT4v_Agent:
def __init__(self, api_key, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"): def __init__(self, api_key, instruction, model="gpt-4-vision-preview", max_tokens=300, action_space="computer_13"):
self.instruction = instruction
self.model = model self.model = model
self.max_tokens = max_tokens self.max_tokens = max_tokens
self.action_space = action_space self.action_space = action_space
@@ -82,13 +83,13 @@ class GPT4v_Agent:
"text": { "text": {
"computer_13": SYS_PROMPT_ACTION, "computer_13": SYS_PROMPT_ACTION,
"pyautogui": SYS_PROMPT_CODE "pyautogui": SYS_PROMPT_CODE
}[action_space] }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
}, },
] ]
} }
] ]
def predict(self, obs: Dict): def predict(self, obs: Dict) -> List:
""" """
Predict the next action(s) based on the current observation. Predict the next action(s) based on the current observation.
""" """
@@ -98,8 +99,7 @@ class GPT4v_Agent:
"content": [ "content": [
{ {
"type": "text", "type": "text",
"text": "To accomplish the task '{}' and given the current screenshot, what's the next step?".format( "text": "What's the next step that you will do to help with the task?"
obs["instruction"])
}, },
{ {
"type": "image_url", "type": "image_url",
@@ -128,23 +128,12 @@ class GPT4v_Agent:
try: try:
actions = self.parse_actions(response.json()['choices'][0]['message']['content']) actions = self.parse_actions(response.json()['choices'][0]['message']['content'])
except: except:
# todo: add error handling print("Failed to parse action from response:", response.json())
print("Failed to parse action from response:", response.json()['choices'][0]['message']['content'])
actions = None actions = None
return actions return actions
def parse_actions(self, response: str): def parse_actions(self, response: str):
# response example
"""
```json
{
"action_type": "CLICK",
"click_type": "RIGHT"
}
```
"""
# parse from the response # parse from the response
if self.action_space == "computer_13": if self.action_space == "computer_13":
actions = parse_actions_from_string(response) actions = parse_actions_from_string(response)

View File

View File

@@ -237,7 +237,7 @@ for example, format as:
``` ```
REMEMBER: REMEMBER:
For every step, you should only return the action_type and the parameters of your action as a dict, without any other things. For every step, you should only RETURN ME THE action_type AND parameters I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
You MUST wrap the dict with backticks (\`). You MUST wrap the dict with backticks (\`).
You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty. You MUST choose and ONLY CHOOSE from the action space above, otherwise your action will be considered as invalid and you will get a penalty.
You CAN predict multiple actions at one step, but you should only return one action for each step. You CAN predict multiple actions at one step, but you should only return one action for each step.

View File

@@ -1,11 +1,17 @@
SYS_PROMPT = """ SYS_PROMPT = """
You will act as an agent which follow my instruction and perform desktop computer tasks as instructed. You must have good knowledge of computer and good internet connection. You are an agent which follow my instruction and perform desktop computer tasks as instructed.
For each step, you will get an observation of an image, which is the screenshot of the computer screen. And you will predict the action of the computer based on the image. You have good knowledge of computer and good internet connection and assume your code will run on a computer for controlling the mouse and keyboard.
For each step, you will get an observation of an image, which is the screenshot of the computer screen and you will predict the action of the computer based on the image.
You are required to use `pyautogui` to perform the action. You are required to use `pyautogui` to perform the action.
Return one line or multiple lines of python code to perform the action each time, be time efficient. Return one line or multiple lines of python code to perform the action each time, be time efficient.
You ONLY need to return the code inside a code block, like this:
```python
# your code here
```
Specially, it is also allowed to return the following special code:
When you think you have to wait for some time, return ```WAIT```;
When you think the task can not be done, return ```FAIL```;
When you think the task is done, return ```DONE```.
When you think you have to wait for some time, return `WAIT`. First give the current screenshot and previous things we did a reflection, then RETURN ME THE CODE OR SPECIAL CODE I ASKED FOR. NEVER EVER RETURN ME ANYTHING ELSE.
When you think the task can not be done, return `FAIL`.
When you think the task is done, return `DONE`.
""" """

View File

@@ -30,3 +30,5 @@ ImageHash
scikit-image scikit-image
librosa librosa
pymupdf pymupdf
chardet
playwright