Add a raw accessibility-tree-based prompting method (but the token counts are too large); minor fixes for some small bugs
This commit is contained in:
110
mm_agents/gemini_pro_agent.py
Normal file
110
mm_agents/gemini_pro_agent.py
Normal file
@@ -0,0 +1,110 @@
|
||||
from typing import Dict, List
|
||||
|
||||
import PIL.Image
|
||||
import google.generativeai as genai
|
||||
|
||||
from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
|
||||
from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
|
||||
from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
|
||||
|
||||
|
||||
class GeminiPro_Agent:
    """Agent that prompts Gemini Pro with the raw accessibility tree of the
    desktop and parses the model's reply into executable actions.

    The conversation history is kept in ``self.trajectory`` as a list of
    ``{"role": ..., "parts": [...]}`` dicts. Because the Gemini API used here
    does not support multi-round chat for this flow, the whole trajectory is
    flattened into a single ChatML-style user message before every request.
    """

    def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0,
                 action_space="computer_13"):
        """Configure the Gemini client and seed the trajectory with the
        system prompt for the chosen action space.

        Args:
            api_key: Google Generative AI API key.
            instruction: Natural-language task instruction for the agent.
            model: Gemini model name to instantiate.
            max_tokens: Cap on tokens generated per request.
            temperature: Sampling temperature (0.0 = deterministic).
            action_space: "computer_13" (discrete actions) or "pyautogui" (code).

        Raises:
            KeyError: If ``action_space`` is not one of the supported values
                (same failure mode as the original dict lookup, but it now
                happens before any state is stored).
        """
        genai.configure(api_key=api_key)
        # Resolve the system prompt first so an invalid action_space raises
        # immediately rather than after the client has been configured.
        sys_prompt = {
            "computer_13": SYS_PROMPT_ACTION,
            "pyautogui": SYS_PROMPT_CODE
        }[action_space]

        self.instruction = instruction
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.action_space = action_space

        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    sys_prompt + "\nHere is the instruction for the task: {}".format(self.instruction)
                ]
            }
        ]

    def predict(self, obs: Dict) -> List:
        """
        Predict the next action(s) based on the current observation.

        Only supports single-round conversation: the latest accessibility
        tree (``obs["accessibility_tree"]``) is appended as the newest user
        turn, the full trajectory is flattened into one prompt, and the
        model's reply is parsed into a list of actions.

        Returns an empty list when the response cannot be read or parsed;
        in that case the just-appended user turn is removed so the
        user/assistant alternation of the trajectory stays intact for the
        next call (previously the dangling user turn broke the ``i % 2``
        role assignment on every later request).
        """
        accessibility_tree = obs["accessibility_tree"]
        self.trajectory.append({
            "role": "user",
            "parts": ["Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(accessibility_tree)]
        })

        # todo: Remove this step once the Gemini supports multi-round conversation
        # Flatten the trajectory into one ChatML-style string: index 0 is the
        # system prompt, then user/assistant turns alternate.
        all_message_str = ""
        for i, turn in enumerate(self.trajectory):
            if i == 0:
                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
            elif i % 2 == 1:
                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
            else:
                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"

            all_message_str += all_message_template.format(turn["parts"][0])

        print("All message: >>>>>>>>>>>>>>>> ")
        print(
            all_message_str
        )

        message_for_gemini = {
            "role": "user",
            "parts": [all_message_str]
        }

        # Debug view of the trajectory; extra (image) parts are shown as a
        # placeholder instead of dumping raw image data.
        traj_to_show = []
        for turn in self.trajectory:
            traj_to_show.append(turn["parts"][0])
            if len(turn["parts"]) > 1:
                traj_to_show.append("screenshot_obs")

        print("Trajectory:", traj_to_show)

        response = self.model.generate_content(
            message_for_gemini,
            generation_config={
                "max_output_tokens": self.max_tokens,
                "temperature": self.temperature
            }
        )

        try:
            # response.text raises (e.g. ValueError) when the candidate was
            # blocked or empty; narrow the handler from the previous bare
            # `except:` so KeyboardInterrupt/SystemExit still propagate.
            response_text = response.text
        except Exception as e:
            print("Failed to read response text:", e)
            self.trajectory.pop()  # drop the dangling user turn
            return []

        try:
            actions = self.parse_actions(response_text)
        except Exception:
            print("Failed to parse action from response:", response_text)
            # parse_actions() records the assistant turn only on success, so
            # remove the unanswered user turn to keep roles alternating.
            self.trajectory.pop()
            actions = []

        return actions

    def parse_actions(self, response: str):
        """Parse model output into actions and record the assistant turn.

        Args:
            response: Raw text returned by the model.

        Returns:
            A list of parsed actions (discrete action dicts for
            "computer_13", code strings for "pyautogui").

        Raises:
            ValueError: If ``self.action_space`` is not supported.
        """
        # parse from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            raise ValueError("Invalid action space: " + self.action_space)

        # add action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })

        return actions
||||
Reference in New Issue
Block a user