Merge branch 'main' into zdy

This commit is contained in:
David Chang
2024-02-01 19:08:25 +08:00
20 changed files with 198 additions and 314 deletions

View File

@@ -30,7 +30,7 @@ def _execute_command(command: List[str]) -> None:
p = subprocess.Popen(command)
p.wait()
else:
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True, encoding="utf-8")
if result.returncode != 0:
raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
return result.stdout
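The hunk above pins the output decoding: with `text=True` alone, `subprocess.run` decodes using the locale's preferred encoding, which can garble non-ASCII output on VMs not configured for UTF-8. A minimal runnable sketch of the same pattern (the sample command is illustrative):

```python
import subprocess

def run_checked(command, timeout=60):
    # Capture stdout/stderr and decode explicitly as UTF-8, independent of locale.
    result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
                            timeout=timeout, text=True, encoding="utf-8")
    if result.returncode != 0:
        # \033[91m ... \033[0m renders the combined output in red on ANSI terminals
        raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
    return result.stdout

print(run_checked(["echo", "héllo"]))  # hypothetical usage
```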

View File

@@ -328,6 +328,9 @@ def check_structure_sim(src_path, tgt_path):
Check if the structures of the two images are similar
gimp:2a729ded-3296-423d-aec4-7dd55ed5fbb3
"""
if src_path is None or tgt_path is None:
return 0.
img_src = Image.open(src_path)
img_tgt = Image.open(tgt_path)
structure_same = structure_check_by_ssim(img_src, img_tgt)
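For context, the guard added above short-circuits to a zero score when either image path is missing, and only then opens both images. The helper `structure_check_by_ssim` is not shown in this diff; a hedged sketch of what such a check might look like, assuming scikit-image's SSIM and an arbitrary cutoff:

```python
# Hypothetical sketch only: the real structure_check_by_ssim is defined elsewhere.
import numpy as np
from PIL import Image
from skimage.metrics import structural_similarity as ssim

def structure_check_by_ssim(img_src: Image.Image, img_tgt: Image.Image,
                            threshold: float = 0.9) -> bool:
    # Compare in grayscale and at a common size so SSIM measures structure, not color.
    img_tgt = img_tgt.resize(img_src.size)
    a = np.asarray(img_src.convert("L"))
    b = np.asarray(img_tgt.convert("L"))
    return ssim(a, b) >= threshold  # 0.9 is an assumed cutoff
```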

View File

@@ -1,7 +1,7 @@
{
"id": "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
"snapshot": "libreoffice_calc",
"instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a which space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
"instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a white space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
"source": "https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "7a4e4bc8-922c-4c84-865c-25ba34136be1",
"snapshot": "libreoffice_calc",
"instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
"instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
"source": "https://www.youtube.com/shorts/bvUhr1AHs44",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "21760ecb-8f62-40d2-8d85-0cee5725cb72",
"snapshot": "libreoffice_impress",
"instruction": "Could you help me add silde transition \"dissolve\" to my first page?",
"instruction": "Could you help me add slide transition \"dissolve\" to my first page?",
"source": "https://www.libreofficehelp.com/add-animations-transitions-libreoffice-impress-slides/",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "5d901039-a89c-4bfb-967b-bf66f4df075e",
"snapshot": "libreoffice_impress",
"instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.",
"instruction": "I want to turn the rectangular image of Columbus on the first page into a cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image?",
"source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag",
"config": [
{

View File

@@ -62,7 +62,7 @@
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842",
"dest": "MLA_Workshop_061X_Works_Cited_Gold.docx"
"dest": "MLA_Workshop_061X_Works_Cited_Gold.pptx"
},
"result": {
"type": "vm_file",

View File

@@ -2,7 +2,7 @@
"id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
"snapshot": "libreoffice_impress",
"instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide use Impress \"Summary Slide\" feature. Could you make that for me?",
"source": "https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.",
"source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom",
"config": [
{
"type": "download",
@@ -62,7 +62,7 @@
"expected": {
"type": "cloud_file",
"path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474",
"dest": "Forests_Gold.docx"
"dest": "Forests_Gold.pptx"
},
"result": {
"type": "vm_file",

View File

@@ -27,7 +27,7 @@
"command": [
"python",
"-c",
"import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=40, interval=0.1); time.sleep(1); pyautogui.scroll(-2)"
"import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=40, interval=10); time.sleep(1); pyautogui.scroll(-2)"
]
}
}

View File

@@ -38,7 +38,7 @@
"command": [
"python",
"-c",
"import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=8); time.sleep(1); pyautogui.scroll(-2)"
"import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=3); time.sleep(1); pyautogui.scroll(-2)"
]
}
}
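Both timing hunks above follow the same recipe: a longer initial sleep so the window has focus before input starts, and a non-zero `interval` (in seconds) between key presses so each press registers. The same setup as a plain script:

```python
import time
import pyautogui

time.sleep(5)                                    # wait for the target window to gain focus
pyautogui.press("down", presses=8, interval=3)   # one Down keypress every 3 seconds
time.sleep(1)
pyautogui.scroll(-2)                             # negative clicks scroll down
```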

View File

@@ -68,7 +68,7 @@
"parameters": {
"command": [
"tar",
"-xzv",
"-xz",
"--recursive-unlink",
"-f",
"/home/user/thunderbird-profile.tar.gz",
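The flag change above only drops `-v` (verbose file listing) from the extraction; `-x` (extract), `-z` (gunzip), and GNU tar's `--recursive-unlink` (remove existing directory contents before extracting over them) are unchanged. Expressed in Python, the config entry is equivalent to:

```python
import subprocess

# Quiet extraction of the Thunderbird profile archive, replacing any
# existing directory contents first (GNU tar's --recursive-unlink).
subprocess.run(
    ["tar", "-xz", "--recursive-unlink",
     "-f", "/home/user/thunderbird-profile.tar.gz",
     "-C", "/home/user/"],
    check=True,
)
```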

View File

@@ -1,7 +1,7 @@
{
"id": "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
"snapshot": "chrome",
"instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store it in the forms/ folder in my Google Drive.",
"instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store the PDF in the forms/ folder in my Google Drive.",
"source": "https://marketplace.uipath.com/listings/convert-word-file-to-pdf-and-store-in-onedrive",
"config": [
{

View File

@@ -68,13 +68,13 @@
"type": "execute",
"parameters": {
"command": [
"tar",
"-xzv",
"--recursive-unlink",
"-f",
"/home/user/thunderbird-profile.tar.gz",
"-C",
"/home/user/"
"tar",
"-xz",
"--recursive-unlink",
"-f",
"/home/user/thunderbird-profile.tar.gz",
"-C",
"/home/user/"
]
}
},

View File

@@ -1,7 +1,7 @@
{
"id": "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
"snapshot": "os",
"instruction": "Can you move the file with the path 'todo.txt' on the Desktop to the directory with the path 'done' on the Desktop?",
"instruction": "Can you move the file 'todo.txt' on the Desktop to the directory 'done/' on the Desktop?",
"source": "https://ubuntu.com/tutorials/command-line-for-beginners#5-moving-and-manipulating-files",
"config": [
{

View File

@@ -1,7 +1,7 @@
{
"id": "bedcedc4-4d72-425e-ad62-21960b11fe0d",
"snapshot": "os",
"instruction": "Could you set the 'Dim screen when inactive' to on in setting?",
"instruction": "Could you set the 'Dim screen when inactive' to off in setting?",
"source": "https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s",
"trajectory": "trajectories/",
"config": [

View File

@@ -1 +1 @@
{"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-01-31T14:41:25Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
{"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-02-01T08:29:08Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}

View File

@@ -1,136 +0,0 @@
# todo: needs to be refactored
import time
from typing import Dict, List

import google.generativeai as genai

from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string


class GeminiPro_Agent:
    def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0,
                 action_space="computer_13"):
        genai.configure(api_key=api_key)
        self.instruction = instruction
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.action_space = action_space
        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    {
                        "computer_13": SYS_PROMPT_ACTION,
                        "pyautogui": SYS_PROMPT_CODE
                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
                ]
            }
        ]

    def predict(self, obs: Dict) -> List:
        """
        Predict the next action(s) based on the current observation.
        Only single-round conversation is supported; only the last desktop observation is filled in.
        """
        accessibility_tree = obs["accessibility_tree"]
        leaf_nodes = find_leaf_nodes(accessibility_tree)
        filtered_nodes = filter_nodes(leaf_nodes)

        linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
        # Linearize the accessibility tree nodes into a table format
        for node in filtered_nodes:
            linearized_accessibility_tree += node.tag + "\t"
            linearized_accessibility_tree += node.attrib.get('name') + "\t"
            linearized_accessibility_tree += node.attrib.get(
                '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
            linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"

        self.trajectory.append({
            "role": "user",
            "parts": [
                "Given the XML format of the accessibility tree (converted and formatted into a table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
                    linearized_accessibility_tree)]
        })

        # todo: Remove this step once Gemini supports multi-round conversation
        all_message_str = ""
        for i in range(len(self.trajectory)):
            if i == 0:
                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
            elif i % 2 == 1:
                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
            else:
                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])

        print("All message: >>>>>>>>>>>>>>>> ")
        print(all_message_str)

        message_for_gemini = {
            "role": "user",
            "parts": [all_message_str]
        }

        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["parts"][0])
            if len(self.trajectory[i]["parts"]) > 1:
                traj_to_show.append("screenshot_obs")
        print("Trajectory:", traj_to_show)

        while True:
            try:
                response = self.model.generate_content(
                    message_for_gemini,
                    generation_config={
                        "max_output_tokens": self.max_tokens,
                        "temperature": self.temperature
                    }
                )
                break
            except:
                print("Failed to generate response, retrying...")
                time.sleep(5)

        try:
            response_text = response.text
        except:
            return []

        try:
            actions = self.parse_actions(response_text)
        except:
            print("Failed to parse action from response:", response_text)
            actions = []

        return actions

    def parse_actions(self, response: str):
        # parse actions from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            raise ValueError("Invalid action space: " + self.action_space)

        # add the action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })

        return actions

View File

@@ -1,115 +0,0 @@
# todo: needs to be refactored
import time
from typing import Dict, List

import PIL.Image
import google.generativeai as genai

from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE


class GeminiProV_Agent:
    def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0,
                 action_space="computer_13"):
        genai.configure(api_key=api_key)
        self.instruction = instruction
        self.model = genai.GenerativeModel(model)
        self.max_tokens = max_tokens
        self.temperature = temperature
        self.action_space = action_space
        self.trajectory = [
            {
                "role": "system",
                "parts": [
                    {
                        "computer_13": SYS_PROMPT_ACTION,
                        "pyautogui": SYS_PROMPT_CODE
                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
                ]
            }
        ]

    def predict(self, obs: Dict) -> List:
        """
        Predict the next action(s) based on the current observation.
        Only single-round conversation is supported; only the last desktop screenshot is filled in.
        """
        img = PIL.Image.open(obs["screenshot"])
        self.trajectory.append({
            "role": "user",
            "parts": ["What's the next step that you will do to help with the task?", img]
        })

        # todo: Remove this step once Gemini supports multi-round conversation
        all_message_str = ""
        for i in range(len(self.trajectory)):
            if i == 0:
                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
            elif i % 2 == 1:
                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
            else:
                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])

        message_for_gemini = {
            "role": "user",
            "parts": [all_message_str, img]
        }

        traj_to_show = []
        for i in range(len(self.trajectory)):
            traj_to_show.append(self.trajectory[i]["parts"][0])
            if len(self.trajectory[i]["parts"]) > 1:
                traj_to_show.append("screenshot_obs")
        print("Trajectory:", traj_to_show)

        while True:
            try:
                response = self.model.generate_content(
                    message_for_gemini,
                    generation_config={
                        "max_output_tokens": self.max_tokens,
                        "temperature": self.temperature
                    }
                )
                break
            except:
                print("Failed to generate response, retrying...")
                time.sleep(5)

        try:
            response_text = response.text
        except:
            return []

        try:
            actions = self.parse_actions(response_text)
        except:
            print("Failed to parse action from response:", response_text)
            actions = []

        return actions

    def parse_actions(self, response: str):
        # parse actions from the response
        if self.action_space == "computer_13":
            actions = parse_actions_from_string(response)
        elif self.action_space == "pyautogui":
            actions = parse_code_from_string(response)
        else:
            raise ValueError("Invalid action space: " + self.action_space)

        # add the action into the trajectory
        self.trajectory.append({
            "role": "assistant",
            "parts": [response]
        })

        return actions

View File

@@ -1,12 +1,20 @@
import base64
import json
import logging
import os
import re
import time
import uuid
from http import HTTPStatus
from io import BytesIO
from typing import Dict, List
import backoff
import dashscope
import google.generativeai as genai
import openai
import requests
from PIL import Image
from openai.error import (
APIConnectionError,
APIError,
@@ -44,11 +52,13 @@ def linearize_accessibility_tree(accessibility_tree):
linearized_accessibility_tree += node.tag + "\t"
linearized_accessibility_tree += node.attrib.get('name') + "\t"
if node.text:
linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t"
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper")\
linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
node.text.replace('"', '""'))) + "\t"
elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(text.replace('"', '""'))) + "\t"
linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
text.replace('"', '""'))) + "\t"
else:
linearized_accessibility_tree += '""\t'
linearized_accessibility_tree += node.attrib.get(
@@ -140,16 +150,21 @@ def parse_code_from_string(input_string):
def parse_code_from_som_string(input_string, masks):
# parse the output string by masks
mappings = []
tag_vars = ""
for i, mask in enumerate(masks):
x, y, w, h = mask
mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2))))
# reverse the mappings
for mapping in mappings[::-1]:
input_string = input_string.replace(mapping[0], mapping[1])
tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
tag_vars += "\n"
actions = parse_code_from_string(input_string)
for i, action in enumerate(actions):
if action.strip() in ['WAIT', 'DONE', 'FAIL']:
pass
else:
action = tag_vars + action
actions[i] = action
return actions
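Instead of substituting raw coordinates into the model's code, the rewritten parser now prepends `tag_N = (x, y)` definitions to every non-`WAIT`/`DONE`/`FAIL` action, so generated calls such as `pyautogui.click(tag_2)` resolve when the action string is executed (matching the `tag#N` to `tag_N` prompt change at the end of this diff). A small sketch of the scheme with made-up masks:

```python
# Each mask's center becomes a tag_N variable prepended to the action string.
masks = [(100, 200, 50, 20), (300, 400, 80, 30)]  # hypothetical (x, y, w, h) boxes

tag_vars = ""
for i, (x, y, w, h) in enumerate(masks):
    tag_vars += "tag_{}=({}, {})\n".format(i + 1, int(x + w // 2), int(y + h // 2))

action = "print(tag_2)"   # stand-in for a generated pyautogui call
exec(tag_vars + action)   # prints (340, 415)
```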
@@ -295,7 +310,7 @@ class GPT4v_Agent:
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{_screenshot}",
"url": f"data:image/png;base64,{_screenshot}",
"detail": "high"
}
}
@@ -314,7 +329,7 @@ class GPT4v_Agent:
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{_screenshot}",
"url": f"data:image/png;base64,{_screenshot}",
"detail": "high"
}
}
@@ -375,7 +390,7 @@ class GPT4v_Agent:
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"url": f"data:image/png;base64,{base64_image}",
"detail": "high"
}
}
@@ -421,7 +436,7 @@ class GPT4v_Agent:
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"url": f"data:image/png;base64,{base64_image}",
"detail": "high"
}
}
@@ -448,7 +463,7 @@ class GPT4v_Agent:
{
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{base64_image}",
"url": f"data:image/png;base64,{base64_image}",
"detail": "high"
}
}
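The four `image_url` hunks above fix a mislabeled MIME type: the screenshots are PNG bytes, so the data URI should declare `image/png` rather than `image/jpeg` (many endpoints sniff the real format, but the label should match the bytes). A sketch of how such a payload entry is built (the path is hypothetical):

```python
import base64

# Encode a PNG screenshot as a data URI whose MIME type matches the bytes.
with open("screenshot.png", "rb") as f:  # hypothetical path
    b64 = base64.b64encode(f.read()).decode("ascii")

image_part = {
    "type": "image_url",
    "image_url": {"url": f"data:image/png;base64,{b64}", "detail": "high"},
}
```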
@@ -515,33 +530,150 @@ class GPT4v_Agent:
@backoff.on_exception(
backoff.expo,
(APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError),
max_tries=3
max_tries=10
)
def call_llm(self, payload):
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload,
timeout=20
)
if self.model.startswith("gpt"):
response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload
)
if response.status_code != 200:
if response.json()['error']['code'] == "context_length_exceeded":
print("Context length exceeded. Retrying with a smaller context.")
payload["messages"] = payload["messages"][-1:]
retry_response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload
)
if retry_response.status_code != 200:
print("Failed to call LLM: " + retry_response.text)
if response.status_code != 200:
if response.json()['error']['code'] == "context_length_exceeded":
print("Context length exceeded. Retrying with a smaller context.")
payload["messages"] = payload["messages"][-1:]
retry_response = requests.post(
"https://api.openai.com/v1/chat/completions",
headers=self.headers,
json=payload
)
if retry_response.status_code != 200:
print("Failed to call LLM: " + retry_response.text)
return ""
print("Failed to call LLM: " + response.text)
time.sleep(5)
return ""
else:
return response.json()['choices'][0]['message']['content']
elif self.model.startswith("mistral"):
print("call mistral")
messages = payload["messages"]
max_tokens = payload["max_tokens"]
misrtal_messages = []
for i, message in enumerate(messages):
mistral_message = {
"role": message["role"],
"content": []
}
for part in message["content"]:
mistral_message['content'] = part['text'] if part['type'] == "text" else None
misrtal_messages.append(mistral_message)
# the mistral not support system message in our endpoint, so we concatenate it at the first user message
if misrtal_messages[0]['role'] == "system":
misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
misrtal_messages.pop(0)
openai.api_base = "http://localhost:8000/v1"
openai.api_key = "test"
response = openai.ChatCompletion.create(
messages=misrtal_messages,
model="Mixtral-8x7B-Instruct-v0.1"
)
try:
return response['choices'][0]['message']['content']
except Exception as e:
print("Failed to call LLM: " + str(e))
return ""
elif self.model.startswith("gemini"):
api_key = os.environ.get("GENAI_API_KEY")
genai.configure(api_key=api_key)
def encoded_img_to_pil_img(data_str):
base64_str = data_str.replace("data:image/png;base64,", "")
image_data = base64.b64decode(base64_str)
image = Image.open(BytesIO(image_data))
return image
messages = payload["messages"]
max_tokens = payload["max_tokens"]
gemini_messages = []
for i, message in enumerate(messages):
gemini_message = {
"role": message["role"],
"parts": []
}
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
# Gemini only supports a single image as input, so only the last message keeps its screenshot
if i == len(messages) - 1:
for part in message["content"]:
gemini_message['parts'].append(part['text']) if part['type'] == "text" \
else gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url']))
else:
for part in message["content"]:
gemini_message['parts'].append(part['text']) if part['type'] == "text" else None
gemini_messages.append(gemini_message)
response = genai.GenerativeModel(self.model).generate_content(
gemini_messages,
generation_config={
"max_output_tokens": max_tokens
}
)
try:
return response.text
except Exception as e:
return ""
elif self.model.startswith("qwen"):
messages = payload["messages"]
max_tokens = payload["max_tokens"]
qwen_messages = []
for i, message in enumerate(messages):
qwen_message = {
"role": message["role"],
"content": []
}
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
for part in message["content"]:
qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None
qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None
qwen_messages.append(qwen_message)
response = dashscope.MultiModalConversation.call(model='qwen-vl-plus',
messages=qwen_messages)
# A status_code of HTTPStatus.OK indicates success; otherwise the request
# failed, and the error code and message are available via response.code
# and response.message.
if response.status_code == HTTPStatus.OK:
try:
return response.json()['output']['choices'][0]['message']['content']
except Exception as e:
return ""
else:
print(response.code) # The error code.
print(response.message) # The error message.
return ""
print("Failed to call LLM: " + response.text)
return ""
else:
return response.json()['choices'][0]['message']['content']
raise ValueError("Invalid model: " + self.model)
def parse_actions(self, response: str, masks=None):
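Taken together, the rewritten `call_llm` above becomes a dispatcher keyed on the model-name prefix (gpt, mistral, gemini, qwen), raising for anything else, with the backoff retry budget raised from 3 to 10 tries. Distilled to its skeleton, with the `_call_*` helpers as hypothetical stand-ins for the inline branches:

```python
# Sketch of the prefix dispatch; each _call_* helper is a placeholder
# for the corresponding inline branch in the diff above.
def call_llm(self, payload):
    if self.model.startswith("gpt"):
        return self._call_openai(payload)    # requests to api.openai.com
    elif self.model.startswith("mistral"):
        return self._call_mistral(payload)   # OpenAI-compatible local endpoint
    elif self.model.startswith("gemini"):
        return self._call_gemini(payload)    # google.generativeai
    elif self.model.startswith("qwen"):
        return self._call_qwen(payload)      # dashscope MultiModalConversation
    raise ValueError("Invalid model: " + self.model)
```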

View File

@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
You are required to use `pyautogui` to perform the action grounded to the observation, but DO NOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with, since we have no image of that element. DO NOT use `pyautogui.screenshot()` to take screenshots.
You can replace x, y in the code with the tag of the element you want to operate with, such as:
```python
pyautogui.moveTo(tag#3)
pyautogui.click(tag#2)
pyautogui.dragTo(tag#1, button='left')
pyautogui.moveTo(tag_3)
pyautogui.click(tag_2)
pyautogui.dragTo(tag_1, button='left')
```
When you think you can directly output precise x and y coordinates, or there is no tag for the element you want to interact with, you can also use coordinates directly.
But you should be careful to ensure that the coordinates are correct.
@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
You are required to use `pyautogui` to perform the action grounded to the observation, but DO NOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with, since we have no image of that element. DO NOT use `pyautogui.screenshot()` to take screenshots.
You can replace x, y in the code with the tag of the element you want to operate with, such as:
```python
pyautogui.moveTo(tag#3)
pyautogui.click(tag#2)
pyautogui.dragTo(tag#1, button='left')
pyautogui.moveTo(tag_3)
pyautogui.click(tag_2)
pyautogui.dragTo(tag_1, button='left')
```
When you think you can directly output precise x and y coordinates, or there is no tag for the element you want to interact with, you can also use coordinates directly.
But you should be careful to ensure that the coordinates are correct.