Merge branch 'main' into zdy
@@ -30,7 +30,7 @@ def _execute_command(command: List[str]) -> None:
         p = subprocess.Popen(command)
         p.wait()
     else:
-        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True)
+        result = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=60, text=True, encoding="utf-8")
         if result.returncode != 0:
             raise Exception("\033[91m" + result.stdout + result.stderr + "\033[0m")
         return result.stdout

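A note on the `encoding="utf-8"` change above: with `text=True` alone, `subprocess.run` decodes captured output with the locale's preferred codec, which can raise `UnicodeDecodeError` on non-UTF-8 locales. A minimal standalone sketch (the echoed string is illustrative, not from the repo):

```python
import subprocess

# With text=True but no explicit encoding, stdout is decoded using the
# locale's preferred encoding; pinning utf-8 keeps decoding stable across
# VM locale settings.
result = subprocess.run(
    ["echo", "héllo"],
    stdout=subprocess.PIPE,
    stderr=subprocess.PIPE,
    timeout=60,
    text=True,
    encoding="utf-8",
)
print(result.returncode, result.stdout)
```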
@@ -328,6 +328,9 @@ def check_structure_sim(src_path, tgt_path):
     Check if the structure of the two images are similar
+    gimp:2a729ded-3296-423d-aec4-7dd55ed5fbb3
     """
+    if src_path is None or tgt_path is None:
+        return 0.
 
     img_src = Image.open(src_path)
     img_tgt = Image.open(tgt_path)
     structure_same = structure_check_by_ssim(img_src, img_tgt)

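`structure_check_by_ssim` is defined elsewhere in the repo and not shown in this diff; as a rough sketch of what an SSIM-based structure check typically looks like (assuming scikit-image; the 256x256 resize and 0.8 threshold are illustrative guesses, not the repo's values):

```python
import numpy as np
from PIL import Image
from skimage.metrics import structural_similarity as ssim


def structure_check_by_ssim_sketch(img_src: Image.Image, img_tgt: Image.Image,
                                   threshold: float = 0.8) -> bool:
    # Compare grayscale copies at a common size and threshold the SSIM score.
    a = np.asarray(img_src.convert("L").resize((256, 256)))
    b = np.asarray(img_tgt.convert("L").resize((256, 256)))
    return ssim(a, b) >= threshold
```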
@@ -1,7 +1,7 @@
 {
   "id": "0cecd4f3-74de-457b-ba94-29ad6b5dafb6",
   "snapshot": "libreoffice_calc",
-  "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a which space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
+  "instruction": "Rename \"Sheet 1\" to \"LARS Resources\". Then make a copy of it. Place the copy before \"Sheet 2\" and rename it by appending a suffix \"(Backup)\", concatenated by a white space. And Also rename \"Sheet2\" by appending the suffix \"(Offline)\".",
   "source": "https://www.libreofficehelp.com/add-insert-delete-copy-move-rename-a-worksheet-in-libreoffice-calc/",
   "config": [
     {

@@ -1,7 +1,7 @@
 {
   "id": "7a4e4bc8-922c-4c84-865c-25ba34136be1",
   "snapshot": "libreoffice_calc",
-  "instruction": "Reorder the columns to be \"Data\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
+  "instruction": "Reorder the columns to be \"Date\", \"First Name\", \"Last Name\", \"Order ID\", \"Sales\"",
   "source": "https://www.youtube.com/shorts/bvUhr1AHs44",
   "config": [
     {

@@ -1,7 +1,7 @@
 {
   "id": "21760ecb-8f62-40d2-8d85-0cee5725cb72",
   "snapshot": "libreoffice_impress",
-  "instruction": "Could you help me add silde transition \"dissolve\" to my first page?",
+  "instruction": "Could you help me add slide transition \"dissolve\" to my first page?",
   "source": "https://www.libreofficehelp.com/add-animations-transitions-libreoffice-impress-slides/",
   "config": [
     {

@@ -1,7 +1,7 @@
 {
   "id": "5d901039-a89c-4bfb-967b-bf66f4df075e",
   "snapshot": "libreoffice_impress",
-  "instruction": "I want to make this page my cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image.",
+  "instruction": "I want to turn the rectangular image of Columbus on the first page into a cover page. Could you help me stretch this image to fill the entire page, keeping its proportion and centering the image?",
   "source": "https://superuser.com/questions/986776/how-can-i-stretch-an-image-in-a-libreoffice-impress-presentation-to-fill-the-pag",
   "config": [
     {

@@ -62,7 +62,7 @@
       "expected": {
         "type": "cloud_file",
         "path": "https://drive.usercontent.google.com/download?id=1otbzscpOZ0tCXMvsMC0MmNWUC7Pv71of&export=download&authuser=0&confirm=t&uuid=faa0b0c1-6b14-4bce-a1fd-ccf824ee1e60&at=APZUnTXw6TlBOlrPPZ2OhfGnNPf0:1705338135842",
-        "dest": "MLA_Workshop_061X_Works_Cited_Gold.docx"
+        "dest": "MLA_Workshop_061X_Works_Cited_Gold.pptx"
       },
       "result": {
         "type": "vm_file",

@@ -2,7 +2,7 @@
   "id": "af23762e-2bfd-4a1d-aada-20fa8de9ce07",
   "snapshot": "libreoffice_impress",
   "instruction": "I am making PPT on LibreOffice Impress for presentation tomorrow. I need to summarize contents on one slide use Impress \"Summary Slide\" feature. Could you make that for me?",
-  "source": "https://www.libreofficehelp.com/export-libreoffice-impress-slides-images/#:~:text=Exporting%20a%20single%20slide%20as.jpg%2C.png%2C%20etc%20image%20is,on%20the%20checkbox%20Selection.%20Provide%20jpg%20quality%20options.",
+  "source": "https://superuser.com/questions/1059080/how-to-make-a-summary-slide-in-impress-listing-the-titles-of-all-slides-autom",
   "config": [
     {
       "type": "download",

@@ -62,7 +62,7 @@
       "expected": {
         "type": "cloud_file",
        "path": "https://drive.usercontent.google.com/download?id=1nRwmFgYdskv3EiriZZFoT8TzM9CsG5B0&export=download&authuser=0&confirm=t&uuid=f2f919df-2867-4bc3-8bb9-dabd51108ebb&at=APZUnTWzw9LJWWXvH0cvdaWL-Ij-:1705319339474",
-        "dest": "Forests_Gold.docx"
+        "dest": "Forests_Gold.pptx"
       },
       "result": {
         "type": "vm_file",

@@ -27,7 +27,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=40, interval=0.1); time.sleep(1); pyautogui.scroll(-2)"
+          "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=40, interval=10); time.sleep(1); pyautogui.scroll(-2)"
         ]
       }
     }

@@ -38,7 +38,7 @@
         "command": [
           "python",
           "-c",
-          "import pyautogui; import time; time.sleep(1); pyautogui.press(\"down\", presses=8); time.sleep(1); pyautogui.scroll(-2)"
+          "import pyautogui; import time; time.sleep(5); pyautogui.press(\"down\", presses=8, interval=3); time.sleep(1); pyautogui.scroll(-2)"
         ]
       }
     }

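The two timing changes above slow the replay scripts down (a longer initial sleep and a larger `interval` between key presses) so the guest UI can register every event before the final scroll. The same pattern as a standalone sketch:

```python
import time

import pyautogui

time.sleep(5)  # give the application time to finish loading
pyautogui.press("down", presses=8, interval=3)  # slow enough for a laggy VM
time.sleep(1)
pyautogui.scroll(-2)
```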
@@ -68,7 +68,7 @@
       "parameters": {
         "command": [
           "tar",
-          "-xzv",
+          "-xz",
           "--recursive-unlink",
           "-f",
           "/home/user/thunderbird-profile.tar.gz",

@@ -1,7 +1,7 @@
 {
   "id": "897e3b53-5d4d-444b-85cb-2cdc8a97d903",
   "snapshot": "chrome",
-  "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store it in the forms/ folder in my Google Drive.",
+  "instruction": "I have a LibreOffice Writer file form.docx on the desktop. Help me convert it to PDF format and store the PDF in the forms/ folder in my Google Drive.",
   "source": "https://marketplace.uipath.com/listings/convert-word-file-to-pdf-and-store-in-onedrive",
   "config": [
     {

@@ -68,13 +68,13 @@
       "type": "execute",
       "parameters": {
         "command": [
-          "tar",
-          "-xzv",
-          "--recursive-unlink",
-          "-f",
-          "/home/user/thunderbird-profile.tar.gz",
-          "-C",
-          "/home/user/"
+          "tar",
+          "-xz",
+          "--recursive-unlink",
+          "-f",
+          "/home/user/thunderbird-profile.tar.gz",
+          "-C",
+          "/home/user/"
         ]
       }
     },

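The `-xzv` to `-xz` change only drops tar's verbose listing; `--recursive-unlink` (a GNU tar extract option) removes existing directory trees before unpacking over them. A sketch of the same invocation from Python, with the paths taken from the config above:

```python
import subprocess

# Quietly extract the Thunderbird profile archive into /home/user/,
# replacing any existing directory contents.
subprocess.run(
    [
        "tar", "-xz", "--recursive-unlink",
        "-f", "/home/user/thunderbird-profile.tar.gz",
        "-C", "/home/user/",
    ],
    check=True,
)
```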
@@ -1,7 +1,7 @@
 {
   "id": "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82",
   "snapshot": "os",
-  "instruction": "Can you move the file with the path 'todo.txt' on the Desktop to the directory with the path 'done' on the Desktop?",
+  "instruction": "Can you move the file 'todo.txt' on the Desktop to the directory 'done/' on the Desktop?",
   "source": "https://ubuntu.com/tutorials/command-line-for-beginners#5-moving-and-manipulating-files",
   "config": [
     {

@@ -1,7 +1,7 @@
 {
   "id": "bedcedc4-4d72-425e-ad62-21960b11fe0d",
   "snapshot": "os",
-  "instruction": "Could you set the 'Dim screen when inactive' to on in setting?",
+  "instruction": "Could you set the 'Dim screen when inactive' to off in setting?",
   "source": "https://www.youtube.com/watch?v=D4WyNjt_hbQ&t=2s",
   "trajectory": "trajectories/",
   "config": [

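For the corrected 'Dim screen when inactive' task, one plausible way a setup or checker script could toggle the setting is through gsettings; the schema and key below are an assumption about GNOME's power plugin, not something this commit shows:

```python
import subprocess

# Assumed GNOME key backing "Dim screen when inactive"; "false" matches the
# corrected instruction ("set ... to off").
subprocess.run(
    ["gsettings", "set", "org.gnome.settings-daemon.plugins.power",
     "idle-dim", "false"],
    check=True,
)
```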
@@ -1 +1 @@
-{"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-01-31T14:41:25Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byAZmDTDsYds_iatV8a30PUPWcDHVW4Cyg71pTlD0f3eBBwAjV4WpVL8LdAle8sT4j_rX4rWH8iCt3QI2YdrQLFPlaVdBk0zRGGtAEcebIDuQy_VKD6j5c3IGxok9PDON-Mft0ZVJjUVEopgLYA4fYwctbQZ8nyl4AaCgYKAX4SARISFQHGX2Mim-LRNXCfACmecJH94-D09A0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}
+{"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "client_id": "786888752612-rgng5v9hcq4as7pn0b40gt9r5lekmht9.apps.googleusercontent.com", "client_secret": "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", "refresh_token": "1//0ehtafHmucszRCgYIARAAGA4SNwF-L9IrpDBsnzdHKAlRfrkvzNFw1cpdnRY8rhM5gy4flsPYdysMav27yHamJx39BBGq-LLw40s", "token_expiry": "2024-02-01T08:29:08Z", "token_uri": "https://oauth2.googleapis.com/token", "user_agent": null, "revoke_uri": "https://oauth2.googleapis.com/revoke", "id_token": null, "id_token_jwt": null, "token_response": {"access_token": "ya29.a0AfB_byD2_A8RPm6KzfjaNifjfgZ2M-D9G16GAPIj1ANxM3AMq4DLUXxj76CalsByOsqTUgEvADd-FEKL0FkBBc4ow-EuaLUEOm4yw2LNEOFMhVD_k4PvEIf4767fYU5o__GtyrGt5pNJy0MaBukDY2ui7GQwDuFFGt2q_AaCgYKAYMSARISFQHGX2MidvFUuBpNsy4fkC5DP2k8Aw0173", "expires_in": 3599, "scope": "https://www.googleapis.com/auth/drive", "token_type": "Bearer"}, "scopes": ["https://www.googleapis.com/auth/drive"], "token_info_uri": "https://oauth2.googleapis.com/tokeninfo", "invalid": false, "_class": "OAuth2Credentials", "_module": "oauth2client.client"}

@@ -1,136 +0,0 @@
-# todo: needs to be refactored
-
-import time
-from typing import Dict, List
-
-import google.generativeai as genai
-
-from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes
-from mm_agents.gpt_4_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
-from mm_agents.gpt_4_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
-from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
-
-
-class GeminiPro_Agent:
-    def __init__(self, api_key, instruction, model='gemini-pro', max_tokens=300, temperature=0.0,
-                 action_space="computer_13"):
-        genai.configure(api_key=api_key)
-        self.instruction = instruction
-        self.model = genai.GenerativeModel(model)
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-        self.action_space = action_space
-
-        self.trajectory = [
-            {
-                "role": "system",
-                "parts": [
-                    {
-                        "computer_13": SYS_PROMPT_ACTION,
-                        "pyautogui": SYS_PROMPT_CODE
-                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
-                ]
-            }
-        ]
-
-    def predict(self, obs: Dict) -> List:
-        """
-        Predict the next action(s) based on the current observation.
-        Only support single-round conversation, only fill-in the last desktop screenshot.
-        """
-        accessibility_tree = obs["accessibility_tree"]
-
-        leaf_nodes = find_leaf_nodes(accessibility_tree)
-        filtered_nodes = filter_nodes(leaf_nodes)
-
-        linearized_accessibility_tree = "tag\ttext\tposition\tsize\n"
-        # Linearize the accessibility tree nodes into a table format
-
-        for node in filtered_nodes:
-            linearized_accessibility_tree += node.tag + "\t"
-            linearized_accessibility_tree += node.attrib.get('name') + "\t"
-            linearized_accessibility_tree += node.attrib.get(
-                '{uri:deskat:component.at-spi.gnome.org}screencoord') + "\t"
-            linearized_accessibility_tree += node.attrib.get('{uri:deskat:component.at-spi.gnome.org}size') + "\n"
-
-        self.trajectory.append({
-            "role": "user",
-            "parts": [
-                "Given the XML format of accessibility tree (convert and formatted into table) as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
-                    linearized_accessibility_tree)]
-        })
-
-        # todo: Remove this step once the Gemini supports multi-round conversation
-        all_message_str = ""
-        for i in range(len(self.trajectory)):
-            if i == 0:
-                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
-            elif i % 2 == 1:
-                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
-            else:
-                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
-
-            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])
-
-        print("All message: >>>>>>>>>>>>>>>> ")
-        print(
-            all_message_str
-        )
-
-        message_for_gemini = {
-            "role": "user",
-            "parts": [all_message_str]
-        }
-
-        traj_to_show = []
-        for i in range(len(self.trajectory)):
-            traj_to_show.append(self.trajectory[i]["parts"][0])
-            if len(self.trajectory[i]["parts"]) > 1:
-                traj_to_show.append("screenshot_obs")
-
-        print("Trajectory:", traj_to_show)
-
-        while True:
-            try:
-                response = self.model.generate_content(
-                    message_for_gemini,
-                    generation_config={
-                        "max_output_tokens": self.max_tokens,
-                        "temperature": self.temperature
-                    }
-                )
-                break
-            except:
-                print("Failed to generate response, retrying...")
-                time.sleep(5)
-                pass
-
-        try:
-            response_text = response.text
-        except:
-            return []
-
-        try:
-            actions = self.parse_actions(response_text)
-        except:
-            print("Failed to parse action from response:", response_text)
-            actions = []
-
-        return actions
-
-    def parse_actions(self, response: str):
-        # parse from the response
-        if self.action_space == "computer_13":
-            actions = parse_actions_from_string(response)
-        elif self.action_space == "pyautogui":
-            actions = parse_code_from_string(response)
-        else:
-            raise ValueError("Invalid action space: " + self.action_space)
-
-        # add action into the trajectory
-        self.trajectory.append({
-            "role": "assistant",
-            "parts": [response]
-        })
-
-        return actions

@@ -1,115 +0,0 @@
-# todo: needs to be refactored
-
-import time
-from typing import Dict, List
-
-import PIL.Image
-import google.generativeai as genai
-
-from mm_agents.gpt_4v_agent import parse_actions_from_string, parse_code_from_string
-from mm_agents.gpt_4v_prompt_action import SYS_PROMPT as SYS_PROMPT_ACTION
-from mm_agents.gpt_4v_prompt_code import SYS_PROMPT as SYS_PROMPT_CODE
-
-
-class GeminiProV_Agent:
-    def __init__(self, api_key, instruction, model='gemini-pro-vision', max_tokens=300, temperature=0.0,
-                 action_space="computer_13"):
-        genai.configure(api_key=api_key)
-        self.instruction = instruction
-        self.model = genai.GenerativeModel(model)
-        self.max_tokens = max_tokens
-        self.temperature = temperature
-        self.action_space = action_space
-
-        self.trajectory = [
-            {
-                "role": "system",
-                "parts": [
-                    {
-                        "computer_13": SYS_PROMPT_ACTION,
-                        "pyautogui": SYS_PROMPT_CODE
-                    }[action_space] + "\nHere is the instruction for the task: {}".format(self.instruction)
-                ]
-            }
-        ]
-
-    def predict(self, obs: Dict) -> List:
-        """
-        Predict the next action(s) based on the current observation.
-        Only support single-round conversation, only fill-in the last desktop screenshot.
-        """
-        img = PIL.Image.open(obs["screenshot"])
-        self.trajectory.append({
-            "role": "user",
-            "parts": ["What's the next step that you will do to help with the task?", img]
-        })
-
-        # todo: Remove this step once the Gemini supports multi-round conversation
-        all_message_str = ""
-        for i in range(len(self.trajectory)):
-            if i == 0:
-                all_message_template = "<|im_start|>system\n{}\n<|im_end|>\n"
-            elif i % 2 == 1:
-                all_message_template = "<|im_start|>user\n{}\n<|im_end|>\n"
-            else:
-                all_message_template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
-
-            all_message_str += all_message_template.format(self.trajectory[i]["parts"][0])
-
-        message_for_gemini = {
-            "role": "user",
-            "parts": [all_message_str, img]
-        }
-
-        traj_to_show = []
-        for i in range(len(self.trajectory)):
-            traj_to_show.append(self.trajectory[i]["parts"][0])
-            if len(self.trajectory[i]["parts"]) > 1:
-                traj_to_show.append("screenshot_obs")
-
-        print("Trajectory:", traj_to_show)
-
-        while True:
-            try:
-                response = self.model.generate_content(
-                    message_for_gemini,
-                    generation_config={
-                        "max_output_tokens": self.max_tokens,
-                        "temperature": self.temperature
-                    }
-                )
-                break
-            except:
-                print("Failed to generate response, retrying...")
-                time.sleep(5)
-                pass
-
-        try:
-            response_text = response.text
-        except:
-            return []
-
-        try:
-            actions = self.parse_actions(response_text)
-        except:
-            print("Failed to parse action from response:", response_text)
-            actions = []
-
-        return actions
-
-    def parse_actions(self, response: str):
-        # parse from the response
-        if self.action_space == "computer_13":
-            actions = parse_actions_from_string(response)
-        elif self.action_space == "pyautogui":
-            actions = parse_code_from_string(response)
-        else:
-            raise ValueError("Invalid action space: " + self.action_space)
-
-        # add action into the trajectory
-        self.trajectory.append({
-            "role": "assistant",
-            "parts": [response]
-        })
-
-        return actions

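Both deleted Gemini agents flatten the whole trajectory into one `<|im_start|>...<|im_end|>` string because only single-round calls were supported at the time (see the `# todo` in each file). A toy, self-contained run of that flattening logic:

```python
# Toy trajectory standing in for self.trajectory in the deleted agents.
trajectory = [
    {"role": "system", "parts": ["You control a desktop."]},
    {"role": "user", "parts": ["Open the file manager."]},
    {"role": "assistant", "parts": ["pyautogui.click(100, 200)"]},
]

all_message_str = ""
for i, turn in enumerate(trajectory):
    if i == 0:
        template = "<|im_start|>system\n{}\n<|im_end|>\n"
    elif i % 2 == 1:
        template = "<|im_start|>user\n{}\n<|im_end|>\n"
    else:
        template = "<|im_start|>assistant\n{}\n<|im_end|>\n"
    all_message_str += template.format(turn["parts"][0])

print(all_message_str)  # one flattened prompt for a single-turn call
```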
@@ -1,12 +1,20 @@
import base64
import json
import logging
import os
import re
import time
import uuid
from http import HTTPStatus
from io import BytesIO
from typing import Dict, List

import backoff
import dashscope
import google.generativeai as genai
import openai
import requests
from PIL import Image
from openai.error import (
    APIConnectionError,
    APIError,

@@ -44,11 +52,13 @@ def linearize_accessibility_tree(accessibility_tree):
         linearized_accessibility_tree += node.tag + "\t"
         linearized_accessibility_tree += node.attrib.get('name') + "\t"
         if node.text:
-            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))) + "\t"
-        elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper")\
+            linearized_accessibility_tree += (node.text if '"' not in node.text else '"{:}"'.format(
+                node.text.replace('"', '""'))) + "\t"
+        elif node.get("{uri:deskat:uia.windows.microsoft.org}class", "").endswith("EditWrapper") \
                 and node.get("{uri:deskat:value.at-spi.gnome.org}value"):
             text: str = node.get("{uri:deskat:value.at-spi.gnome.org}value")
-            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(text.replace('"', '""'))) + "\t"
+            linearized_accessibility_tree += (text if '"' not in text else '"{:}"'.format(
+                text.replace('"', '""'))) + "\t"
         else:
             linearized_accessibility_tree += '""\t'
         linearized_accessibility_tree += node.attrib.get(

@@ -140,16 +150,21 @@ def parse_code_from_string(input_string):


 def parse_code_from_som_string(input_string, masks):
     # parse the output string by masks
-    mappings = []
+    tag_vars = ""
     for i, mask in enumerate(masks):
         x, y, w, h = mask
-        mappings.append(("tag#" + str(i + 1), "{}, {}".format(int(x + w // 2), int(y + h // 2))))
-
-    # reverse the mappings
-    for mapping in mappings[::-1]:
-        input_string = input_string.replace(mapping[0], mapping[1])
+        tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
+        tag_vars += "\n"
 
     actions = parse_code_from_string(input_string)
 
+    for i, action in enumerate(actions):
+        if action.strip() in ['WAIT', 'DONE', 'FAIL']:
+            pass
+        else:
+            action = tag_vars + action
+            actions[i] = action
+
     return actions


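After this rewrite, `parse_code_from_som_string` no longer substitutes coordinates into the model's code; it prepends `tag_N = (x, y)` assignments so names like `tag_2` resolve when the snippet runs. A toy run with illustrative mask values:

```python
masks = [(10, 20, 30, 40), (100, 200, 50, 60)]  # illustrative (x, y, w, h) boxes

tag_vars = ""
for i, (x, y, w, h) in enumerate(masks):
    # Each mask contributes the center of its box as tag_<i+1>.
    tag_vars += "tag_" + str(i + 1) + "=" + "({}, {})".format(int(x + w // 2), int(y + h // 2))
    tag_vars += "\n"

action = "import pyautogui\npyautogui.click(tag_2)"
print(tag_vars + action)
# tag_1=(25, 40)
# tag_2=(125, 230)
# import pyautogui
# pyautogui.click(tag_2)
```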
@@ -295,7 +310,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{_screenshot}",
+                            "url": f"data:image/png;base64,{_screenshot}",
                             "detail": "high"
                         }
                     }

@@ -314,7 +329,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{_screenshot}",
+                            "url": f"data:image/png;base64,{_screenshot}",
                             "detail": "high"
                         }
                     }

@@ -375,7 +390,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }

@@ -421,7 +436,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }

@@ -448,7 +463,7 @@ class GPT4v_Agent:
                     {
                         "type": "image_url",
                         "image_url": {
-                            "url": f"data:image/jpeg;base64,{base64_image}",
+                            "url": f"data:image/png;base64,{base64_image}",
                             "detail": "high"
                         }
                     }

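The repeated `image/jpeg` to `image/png` fixes matter because the screenshots are PNG bytes; the data-URL MIME type should match what is actually base64-encoded. A minimal sketch of building one such message part (the file path is illustrative):

```python
import base64

with open("screenshot.png", "rb") as f:  # illustrative path
    base64_image = base64.b64encode(f.read()).decode("utf-8")

image_part = {
    "type": "image_url",
    "image_url": {
        "url": f"data:image/png;base64,{base64_image}",
        "detail": "high",
    },
}
```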
@@ -515,33 +530,150 @@ class GPT4v_Agent:
     @backoff.on_exception(
         backoff.expo,
         (APIError, RateLimitError, APIConnectionError, ServiceUnavailableError, InvalidRequestError),
-        max_tries=3
+        max_tries=10
     )
     def call_llm(self, payload):
-        response = requests.post(
-            "https://api.openai.com/v1/chat/completions",
-            headers=self.headers,
-            json=payload,
-            timeout=20
-        )
-        if response.status_code != 200:
-            if response.json()['error']['code'] == "context_length_exceeded":
-                print("Context length exceeded. Retrying with a smaller context.")
-                payload["messages"] = payload["messages"][-1:]
-                retry_response = requests.post(
-                    "https://api.openai.com/v1/chat/completions",
-                    headers=self.headers,
-                    json=payload
-                )
-                if retry_response.status_code != 200:
-                    print("Failed to call LLM: " + retry_response.text)
-                    return ""
-
-            print("Failed to call LLM: " + response.text)
-            return ""
-        else:
-            return response.json()['choices'][0]['message']['content']
+        if self.model.startswith("gpt"):
+            response = requests.post(
+                "https://api.openai.com/v1/chat/completions",
+                headers=self.headers,
+                json=payload
+            )
+
+            if response.status_code != 200:
+                if response.json()['error']['code'] == "context_length_exceeded":
+                    print("Context length exceeded. Retrying with a smaller context.")
+                    payload["messages"] = payload["messages"][-1:]
+                    retry_response = requests.post(
+                        "https://api.openai.com/v1/chat/completions",
+                        headers=self.headers,
+                        json=payload
+                    )
+                    if retry_response.status_code != 200:
+                        print("Failed to call LLM: " + retry_response.text)
+                        return ""
+
+                print("Failed to call LLM: " + response.text)
+                time.sleep(5)
+                return ""
+            else:
+                return response.json()['choices'][0]['message']['content']
+
+        elif self.model.startswith("mistral"):
+            print("call mistral")
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            misrtal_messages = []
+
+            for i, message in enumerate(messages):
+                mistral_message = {
+                    "role": message["role"],
+                    "content": []
+                }
+
+                for part in message["content"]:
+                    mistral_message['content'] = part['text'] if part['type'] == "text" else None
+
+                misrtal_messages.append(mistral_message)
+
+            # the mistral not support system message in our endpoint, so we concatenate it at the first user message
+            if misrtal_messages[0]['role'] == "system":
+                misrtal_messages[1]['content'] = misrtal_messages[0]['content'] + "\n" + misrtal_messages[1]['content']
+                misrtal_messages.pop(0)
+
+            openai.api_base = "http://localhost:8000/v1"
+            openai.api_key = "test"
+            response = openai.ChatCompletion.create(
+                messages=misrtal_messages,
+                model="Mixtral-8x7B-Instruct-v0.1"
+            )
+
+            try:
+                return response['choices'][0]['message']['content']
+            except Exception as e:
+                print("Failed to call LLM: " + str(e))
+                return ""
+
+        elif self.model.startswith("gemini"):
+
+            api_key = os.environ.get("GENAI_API_KEY")
+            genai.api_key = api_key
+
+            def encoded_img_to_pil_img(data_str):
+                base64_str = data_str.replace("data:image/png;base64,", "")
+                image_data = base64.b64decode(base64_str)
+                image = Image.open(BytesIO(image_data))
+
+                return image
+
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            gemini_messages = []
+            for i, message in enumerate(messages):
+                gemini_message = {
+                    "role": message["role"],
+                    "parts": []
+                }
+                assert len(message["content"]) in [1, 2], "One text, or one text with one image"
+
+                # The gemini only support the last image as single image input
+                if i == len(messages) - 1:
+                    for part in message["content"]:
+                        gemini_message['parts'].append(part['text']) if part['type'] == "text" \
+                            else gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url']))
+                else:
+                    for part in message["content"]:
+                        gemini_message['parts'].append(part['text']) if part['type'] == "text" else None
+
+                gemini_messages.append(gemini_message)
+
+            response = genai.GenerativeModel(self.model).generate_content(
+                gemini_messages,
+                generation_config={
+                    "max_output_tokens": max_tokens
+                }
+            )
+
+            try:
+                return response.text
+            except Exception as e:
+                return ""
+
+        elif self.model.startswith("qwen"):
+            messages = payload["messages"]
+            max_tokens = payload["max_tokens"]
+
+            qwen_messages = []
+
+            for i, message in enumerate(messages):
+                qwen_message = {
+                    "role": message["role"],
+                    "content": []
+                }
+                assert len(message["content"]) in [1, 2], "One text, or one text with one image"
+                for part in message["content"]:
+                    qwen_message['content'].append({"image": part['image_url']['url']}) if part['type'] == "image_url" else None
+                    qwen_message['content'].append({"text": part['text']}) if part['type'] == "text" else None
+
+                qwen_messages.append(qwen_message)
+
+            response = dashscope.MultiModalConversation.call(model='qwen-vl-plus',
+                                                             messages=messages)
+            # The response status_code is HTTPStatus.OK indicate success,
+            # otherwise indicate request is failed, you can get error code
+            # and message from code and message.
+            if response.status_code == HTTPStatus.OK:
+                try:
+                    return response.json()['output']['choices'][0]['message']['content']
+                except Exception as e:
+                    return ""
+            else:
+                print(response.code)  # The error code.
+                print(response.message)  # The error message.
+                return ""
+
+        else:
+            raise ValueError("Invalid model: " + self.model)
 
     def parse_actions(self, response: str, masks=None):

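On the `max_tries=3` to `max_tries=10` change: `backoff.on_exception` with `backoff.expo` re-invokes the decorated function with exponentially growing delays until it succeeds or the try budget runs out. A stripped-down sketch of the pattern (generic exception class and a hypothetical helper, not the repo's exact code):

```python
import backoff
import requests


@backoff.on_exception(backoff.expo, requests.RequestException, max_tries=10)
def call_llm_sketch(url: str, headers: dict, payload: dict) -> str:
    # raise_for_status() turns HTTP errors into exceptions so backoff retries.
    resp = requests.post(url, headers=headers, json=payload)
    resp.raise_for_status()
    return resp.json()["choices"][0]["message"]["content"]
```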
@@ -806,9 +806,9 @@ For each step, you will get an observation of the desktop by 1) a screenshot; an
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
 But you should be careful to ensure that the coordinates are correct.

@@ -856,9 +856,9 @@ ACTION_GROUNDING_PROMPT_SEEACT = """
 You are required to use `pyautogui` to perform the action grounded to the observation, but DONOT use the `pyautogui.locateCenterOnScreen` function to locate the element you want to operate with since we have no image of the element you want to operate with. DONOT USE `pyautogui.screenshot()` to make screenshot.
 You can replace x, y in the code with the tag of the element you want to operate with. such as:
 ```python
-pyautogui.moveTo(tag#3)
-pyautogui.click(tag#2)
-pyautogui.dragTo(tag#1, button='left')
+pyautogui.moveTo(tag_3)
+pyautogui.click(tag_2)
+pyautogui.dragTo(tag_1, button='left')
 ```
 When you think you can directly output precise x and y coordinates or there is no tag on which you want to interact, you can also use them directly.
 But you should be careful to ensure that the coordinates are correct.

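The prompt fix from `tag#3` to `tag_3` is necessary because `#` starts a comment in Python: `pyautogui.moveTo(tag#3)` leaves an unclosed call and cannot parse, while `tag_3` is a valid identifier that the injected `tag_vars` assignments define. A tiny demonstration:

```python
import pyautogui

tag_3 = (640, 360)  # stand-in for the value injected by parse_code_from_som_string

# pyautogui accepts an (x, y) pair as the first argument, so the generated
# code works once tag_3 is defined; with tag#3 everything after '#' would
# have been a comment, leaving "pyautogui.moveTo(tag" (a SyntaxError).
pyautogui.moveTo(tag_3)
```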