Update README and ROADMAP; fix typos; optimize the LLM-calling code in agent.py
This commit is contained in:
13
README.md
13
README.md
@@ -63,6 +63,7 @@ pip install desktop-env
|
|||||||
vmrun -T ws list
|
vmrun -T ws list
|
||||||
```
|
```
|
||||||
If the installation and the environment variable setup are successful, you will see a message listing the currently running virtual machines.
|
If the installation and the environment variable setup are successful, you will see a message listing the currently running virtual machines.
|
||||||
|
> **Note:** We will also support using [VirtualBox](https://www.virtualbox.org/) in the near future for users who have issues with VMware Pro. However, features such as parallelism and macOS on Apple chips will not be supported with VirtualBox.
|
||||||
|
|
||||||
All set! Our setup script will automatically download the necessary virtual machines and configure the environment for you.
|
All set! Our setup script will automatically download the necessary virtual machines and configure the environment for you.
|
||||||
|
|
||||||
@@ -135,6 +136,14 @@ Correctly implement the agent interface and import your customized version in th
|
|||||||
Afterward, you can execute a command similar to the one in the previous section to run the benchmark on your agent.
|
Afterward, you can execute a command similar to the one in the previous section to run the benchmark on your agent.
|
||||||
|
|
||||||
## ❓ FAQ
|
## ❓ FAQ
|
||||||
|
### What is the username and password for the virtual machines?
|
||||||
|
The username and password for the virtual machines are as follows:
|
||||||
|
- **Ubuntu:** `user` / `password`
|
||||||
|
|
||||||
|
### How can I configure a proxy for the VM if I'm behind a GFW?
|
||||||
|
|
||||||
|
See [Proxy Guideline](PROXY_GUIDELINE.md).
|
||||||
|
|
||||||
### What are the running times and costs under different settings?
|
### What are the running times and costs under different settings?
|
||||||
| Setting | Expected Time* | Budget Cost (Full Test Set/Small Test Set) |
|
| Setting | Expected Time* | Budget Cost (Full Test Set/Small Test Set) |
|
||||||
| ------------------------------ | -------------- | ------------------------------------------ |
|
| ------------------------------ | -------------- | ------------------------------------------ |
|
||||||
@@ -145,10 +154,6 @@ Afterward, you can execute a command similar to the one in the previous section
|
|||||||
|
|
||||||
\*No environment parallelism. Calculated in April 2024.
|
\*No environment parallelism. Calculated in April 2024.
|
||||||
|
|
||||||
### How can I configure a proxy for the VM if I'm behind a GFW?
|
|
||||||
|
|
||||||
See [Proxy Guideline](PROXY_GUIDELINE.md)
|
|
||||||
|
|
||||||
## 📄 Citation
|
## 📄 Citation
|
||||||
If you find this environment useful, please consider citing our work:
|
If you find this environment useful, please consider citing our work:
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ If you are interested in contributing to the project, please check the [CONTRIBU
|
|||||||
- [ ] VPN setup doc for those who need it
|
- [ ] VPN setup doc for those who need it
|
||||||
- [ ] Support running on platforms that have nested virtualization, e.g. Google Cloud, AWS, etc.
|
- [ ] Support running on platforms that have nested virtualization, e.g. Google Cloud, AWS, etc.
|
||||||
- [ ] Prepare for the first release of Windows vm image for the environment
|
- [ ] Prepare for the first release of Windows vm image for the environment
|
||||||
- [ ] Be able to run without virtual machine platform VMware
|
- [ ] Be able to run without virtual machine platform VMware Pro, e.g. VirtualBox, or other platforms
|
||||||
|
|
||||||
|
|
||||||
## Road Map of Annotation Tool
|
## Road Map of Annotation Tool
|
||||||
|
|||||||
@@ -242,7 +242,7 @@ class SetupController:
|
|||||||
until: Optional[Dict[str, Any]] = None
|
until: Optional[Dict[str, Any]] = None
|
||||||
):
|
):
|
||||||
if not command:
|
if not command:
|
||||||
raise Exception("Empty comman to launch.")
|
raise Exception("Empty command to launch.")
|
||||||
|
|
||||||
until: Dict[str, Any] = until or {}
|
until: Dict[str, Any] = until or {}
|
||||||
terminates: bool = False
|
terminates: bool = False
|
||||||
|
|||||||
@@ -1,5 +1,4 @@
|
|||||||
import base64
|
import base64
|
||||||
import hashlib
|
|
||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os
|
import os
|
||||||
@@ -9,7 +8,6 @@ import time
|
|||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from http import HTTPStatus
|
from http import HTTPStatus
|
||||||
from io import BytesIO
|
from io import BytesIO
|
||||||
from pathlib import Path
|
|
||||||
from typing import Dict, List
|
from typing import Dict, List
|
||||||
|
|
||||||
import backoff
|
import backoff
|
||||||
@@ -19,7 +17,7 @@ import openai
|
|||||||
import requests
|
import requests
|
||||||
import tiktoken
|
import tiktoken
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
from google.api_core.exceptions import InvalidArgument
|
from google.api_core.exceptions import InvalidArgument, ResourceExhausted, InternalServerError, BadRequest
|
||||||
|
|
||||||
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import filter_nodes, draw_bounding_boxes
|
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import filter_nodes, draw_bounding_boxes
|
||||||
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
|
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
|
||||||
@@ -487,13 +485,17 @@ class PromptAgent:
|
|||||||
|
|
||||||
# logger.info("PROMPT: %s", messages)
|
# logger.info("PROMPT: %s", messages)
|
||||||
|
|
||||||
response = self.call_llm({
|
try:
|
||||||
"model": self.model,
|
response = self.call_llm({
|
||||||
"messages": messages,
|
"model": self.model,
|
||||||
"max_tokens": self.max_tokens,
|
"messages": messages,
|
||||||
"top_p": self.top_p,
|
"max_tokens": self.max_tokens,
|
||||||
"temperature": self.temperature
|
"top_p": self.top_p,
|
||||||
})
|
"temperature": self.temperature
|
||||||
|
})
|
||||||
|
except Exception as e:
|
||||||
|
logger.error("Failed to call" + self.model + ", Error: " + str(e))
|
||||||
|
response = ""
|
||||||
|
|
||||||
logger.info("RESPONSE: %s", response)
|
logger.info("RESPONSE: %s", response)
|
||||||
|
|
||||||
@@ -512,10 +514,18 @@ class PromptAgent:
|
|||||||
# Add more model-specific exceptions here as needed,
|
# Add more model-specific exceptions here as needed,
|
||||||
# but do NOT add the generic "Exception" type,
|
# but do NOT add the generic "Exception" type,
|
||||||
# because generic exceptions must be caught by the outer caller to ensure each example won't exceed the time limit
|
# because generic exceptions must be caught by the outer caller to ensure each example won't exceed the time limit
|
||||||
(openai.RateLimitError,
|
(
|
||||||
openai.BadRequestError,
|
# OpenAI exceptions
|
||||||
openai.InternalServerError,
|
openai.RateLimitError,
|
||||||
InvalidArgument),
|
openai.BadRequestError,
|
||||||
|
openai.InternalServerError,
|
||||||
|
|
||||||
|
# Google exceptions
|
||||||
|
InvalidArgument,
|
||||||
|
ResourceExhausted,
|
||||||
|
InternalServerError,
|
||||||
|
BadRequest,
|
||||||
|
),
|
||||||
max_tries=5
|
max_tries=5
|
||||||
)
|
)
|
||||||
def call_llm(self, payload):
|
def call_llm(self, payload):
|
||||||
@@ -767,29 +777,25 @@ class PromptAgent:
|
|||||||
logger.info("Generating content with Gemini model: %s", self.model)
|
logger.info("Generating content with Gemini model: %s", self.model)
|
||||||
request_options = {"timeout": 120}
|
request_options = {"timeout": 120}
|
||||||
gemini_model = genai.GenerativeModel(self.model)
|
gemini_model = genai.GenerativeModel(self.model)
|
||||||
try:
|
|
||||||
response = gemini_model.generate_content(
|
response = gemini_model.generate_content(
|
||||||
gemini_messages,
|
gemini_messages,
|
||||||
generation_config={
|
generation_config={
|
||||||
"candidate_count": 1,
|
"candidate_count": 1,
|
||||||
"max_output_tokens": max_tokens,
|
"max_output_tokens": max_tokens,
|
||||||
"top_p": top_p,
|
"top_p": top_p,
|
||||||
"temperature": temperature
|
"temperature": temperature
|
||||||
},
|
},
|
||||||
safety_settings={
|
safety_settings={
|
||||||
"harassment": "block_none",
|
"harassment": "block_none",
|
||||||
"hate": "block_none",
|
"hate": "block_none",
|
||||||
"sex": "block_none",
|
"sex": "block_none",
|
||||||
"danger": "block_none"
|
"danger": "block_none"
|
||||||
},
|
},
|
||||||
request_options=request_options
|
request_options=request_options
|
||||||
)
|
)
|
||||||
return response.text
|
return response.text
|
||||||
except Exception as e:
|
|
||||||
logger.error("Meet exception when calling Gemini API, " + str(e.__class__.__name__) + str(e))
|
|
||||||
logger.error(f"count_tokens: {gemini_model.count_tokens(gemini_messages)}")
|
|
||||||
logger.error(f"generation_config: {max_tokens}, {top_p}, {temperature}")
|
|
||||||
return ""
|
|
||||||
|
|
||||||
elif self.model == "gemini-1.5-pro-latest":
|
elif self.model == "gemini-1.5-pro-latest":
|
||||||
messages = payload["messages"]
|
messages = payload["messages"]
|
||||||
@@ -797,19 +803,6 @@ class PromptAgent:
|
|||||||
top_p = payload["top_p"]
|
top_p = payload["top_p"]
|
||||||
temperature = payload["temperature"]
|
temperature = payload["temperature"]
|
||||||
|
|
||||||
uploaded_files = []
|
|
||||||
|
|
||||||
# def upload_if_needed(pathname: str) -> list[str]:
|
|
||||||
# path = Path(pathname)
|
|
||||||
# hash_id = hashlib.sha256(path.read_bytes()).hexdigest()
|
|
||||||
# try:
|
|
||||||
# existing_file = genai.get_file(name=hash_id)
|
|
||||||
# return [existing_file.uri]
|
|
||||||
# except:
|
|
||||||
# pass
|
|
||||||
# uploaded_files.append(genai.upload_file(path=path, display_name=hash_id))
|
|
||||||
# return [uploaded_files[-1].uri]
|
|
||||||
|
|
||||||
gemini_messages = []
|
gemini_messages = []
|
||||||
for i, message in enumerate(messages):
|
for i, message in enumerate(messages):
|
||||||
role_mapping = {
|
role_mapping = {
|
||||||
@@ -818,21 +811,23 @@ class PromptAgent:
|
|||||||
"system": "system"
|
"system": "system"
|
||||||
}
|
}
|
||||||
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
|
assert len(message["content"]) in [1, 2], "One text, or one text with one image"
|
||||||
|
gemini_message = {
|
||||||
|
"role": role_mapping[message["role"]],
|
||||||
|
"parts": []
|
||||||
|
}
|
||||||
|
|
||||||
# Gemini only supports the last image as a single image input
|
# Gemini only supports the last image as a single image input
|
||||||
for part in message["content"]:
|
for part in message["content"]:
|
||||||
gemini_message = {
|
|
||||||
"role": role_mapping[message["role"]],
|
|
||||||
"parts": []
|
|
||||||
}
|
|
||||||
if part['type'] == "image_url":
|
if part['type'] == "image_url":
|
||||||
gemini_message['parts'].append(encoded_img_to_pil_img(part['image_url']['url']))
|
# Put the image at the beginning of the message
|
||||||
|
gemini_message['parts'].insert(0, encoded_img_to_pil_img(part['image_url']['url']))
|
||||||
elif part['type'] == "text":
|
elif part['type'] == "text":
|
||||||
gemini_message['parts'].append(part['text'])
|
gemini_message['parts'].append(part['text'])
|
||||||
else:
|
else:
|
||||||
raise ValueError("Invalid content type: " + part['type'])
|
raise ValueError("Invalid content type: " + part['type'])
|
||||||
|
|
||||||
gemini_messages.append(gemini_message)
|
gemini_messages.append(gemini_message)
|
||||||
|
|
||||||
# The system message of gemini-1.5-pro-latest needs to be passed in through the model initialization parameter
|
# The system message of gemini-1.5-pro-latest needs to be passed in through the model initialization parameter
|
||||||
system_instruction = None
|
system_instruction = None
|
||||||
@@ -849,33 +844,34 @@ class PromptAgent:
|
|||||||
self.model,
|
self.model,
|
||||||
system_instruction=system_instruction
|
system_instruction=system_instruction
|
||||||
)
|
)
|
||||||
try:
|
|
||||||
response = gemini_model.generate_content(
|
with open("response.json", "w") as f:
|
||||||
gemini_messages,
|
messages_to_save = []
|
||||||
generation_config={
|
for message in gemini_messages:
|
||||||
"candidate_count": 1,
|
messages_to_save.append({
|
||||||
"max_output_tokens": max_tokens,
|
"role": message["role"],
|
||||||
"top_p": top_p,
|
"content": [part if isinstance(part, str) else "image" for part in message["parts"]]
|
||||||
"temperature": temperature
|
})
|
||||||
},
|
json.dump(messages_to_save, f, indent=4)
|
||||||
safety_settings={
|
|
||||||
"harassment": "block_none",
|
response = gemini_model.generate_content(
|
||||||
"hate": "block_none",
|
gemini_messages,
|
||||||
"sex": "block_none",
|
generation_config={
|
||||||
"danger": "block_none"
|
"candidate_count": 1,
|
||||||
},
|
"max_output_tokens": max_tokens,
|
||||||
request_options=request_options
|
"top_p": top_p,
|
||||||
)
|
"temperature": temperature
|
||||||
for uploaded_file in uploaded_files:
|
},
|
||||||
genai.delete_file(name=uploaded_file.name)
|
safety_settings={
|
||||||
return response.text
|
"harassment": "block_none",
|
||||||
except Exception as e:
|
"hate": "block_none",
|
||||||
logger.error("Meet exception when calling Gemini API, " + str(e.__class__.__name__) + str(e))
|
"sex": "block_none",
|
||||||
logger.error(f"count_tokens: {gemini_model.count_tokens(gemini_messages)}")
|
"danger": "block_none"
|
||||||
logger.error(f"generation_config: {max_tokens}, {top_p}, {temperature}")
|
},
|
||||||
for uploaded_file in uploaded_files:
|
request_options=request_options
|
||||||
genai.delete_file(name=uploaded_file.name)
|
)
|
||||||
return ""
|
|
||||||
|
return response.text
|
||||||
|
|
||||||
elif self.model.startswith("qwen"):
|
elif self.model.startswith("qwen"):
|
||||||
messages = payload["messages"]
|
messages = payload["messages"]
|
||||||
|
|||||||
Reference in New Issue
Block a user