Fix bugs
This commit is contained in:
@@ -10,16 +10,10 @@ from http import HTTPStatus
|
||||
from io import BytesIO
|
||||
from typing import Dict, List
|
||||
|
||||
import backoff
|
||||
import dashscope
|
||||
import google.generativeai as genai
|
||||
import requests
|
||||
from PIL import Image
|
||||
from vertexai.preview.generative_models import (
|
||||
HarmBlockThreshold,
|
||||
HarmCategory,
|
||||
Image,
|
||||
)
|
||||
|
||||
from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes
|
||||
from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
|
||||
@@ -28,8 +22,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
|
||||
SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
|
||||
SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT
|
||||
|
||||
# todo: cross-check with visualwebarena
|
||||
|
||||
logger = logging.getLogger("desktopenv.agent")
|
||||
|
||||
|
||||
@@ -43,7 +35,7 @@ def linearize_accessibility_tree(accessibility_tree):
|
||||
# leaf_nodes = find_leaf_nodes(accessibility_tree)
|
||||
filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))
|
||||
|
||||
linearized_accessibility_tree = "tag\tname\ttext\tposition\tsize\n"
|
||||
linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"
|
||||
# Linearize the accessibility tree nodes into a table format
|
||||
|
||||
for node in filtered_nodes:
|
||||
@@ -205,7 +197,7 @@ class PromptAgent:
|
||||
self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE
|
||||
else:
|
||||
raise ValueError("Invalid action space: " + action_space)
|
||||
elif observation_type == "both":
|
||||
elif observation_type == "screenshot_a11y_tree":
|
||||
if action_space == "computer_13":
|
||||
self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION
|
||||
elif action_space == "pyautogui":
|
||||
@@ -233,8 +225,7 @@ class PromptAgent:
|
||||
"""
|
||||
Predict the next action(s) based on the current observation.
|
||||
"""
|
||||
self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(
|
||||
instruction)
|
||||
system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)
|
||||
|
||||
# Prepare the payload for the API call
|
||||
messages = []
|
||||
@@ -245,7 +236,7 @@ class PromptAgent:
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": self.system_message
|
||||
"text": system_message
|
||||
},
|
||||
]
|
||||
})
|
||||
@@ -266,7 +257,7 @@ class PromptAgent:
|
||||
for previous_obs, previous_action, previous_thought in zip(_observations, _actions, _thoughts):
|
||||
|
||||
# {{{1
|
||||
if self.observation_type == "both":
|
||||
if self.observation_type == "screenshot_a11y_tree":
|
||||
_screenshot = previous_obs["screenshot"]
|
||||
_linearized_accessibility_tree = previous_obs["accessibility_tree"]
|
||||
logger.debug("LINEAR AT: %s", _linearized_accessibility_tree)
|
||||
@@ -356,11 +347,11 @@ class PromptAgent:
|
||||
})
|
||||
|
||||
# {{{1
|
||||
if self.observation_type in ["screenshot", "both"]:
|
||||
if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
|
||||
base64_image = encode_image(obs["screenshot"])
|
||||
linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])
|
||||
|
||||
if self.observation_type == "both":
|
||||
if self.observation_type == "screenshot_a11y_tree":
|
||||
self.observations.append({
|
||||
"screenshot": base64_image,
|
||||
"accessibility_tree": linearized_accessibility_tree
|
||||
@@ -473,7 +464,9 @@ class PromptAgent:
|
||||
response = self.call_llm({
|
||||
"model": self.model,
|
||||
"messages": messages,
|
||||
"max_tokens": self.max_tokens
|
||||
"max_tokens": self.max_tokens,
|
||||
"top_p": self.top_p,
|
||||
"temperature": self.temperature
|
||||
})
|
||||
|
||||
logger.info("RESPONSE: %s", response)
|
||||
@@ -520,11 +513,11 @@ class PromptAgent:
|
||||
|
||||
return actions
|
||||
|
||||
@backoff.on_exception(
|
||||
backoff.expo,
|
||||
(Exception),
|
||||
max_tries=5
|
||||
)
|
||||
# @backoff.on_exception(
|
||||
# backoff.expo,
|
||||
# (Exception),
|
||||
# max_tries=5
|
||||
# )
|
||||
def call_llm(self, payload):
|
||||
|
||||
if self.model.startswith("gpt"):
|
||||
@@ -542,14 +535,14 @@ class PromptAgent:
|
||||
if response.status_code != 200:
|
||||
if response.json()['error']['code'] == "context_length_exceeded":
|
||||
logger.error("Context length exceeded. Retrying with a smaller context.")
|
||||
payload["messages"] = payload["messages"][-1:]
|
||||
payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:]
|
||||
retry_response = requests.post(
|
||||
"https://api.openai.com/v1/chat/completions",
|
||||
headers=headers,
|
||||
json=payload
|
||||
)
|
||||
if retry_response.status_code != 200:
|
||||
logger.error("Failed to call LLM: " + retry_response.text)
|
||||
logger.error("Failed to call LLM even after attempt on shortening the history: " + retry_response.text)
|
||||
return ""
|
||||
|
||||
logger.error("Failed to call LLM: " + response.text)
|
||||
@@ -656,8 +649,9 @@ class PromptAgent:
|
||||
for message in gemini_messages:
|
||||
message_history_str += "<|" + message['role'] + "|>\n" + message['parts'][0] + "\n"
|
||||
gemini_messages = [{"role": "user", "parts": [message_history_str, gemini_messages[-1]['parts'][1]]}]
|
||||
# gemini_messages[-1]['parts'][1].save("output.png", "PNG")
|
||||
|
||||
print(gemini_messages)
|
||||
# print(gemini_messages)
|
||||
api_key = os.environ.get("GENAI_API_KEY")
|
||||
assert api_key is not None, "Please set the GENAI_API_KEY environment variable"
|
||||
genai.configure(api_key=api_key)
|
||||
@@ -671,11 +665,10 @@ class PromptAgent:
|
||||
"temperature": temperature
|
||||
},
|
||||
safety_settings={
|
||||
HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
|
||||
HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
|
||||
"harassment": "block_none",
|
||||
"hate": "block_none",
|
||||
"sex": "block_none",
|
||||
"danger": "block_none"
|
||||
}
|
||||
)
|
||||
|
||||
@@ -726,7 +719,7 @@ class PromptAgent:
|
||||
|
||||
def parse_actions(self, response: str, masks=None):
|
||||
|
||||
if self.observation_type in ["screenshot", "a11y_tree", "both"]:
|
||||
if self.observation_type in ["screenshot", "a11y_tree", "screenshot_a11y_tree"]:
|
||||
# parse from the response
|
||||
if self.action_space == "computer_13":
|
||||
actions = parse_actions_from_string(response)
|
||||
|
||||
27
run.py
27
run.py
@@ -66,7 +66,7 @@ def config() -> argparse.Namespace:
|
||||
"screenshot_a11y_tree",
|
||||
"som"
|
||||
],
|
||||
default="a11y_tree",
|
||||
default="som",
|
||||
help="Observation type",
|
||||
)
|
||||
parser.add_argument("--screen_width", type=int, default=1920)
|
||||
@@ -146,6 +146,7 @@ def test(
|
||||
step_idx = 0
|
||||
env.controller.start_recording()
|
||||
|
||||
# todo: update max running time for each example, @xiaochuan
|
||||
while not done and step_idx < max_steps:
|
||||
actions = agent.predict(
|
||||
instruction,
|
||||
@@ -158,7 +159,7 @@ def test(
|
||||
action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
|
||||
logger.info("Step %d: %s", step_idx + 1, action)
|
||||
|
||||
observation, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
obs, reward, done, info = env.step(action, args.sleep_after_execution)
|
||||
|
||||
logger.info("Reward: %.2f", reward)
|
||||
logger.info("Done: %s", done)
|
||||
@@ -167,7 +168,7 @@ def test(
|
||||
# Save screenshot and trajectory information
|
||||
with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"),
|
||||
"wb") as _f:
|
||||
with open(observation['screenshot'], "rb") as __f:
|
||||
with open(obs['screenshot'], "rb") as __f:
|
||||
screenshot = __f.read()
|
||||
_f.write(screenshot)
|
||||
|
||||
@@ -186,22 +187,24 @@ def test(
|
||||
if done:
|
||||
logger.info("The episode is done.")
|
||||
break
|
||||
|
||||
result = env.evaluate()
|
||||
try:
|
||||
result = env.evaluate()
|
||||
except Exception as e:
|
||||
logger.error(f"Error in evaluating the example {example_id}: {e}")
|
||||
result = 0.0
|
||||
logger.info("Result: %.2f", result)
|
||||
scores.append(result)
|
||||
env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
|
||||
scores.append(result)
|
||||
with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
|
||||
f.write(f"{result}\n")
|
||||
|
||||
env.close()
|
||||
logger.info(f"Average score: {sum(scores) / len(scores)}")
|
||||
|
||||
|
||||
def get_unfinished(test_file_list, result_dir):
|
||||
finished = []
|
||||
for domain in os.listdir(result_dir):
|
||||
for example_id in os.listdir(os.path.join(result_dir, domain)):
|
||||
finished.append(f"{domain}/{example_id}")
|
||||
return [x for x in test_file_list if x not in finished]
|
||||
def get_unfinished(test, result_dir):
|
||||
# todo @xiaochuan
|
||||
pass
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
Reference in New Issue
Block a user