diff --git a/mm_agents/agent.py b/mm_agents/agent.py
index 85db78b..4314c63 100644
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -10,16 +10,10 @@ from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List

-import backoff
 import dashscope
 import google.generativeai as genai
 import requests
 from PIL import Image
-from vertexai.preview.generative_models import (
-    HarmBlockThreshold,
-    HarmCategory,
-    Image,
-)

 from mm_agents.accessibility_tree_wrap.heuristic_retrieve import find_leaf_nodes, filter_nodes, draw_bounding_boxes
 from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_SCREENSHOT_OUT_ACTION, \
@@ -28,8 +22,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S
     SYS_PROMPT_IN_SOM_A11Y_OUT_TAG, \
     SYS_PROMPT_SEEACT, ACTION_DESCRIPTION_PROMPT_SEEACT, ACTION_GROUNDING_PROMPT_SEEACT

-# todo: cross-check with visualwebarena
-
 logger = logging.getLogger("desktopenv.agent")
@@ -43,7 +35,7 @@
     # leaf_nodes = find_leaf_nodes(accessibility_tree)
     filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree))

-    linearized_accessibility_tree = "tag\tname\ttext\tposition\tsize\n"
+    linearized_accessibility_tree = "tag\tname\ttext\tposition (top-left x&y)\tsize (w&h)\n"

     # Linearize the accessibility tree nodes into a table format
     for node in filtered_nodes:
@@ -205,7 +197,7 @@ class PromptAgent:
                 self.system_message = SYS_PROMPT_IN_A11Y_OUT_CODE
             else:
                 raise ValueError("Invalid action space: " + action_space)
-        elif observation_type == "both":
+        elif observation_type == "screenshot_a11y_tree":
            if action_space == "computer_13":
                self.system_message = SYS_PROMPT_IN_BOTH_OUT_ACTION
            elif action_space == "pyautogui":
@@ -233,8 +225,7 @@ class PromptAgent:
        """
        Predict the next action(s) based on the current observation.
        """
-        self.system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(
-            instruction)
+        system_message = self.system_message + "\nYou are asked to complete the following task: {}".format(instruction)

        # Prepare the payload for the API call
        messages = []
@@ -245,7 +236,7 @@ class PromptAgent:
            "content": [
                {
                    "type": "text",
-                    "text": self.system_message
+                    "text": system_message
                },
            ]
        })
@@ -266,7 +257,7 @@ class PromptAgent:

        for previous_obs, previous_action, previous_thought in zip(_observations, _actions, _thoughts):
            # {{{1
-            if self.observation_type == "both":
+            if self.observation_type == "screenshot_a11y_tree":
                _screenshot = previous_obs["screenshot"]
                _linearized_accessibility_tree = previous_obs["accessibility_tree"]
                logger.debug("LINEAR AT: %s", _linearized_accessibility_tree)
@@ -356,11 +347,11 @@ class PromptAgent:
            })

        # {{{1
-        if self.observation_type in ["screenshot", "both"]:
+        if self.observation_type in ["screenshot", "screenshot_a11y_tree"]:
            base64_image = encode_image(obs["screenshot"])
            linearized_accessibility_tree = linearize_accessibility_tree(accessibility_tree=obs["accessibility_tree"])

-            if self.observation_type == "both":
+            if self.observation_type == "screenshot_a11y_tree":
                self.observations.append({
                    "screenshot": base64_image,
                    "accessibility_tree": linearized_accessibility_tree
@@ -473,7 +464,9 @@ class PromptAgent:
        response = self.call_llm({
            "model": self.model,
            "messages": messages,
-            "max_tokens": self.max_tokens
+            "max_tokens": self.max_tokens,
+            "top_p": self.top_p,
+            "temperature": self.temperature
        })

        logger.info("RESPONSE: %s", response)
@@ -520,11 +513,11 @@ class PromptAgent:

        return actions

-    @backoff.on_exception(
-        backoff.expo,
-        (Exception),
-        max_tries=5
-    )
+    # @backoff.on_exception(
+    #     backoff.expo,
+    #     (Exception),
+    #     max_tries=5
+    # )
    def call_llm(self, payload):

        if self.model.startswith("gpt"):
@@ -542,14 +535,14 @@ class PromptAgent:
        if response.status_code != 200:
            if response.json()['error']['code'] == "context_length_exceeded":
                logger.error("Context length exceeded. Retrying with a smaller context.")
-                payload["messages"] = payload["messages"][-1:]
+                payload["messages"] = [payload["messages"][0]] + payload["messages"][-1:]
                retry_response = requests.post(
                    "https://api.openai.com/v1/chat/completions",
                    headers=headers,
                    json=payload
                )
                if retry_response.status_code != 200:
-                    logger.error("Failed to call LLM: " + retry_response.text)
+                    logger.error("Failed to call LLM even after shortening the history: " + retry_response.text)
                    return ""

            logger.error("Failed to call LLM: " + response.text)
@@ -656,8 +649,9 @@ class PromptAgent:
            for message in gemini_messages:
                message_history_str += "<|" + message['role'] + "|>\n" + message['parts'][0] + "\n"
            gemini_messages = [{"role": "user", "parts": [message_history_str, gemini_messages[-1]['parts'][1]]}]
+            # gemini_messages[-1]['parts'][1].save("output.png", "PNG")

-        print(gemini_messages)
+        # print(gemini_messages)
        api_key = os.environ.get("GENAI_API_KEY")
        assert api_key is not None, "Please set the GENAI_API_KEY environment variable"
        genai.configure(api_key=api_key)
@@ -671,11 +665,10 @@ class PromptAgent:
                    "temperature": temperature
                },
                safety_settings={
-                    HarmCategory.HARM_CATEGORY_UNSPECIFIED: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_HATE_SPEECH: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_DANGEROUS_CONTENT: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_HARASSMENT: HarmBlockThreshold.BLOCK_NONE,
-                    HarmCategory.HARM_CATEGORY_SEXUALLY_EXPLICIT: HarmBlockThreshold.BLOCK_NONE,
+                    "harassment": "block_none",
+                    "hate": "block_none",
+                    "sex": "block_none",
+                    "danger": "block_none"
                }
            )

@@ -726,7 +719,7 @@ class PromptAgent:

    def parse_actions(self, response: str, masks=None):

-        if self.observation_type in ["screenshot", "a11y_tree", "both"]:
+        if self.observation_type in ["screenshot", "a11y_tree", "screenshot_a11y_tree"]:
            # parse from the response
            if self.action_space == "computer_13":
                actions = parse_actions_from_string(response)
diff --git a/run.py b/run.py
index 908d479..04aec2c 100644
--- a/run.py
+++ b/run.py
@@ -66,7 +66,7 @@ def config() -> argparse.Namespace:
            "screenshot_a11y_tree",
            "som"
        ],
-        default="a11y_tree",
+        default="som",
        help="Observation type",
    )
    parser.add_argument("--screen_width", type=int, default=1920)
@@ -146,6 +146,7 @@ def test(

    step_idx = 0
    env.controller.start_recording()
+    # todo: update max running time for each example, @xiaochuan
    while not done and step_idx < max_steps:
        actions = agent.predict(
            instruction,
@@ -158,7 +159,7 @@
            action_timestamp = datetime.datetime.now().strftime("%Y%m%d@%H%M%S")
            logger.info("Step %d: %s", step_idx + 1, action)

-            observation, reward, done, info = env.step(action, args.sleep_after_execution)
+            obs, reward, done, info = env.step(action, args.sleep_after_execution)

            logger.info("Reward: %.2f", reward)
            logger.info("Done: %s", done)
@@ -167,7 +168,7 @@

            # Save screenshot and trajectory information
            with open(os.path.join(example_result_dir, f"step_{step_idx + 1}_{action_timestamp}.png"), "wb") as _f:
-                with open(observation['screenshot'], "rb") as __f:
+                with open(obs['screenshot'], "rb") as __f:
                    screenshot = __f.read()
                _f.write(screenshot)

@@ -186,22 +187,24 @@
            if done:
                logger.info("The episode is done.")
                break
-
-        result = env.evaluate()
+        try:
+            result = env.evaluate()
+        except Exception as e:
+            logger.error(f"Error in evaluating the example {example_id}: {e}")
+            result = 0.0
        logger.info("Result: %.2f", result)
-        scores.append(result)
        env.controller.end_recording(os.path.join(example_result_dir, "recording.mp4"))
+        scores.append(result)
+        with open(os.path.join(example_result_dir, "result.txt"), "w", encoding="utf-8") as f:
+            f.write(f"{result}\n")

    env.close()
    logger.info(f"Average score: {sum(scores) / len(scores)}")


-def get_unfinished(test_file_list, result_dir):
-    finished = []
-    for domain in os.listdir(result_dir):
-        for example_id in os.listdir(os.path.join(result_dir, domain)):
-            finished.append(f"{domain}/{example_id}")
-    return [x for x in test_file_list if x not in finished]
+def get_unfinished(test, result_dir):
+    # todo @xiaochuan
+    pass


 if __name__ == '__main__':
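Note on the retry removal: with `import backoff` gone and the `@backoff.on_exception` decorator commented out, `call_llm` no longer retries transient API failures at all. If retries are still wanted without the extra dependency, a hand-rolled wrapper along these lines could stand in — a minimal sketch only, mirroring the removed decorator's exponential backoff and `max_tries=5`; `call_with_retries` is a hypothetical helper, not part of this patch:

```python
import logging
import time

logger = logging.getLogger("desktopenv.agent")


def call_with_retries(fn, payload, max_tries=5, base_delay=1.0):
    """Hypothetical stand-in for the removed @backoff.on_exception decorator.

    Retries fn(payload) with exponential backoff, re-raising the last
    exception after max_tries attempts (matching the decorator's max_tries=5).
    """
    for attempt in range(max_tries):
        try:
            return fn(payload)
        except Exception as e:
            if attempt == max_tries - 1:
                raise
            delay = base_delay * (2 ** attempt)  # 1s, 2s, 4s, ...
            logger.warning("call_llm failed (%s); retrying in %.1fs", e, delay)
            time.sleep(delay)
```

Usage would be `response = call_with_retries(self.call_llm, payload)` in place of the direct call, if retrying at the call site is preferred over reinstating the decorator.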
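Likewise, `get_unfinished` is stubbed out pending the @xiaochuan todo. For reference, a sketch of how it might be restored, combining the deleted `domain/example_id` scan with the per-example `result.txt` now written in `test()` — the `result.txt` existence check and the `result_dir/<domain>/<example_id>` layout are assumptions carried over from the deleted code, not something this patch specifies:

```python
import os
from typing import List


def get_unfinished(test_file_list: List[str], result_dir: str) -> List[str]:
    """Sketch: return the examples that have no recorded result yet.

    Assumes results live at result_dir/<domain>/<example_id>/, as in the
    implementation deleted above, and treats an example as finished only
    if its result.txt was written (an assumption, not part of this patch).
    """
    if not os.path.isdir(result_dir):
        return test_file_list  # nothing has run yet

    finished = set()
    for domain in os.listdir(result_dir):
        domain_dir = os.path.join(result_dir, domain)
        if not os.path.isdir(domain_dir):
            continue
        for example_id in os.listdir(domain_dir):
            if os.path.exists(os.path.join(domain_dir, example_id, "result.txt")):
                finished.add(f"{domain}/{example_id}")

    return [x for x in test_file_list if x not in finished]
```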