From 20b1d950a0b06de827e809eec548929992e20a6e Mon Sep 17 00:00:00 2001 From: Timothyxxx <384084775@qq.com> Date: Tue, 16 Jan 2024 22:00:01 +0800 Subject: [PATCH] Fix corner cases (val connection in chrome when using playwright, and action parsing for agent, and accessibility tree xml handling) --- desktop_env/evaluators/getters/chrome.py | 16 ++++++++--- experiment_pure_text.py | 14 ++++++---- .../heuristic_retrieve.py | 23 ++++------------ mm_agents/gpt_4_agent.py | 27 ++++++++++++++++--- mm_agents/gui_som/READAME.md | 1 + 5 files changed, 51 insertions(+), 30 deletions(-) create mode 100644 mm_agents/gui_som/READAME.md diff --git a/desktop_env/evaluators/getters/chrome.py b/desktop_env/evaluators/getters/chrome.py index 1b77016..d58d08b 100644 --- a/desktop_env/evaluators/getters/chrome.py +++ b/desktop_env/evaluators/getters/chrome.py @@ -159,9 +159,19 @@ def get_open_tabs_info(env, config: Dict[str, str]): tabs_info = [] for context in browser.contexts: for page in context.pages: - title = page.title() - url = page.url - tabs_info.append({'title': title, 'url': url}) + try: + # Wait for the page to finish loading, this prevents the "execution context was destroyed" issue + page.wait_for_load_state('load') # Wait for the 'load' event to complete + title = page.title() + url = page.url + tabs_info.append({'title': title, 'url': url}) + except TimeoutError: + # If page loading times out, catch the exception and store the current information in the list + tabs_info.append({'title': 'Load timeout', 'url': page.url}) + except Exception as e: + # Catch other potential exceptions that might occur while reading the page title + print(f'Error: {e}') + tabs_info.append({'title': 'Error encountered', 'url': page.url}) browser.close() return tabs_info diff --git a/experiment_pure_text.py b/experiment_pure_text.py index 4ab5c97..cfcbd46 100644 --- a/experiment_pure_text.py +++ b/experiment_pure_text.py @@ -111,21 +111,25 @@ def run_one_example(example, agent, max_steps=10, 
example_trajectory_dir="exp_tr if __name__ == "__main__": action_space = "pyautogui" example_class = "chrome" - example_id = "06fe7178-4491-4589-810f-2e2bc9502122" + example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263" + gpt4_model = "gpt-4-1106-preview" + gemini_model = "gemini-pro-vision" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f: example = json.load(f) example["snapshot"] = "exp_setup4" api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space) # api_key = os.environ.get("GENAI_API_KEY") - # agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space) + # agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space) root_trajectory_dir = "exp_trajectory" - example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id) + example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id) + # example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id) + os.makedirs(example_trajectory_dir, exist_ok=True) - run_one_example(example, agent, 10, example_trajectory_dir) + run_one_example(example, agent, 15, example_trajectory_dir) diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 7e4a74e..d6f83eb 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -3,8 +3,11 @@ import xml.etree.ElementTree as ET from PIL import Image, ImageDraw, ImageFont -def find_leaf_nodes(xml_file_path): - root = ET.fromstring(xml_file_path) +def find_leaf_nodes(xlm_file_str): + if 
not xlm_file_str: + return [] + + root = ET.fromstring(xlm_file_str) # Recursive function to traverse the XML tree and collect leaf nodes def collect_leaf_nodes(node, leaf_nodes): @@ -97,19 +100,3 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): # Save the result image.save(output_image_file_path) - - -if __name__ == '__main__': - with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f: - xml_string = f.read() - image_file_path = 'screenshot.png' # Replace with your actual screenshot image path - output_image_file_path = 'annotated_screenshot.png' # Replace with your desired output image path - - leaf_nodes = find_leaf_nodes(xml_string) - filtered_nodes = filter_nodes(leaf_nodes) - print(f"Found {len(filtered_nodes)} filtered nodes") - - for node in filtered_nodes: - print(node.tag, node.attrib) - - draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path) diff --git a/mm_agents/gpt_4_agent.py b/mm_agents/gpt_4_agent.py index 57a1634..aa19185 100644 --- a/mm_agents/gpt_4_agent.py +++ b/mm_agents/gpt_4_agent.py @@ -61,11 +61,27 @@ def parse_code_from_string(input_string): # so the code inside backticks can span multiple lines. 
# matches now contains all the captured code snippets - return matches + + codes = [] + + for match in matches: + match = match.strip() + commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands + + if match in commands: + codes.append(match.strip()) + elif match.split('\n')[-1] in commands: + if len(match.split('\n')) > 1: + codes.append("\n".join(match.split('\n')[:-1])) + codes.append(match.split('\n')[-1]) + else: + codes.append(match) + + return codes class GPT4_Agent: - def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"): + def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"): self.instruction = instruction self.model = model self.max_tokens = max_tokens @@ -121,14 +137,17 @@ class GPT4_Agent: ] }) + # print( + # "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format( + # linearized_accessibility_tree) + # ) + traj_to_show = [] for i in range(len(self.trajectory)): traj_to_show.append(self.trajectory[i]["content"][0]["text"]) if len(self.trajectory[i]["content"]) > 1: traj_to_show.append("screenshot_obs") - print("Trajectory:", traj_to_show) - payload = { "model": self.model, "messages": self.trajectory, diff --git a/mm_agents/gui_som/READAME.md b/mm_agents/gui_som/READAME.md new file mode 100644 index 0000000..05b15ba --- /dev/null +++ b/mm_agents/gui_som/READAME.md @@ -0,0 +1 @@ +Deprecated since we found we can use `accelaerator` to do the same thing. But can be potentially used in the future when only access to screen is available. \ No newline at end of file