Fix corner cases (val connection in Chrome when using Playwright, action parsing for the agent, and accessibility tree XML handling)
This commit is contained in:
@@ -159,9 +159,19 @@ def get_open_tabs_info(env, config: Dict[str, str]):
|
|||||||
tabs_info = []
|
tabs_info = []
|
||||||
for context in browser.contexts:
|
for context in browser.contexts:
|
||||||
for page in context.pages:
|
for page in context.pages:
|
||||||
title = page.title()
|
try:
|
||||||
url = page.url
|
# Wait for the page to finish loading, this prevents the "execution context was destroyed" issue
|
||||||
tabs_info.append({'title': title, 'url': url})
|
page.wait_for_load_state('load') # Wait for the 'load' event to complete
|
||||||
|
title = page.title()
|
||||||
|
url = page.url
|
||||||
|
tabs_info.append({'title': title, 'url': url})
|
||||||
|
except TimeoutError:
|
||||||
|
# If page loading times out, catch the exception and store the current information in the list
|
||||||
|
tabs_info.append({'title': 'Load timeout', 'url': page.url})
|
||||||
|
except Exception as e:
|
||||||
|
# Catch other potential exceptions that might occur while reading the page title
|
||||||
|
print(f'Error: {e}')
|
||||||
|
tabs_info.append({'title': 'Error encountered', 'url': page.url})
|
||||||
|
|
||||||
browser.close()
|
browser.close()
|
||||||
return tabs_info
|
return tabs_info
|
||||||
|
|||||||
@@ -111,21 +111,25 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
|
|||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
action_space = "pyautogui"
|
action_space = "pyautogui"
|
||||||
example_class = "chrome"
|
example_class = "chrome"
|
||||||
example_id = "06fe7178-4491-4589-810f-2e2bc9502122"
|
example_id = "7a5a7856-f1b6-42a4-ade9-1ca81ca0f263"
|
||||||
|
gpt4_model = "gpt-4-1106-preview"
|
||||||
|
gemini_model = "gemini-pro-vision"
|
||||||
|
|
||||||
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
|
with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r") as f:
|
||||||
example = json.load(f)
|
example = json.load(f)
|
||||||
example["snapshot"] = "exp_setup4"
|
example["snapshot"] = "exp_setup4"
|
||||||
|
|
||||||
api_key = os.environ.get("OPENAI_API_KEY")
|
api_key = os.environ.get("OPENAI_API_KEY")
|
||||||
agent = GPT4_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
|
agent = GPT4_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], action_space=action_space)
|
||||||
|
|
||||||
# api_key = os.environ.get("GENAI_API_KEY")
|
# api_key = os.environ.get("GENAI_API_KEY")
|
||||||
# agent = GeminiPro_Agent(api_key=api_key, instruction=example['instruction'], action_space=action_space)
|
# agent = GeminiPro_Agent(api_key=api_key, model=gemini_model, instruction=example['instruction'], action_space=action_space)
|
||||||
|
|
||||||
root_trajectory_dir = "exp_trajectory"
|
root_trajectory_dir = "exp_trajectory"
|
||||||
|
|
||||||
example_trajectory_dir = os.path.join(root_trajectory_dir, example_class, example_id)
|
example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gpt4_model, example_id)
|
||||||
|
# example_trajectory_dir = os.path.join(root_trajectory_dir, "text", example_class, gemini_model, example_id)
|
||||||
|
|
||||||
os.makedirs(example_trajectory_dir, exist_ok=True)
|
os.makedirs(example_trajectory_dir, exist_ok=True)
|
||||||
|
|
||||||
run_one_example(example, agent, 10, example_trajectory_dir)
|
run_one_example(example, agent, 15, example_trajectory_dir)
|
||||||
|
|||||||
@@ -3,8 +3,11 @@ import xml.etree.ElementTree as ET
|
|||||||
from PIL import Image, ImageDraw, ImageFont
|
from PIL import Image, ImageDraw, ImageFont
|
||||||
|
|
||||||
|
|
||||||
def find_leaf_nodes(xml_file_path):
|
def find_leaf_nodes(xlm_file_str):
|
||||||
root = ET.fromstring(xml_file_path)
|
if not xlm_file_str:
|
||||||
|
return []
|
||||||
|
|
||||||
|
root = ET.fromstring(xlm_file_str)
|
||||||
|
|
||||||
# Recursive function to traverse the XML tree and collect leaf nodes
|
# Recursive function to traverse the XML tree and collect leaf nodes
|
||||||
def collect_leaf_nodes(node, leaf_nodes):
|
def collect_leaf_nodes(node, leaf_nodes):
|
||||||
@@ -97,19 +100,3 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
|
|||||||
|
|
||||||
# Save the result
|
# Save the result
|
||||||
image.save(output_image_file_path)
|
image.save(output_image_file_path)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
with open('chrome_desktop_example_1.xml', 'r', encoding='utf-8') as f:
|
|
||||||
xml_string = f.read()
|
|
||||||
image_file_path = 'screenshot.png' # Replace with your actual screenshot image path
|
|
||||||
output_image_file_path = 'annotated_screenshot.png' # Replace with your desired output image path
|
|
||||||
|
|
||||||
leaf_nodes = find_leaf_nodes(xml_string)
|
|
||||||
filtered_nodes = filter_nodes(leaf_nodes)
|
|
||||||
print(f"Found {len(filtered_nodes)} filtered nodes")
|
|
||||||
|
|
||||||
for node in filtered_nodes:
|
|
||||||
print(node.tag, node.attrib)
|
|
||||||
|
|
||||||
draw_bounding_boxes(filtered_nodes, image_file_path, output_image_file_path)
|
|
||||||
|
|||||||
@@ -61,11 +61,27 @@ def parse_code_from_string(input_string):
|
|||||||
# so the code inside backticks can span multiple lines.
|
# so the code inside backticks can span multiple lines.
|
||||||
|
|
||||||
# matches now contains all the captured code snippets
|
# matches now contains all the captured code snippets
|
||||||
return matches
|
|
||||||
|
codes = []
|
||||||
|
|
||||||
|
for match in matches:
|
||||||
|
match = match.strip()
|
||||||
|
commands = ['WAIT', 'DONE', 'FAIL'] # fixme: updates this part when we have more commands
|
||||||
|
|
||||||
|
if match in commands:
|
||||||
|
codes.append(match.strip())
|
||||||
|
elif match.split('\n')[-1] in commands:
|
||||||
|
if len(match.split('\n')) > 1:
|
||||||
|
codes.append("\n".join(match.split('\n')[:-1]))
|
||||||
|
codes.append(match.split('\n')[-1])
|
||||||
|
else:
|
||||||
|
codes.append(match)
|
||||||
|
|
||||||
|
return codes
|
||||||
|
|
||||||
|
|
||||||
class GPT4_Agent:
|
class GPT4_Agent:
|
||||||
def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=300, action_space="computer_13"):
|
def __init__(self, api_key, instruction, model="gpt-4-1106-preview", max_tokens=600, action_space="computer_13"):
|
||||||
self.instruction = instruction
|
self.instruction = instruction
|
||||||
self.model = model
|
self.model = model
|
||||||
self.max_tokens = max_tokens
|
self.max_tokens = max_tokens
|
||||||
@@ -121,14 +137,17 @@ class GPT4_Agent:
|
|||||||
]
|
]
|
||||||
})
|
})
|
||||||
|
|
||||||
|
# print(
|
||||||
|
# "Given the XML format of accessibility tree as below:\n{}\nWhat's the next step that you will do to help with the task?".format(
|
||||||
|
# linearized_accessibility_tree)
|
||||||
|
# )
|
||||||
|
|
||||||
traj_to_show = []
|
traj_to_show = []
|
||||||
for i in range(len(self.trajectory)):
|
for i in range(len(self.trajectory)):
|
||||||
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
|
traj_to_show.append(self.trajectory[i]["content"][0]["text"])
|
||||||
if len(self.trajectory[i]["content"]) > 1:
|
if len(self.trajectory[i]["content"]) > 1:
|
||||||
traj_to_show.append("screenshot_obs")
|
traj_to_show.append("screenshot_obs")
|
||||||
|
|
||||||
print("Trajectory:", traj_to_show)
|
|
||||||
|
|
||||||
payload = {
|
payload = {
|
||||||
"model": self.model,
|
"model": self.model,
|
||||||
"messages": self.trajectory,
|
"messages": self.trajectory,
|
||||||
|
|||||||
1
mm_agents/gui_som/READAME.md
Normal file
1
mm_agents/gui_som/READAME.md
Normal file
@@ -0,0 +1 @@
|
|||||||
|
Deprecated, since we found we can use `accelerator` to do the same thing. It could still be useful in the future when only access to the screen is available.
|
||||||
Reference in New Issue
Block a user