diff --git a/branch_flag b/branch_flag index 180b47c..7e5437f 100644 --- a/branch_flag +++ b/branch_flag @@ -1 +1 @@ -baseline +baseline_som diff --git a/experiment_screenshot_som.py b/experiment_screenshot_som.py index 2ecdafe..a8682ba 100644 --- a/experiment_screenshot_som.py +++ b/experiment_screenshot_som.py @@ -1,4 +1,4 @@ -import ctypes +#import ctypes import datetime import json import logging @@ -43,9 +43,11 @@ logger.addHandler(sdebug_handler) logger = logging.getLogger("desktopenv.experiment") -PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" +#PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx" +PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx" + def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True): trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json") env = DesktopEnv( @@ -125,14 +127,16 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr def main(example_class, example_id): action_space = "pyautogui" gpt4_model = "gpt-4-vision-preview" - gemini_model = "gemini-pro-vision" + #gemini_model = "gemini-pro-vision" with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f: example = json.load(f) - example["snapshot"] = "exp_v1" + #example["snapshot"] = "exp_v1" + # example["snapshot"] = "exp_setup4" + example["snapshot"] = "Snapshot 30" api_key = os.environ.get("OPENAI_API_KEY") - agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'], + agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, max_tokens=1000, instruction=example['instruction'], action_space=action_space, exp="som") # api_key = os.environ.get("GENAI_API_KEY") @@ -149,7 +153,7 @@ def main(example_class, example_id): if __name__ == '__main__': - xx_list = [ - ] + xx_list = [ "01b269ae-2111-4a07-81fd-3fcd711993b0" + ] for example_id in xx_list: - main("xx", example_id) + main("libreoffice_calc", example_id) diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index ccb4886..7ab439f 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -2,6 +2,7 @@ import xml.etree.ElementTree as ET from PIL import Image, ImageDraw, ImageFont +from typing import Tuple def find_leaf_nodes(xlm_file_str): if not xlm_file_str: @@ -105,15 +106,20 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): # Draw index number at the bottom left of the bounding box with black background text_position = (coords[0], bottom_right[1]) # Adjust Y to be above the bottom right - draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black') - draw.text(text_position, str(index), font=font, fill="white") + text_bbox: Tuple[int, int ,int ,int] = draw.textbbox(text_position, str(index), font=font, anchor="lb") + #offset: int = bottom_right[1]-text_bbox[3] + #text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset) + + #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black') + draw.rectangle(text_bbox, fill='black') + draw.text(text_position, str(index), font=font, anchor="lb", fill="white") index += 1 # each mark is an x, y, w, h tuple marks.append([coords[0], coords[1], size[0], size[1]]) drew_nodes.append(_node) - except ValueError as e: + except ValueError: pass # Save the result diff --git a/requirements.txt b/requirements.txt index b494102..23e6a27 100644 --- a/requirements.txt +++ b/requirements.txt @@ -38,3 +38,4 @@ pydrive fastdtw openai +func-timeout