ver Jan29th

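Updated the position of the SoM marks: each index label is now anchored at its left-baseline ("lb") on the bounding box's bottom-left corner, and its black background is sized from the rendered text's bounding box instead of a fixed 25x18 rectangle.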
2024-01-29 21:49:53 +08:00
parent e37f0037c4
commit d8a497a417
4 changed files with 23 additions and 12 deletions
--- a/2
+++ b/2
@@ -1 +1 @@
-baseline
+baseline_som
--- a/experiment_screenshot_som.py
+++ b/experiment_screenshot_som.py
@@ -1,4 +1,4 @@
-import ctypes
+#import ctypes
 import datetime
 import json
 import logging
@@ -43,9 +43,11 @@ logger.addHandler(sdebug_handler)
 logger = logging.getLogger("desktopenv.experiment")
-PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
+#PATH_TO_VM = r"C:\Users\tianbaox\Documents\Virtual Machines\Ubuntu\Ubuntu.vmx"
 PATH_TO_VM = "../../../../大文件/镜像/Ubuntu-1218/Ubuntu/Ubuntu.vmx"
 def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_trajectory", recording=True):
    trajectory_recording_path = os.path.join(example_trajectory_dir, "trajectory.json")
    env = DesktopEnv(
@@ -125,14 +127,16 @@ def run_one_example(example, agent, max_steps=10, example_trajectory_dir="exp_tr
 def main(example_class, example_id):
    action_space = "pyautogui"
    gpt4_model = "gpt-4-vision-preview"
-    gemini_model = "gemini-pro-vision"
+    #gemini_model = "gemini-pro-vision"
    with open(f"evaluation_examples/examples/{example_class}/{example_id}.json", "r", encoding="utf-8") as f:
        example = json.load(f)
-    example["snapshot"] = "exp_v1"
+    #example["snapshot"] = "exp_v1"
    # example["snapshot"] = "exp_setup4"
    example["snapshot"] = "Snapshot 30"
    api_key = os.environ.get("OPENAI_API_KEY")
-    agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, instruction=example['instruction'],
+    agent = GPT4v_Agent(api_key=api_key, model=gpt4_model, max_tokens=1000, instruction=example['instruction'],
                        action_space=action_space, exp="som")
    # api_key = os.environ.get("GENAI_API_KEY")
@@ -149,7 +153,7 @@ def main(example_class, example_id):
 if __name__ == '__main__':
-    xx_list = [
+    xx_list = [ "01b269ae-2111-4a07-81fd-3fcd711993b0"
-    ]
+              ]
    for example_id in xx_list:
-        main("xx", example_id)
+        main("libreoffice_calc", example_id)
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -2,6 +2,7 @@ import xml.etree.ElementTree as ET
 from PIL import Image, ImageDraw, ImageFont
 from typing import Tuple
 def find_leaf_nodes(xlm_file_str):
    if not xlm_file_str:
@@ -105,15 +106,20 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
                # Draw index number at the bottom left of the bounding box with black background
                text_position = (coords[0], bottom_right[1])  # Adjust Y to be above the bottom right
-                draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
+                text_bbox: Tuple[int, int ,int ,int] = draw.textbbox(text_position, str(index), font=font, anchor="lb")
-                draw.text(text_position, str(index), font=font, fill="white")
+                #offset: int = bottom_right[1]-text_bbox[3]
                #text_bbox = (text_bbox[0], text_bbox[1]+offset, text_bbox[2], text_bbox[3]+offset)
                #draw.rectangle([text_position, (text_position[0] + 25, text_position[1] + 18)], fill='black')
                draw.rectangle(text_bbox, fill='black')
                draw.text(text_position, str(index), font=font, anchor="lb", fill="white")
                index += 1
                # each mark is an x, y, w, h tuple
                marks.append([coords[0], coords[1], size[0], size[1]])
                drew_nodes.append(_node)
-            except ValueError as e:
+            except ValueError:
                pass
    # Save the result
--- a/requirements.txt
+++ b/requirements.txt
@@ -38,3 +38,4 @@ pydrive
 fastdtw
 openai
 func-timeout