diff --git a/evaluation_examples/examples/multi_apps/7f35355e-02a6-45b5-b140-f0be698bcf85.json b/evaluation_examples/examples/multi_apps/7f35355e-02a6-45b5-b140-f0be698bcf85.json index c33b042..f9161d7 100644 --- a/evaluation_examples/examples/multi_apps/7f35355e-02a6-45b5-b140-f0be698bcf85.json +++ b/evaluation_examples/examples/multi_apps/7f35355e-02a6-45b5-b140-f0be698bcf85.json @@ -9,7 +9,7 @@ "parameters": { "files": [ { - "url": "https://docs.google.com/spreadsheets/d/13YL-KC__pav2qp3sFDs1BT2wZnpWGp7s/export?format=xlsx", + "url": "https://drive.google.com/uc?export=download&id=1B5GmhdVD07UeYj9Ox20DHsA_gaxEFQ6Z", "path": "/home/user/Desktop/stock.xlsx" } ] @@ -36,7 +36,7 @@ }, "expected": { "type": "cloud_file", - "path": "https://drive.google.com/uc?export=download&id=1oPPW_dozWGII5MRmdXdKKoEK5iBkd_8Q", + "path": "https://drive.google.com/uc?export=download&id=1wzlUL1gktA0d_j9W3WSSAAUcuKr5gw-n", "dest": "result_gold.txt" } } diff --git a/evaluation_examples/test_all.json b/evaluation_examples/test_all.json index e530435..798e858 100644 --- a/evaluation_examples/test_all.json +++ b/evaluation_examples/test_all.json @@ -304,15 +304,12 @@ "os": [ "94d95f96-9699-4208-98ba-3c3119edf9c2", "bedcedc4-4d72-425e-ad62-21960b11fe0d", - "43c2d64c-bab5-4dcb-a30c-b888321c319a", - "7688b85f-87a4-4e4a-b2f8-f3d6c3f29b82", "ec4e3f68-9ea4-4c18-a5c9-69f89d1178b3", "a462a795-fdc7-4b23-b689-e8b6df786b78", "f9be0997-4b7c-45c5-b05c-4612b44a6118", "28cc3b7e-b194-4bc9-8353-d04c0f4d56d2", "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", "e0df059f-28a6-4169-924f-b9623e7184cc", - "ddc75b62-7311-4af8-bfb3-859558542b36", "b6781586-6346-41cd-935a-a6b1487918fc", "b3d4a89c-53f2-4d6b-8b6a-541fb5d205fa", "3ce045a0-877b-42aa-8d2c-b4a863336ab8", @@ -322,8 +319,6 @@ "23393935-50c7-4a86-aeea-2b78fd089c5c", "5812b315-e7bd-4265-b51f-863c02174c28", "c288e301-e626-4b98-a1ab-159dcb162af5", - "cc9d4f34-1ca0-4a1b-8ff2-09302696acb9", - "c56de254-a3ec-414e-81a6-83d2ce8c41fa", "4783cc41-c03c-4e1b-89b4-50658f642bd5", "5c1075ca-bb34-46a3-a7a0-029bd7463e79", "5ced85fc-fa1a-4217-95fd-0fb530545ce2", @@ -376,7 +371,6 @@ "4e60007a-f5be-4bfc-9723-c39affa0a6d3", "e2b5e914-ffe1-44d2-8e92-58f8c5d92bb2", "9439a27b-18ae-42d8-9778-5f68f891805e", - "ae506c68-352c-4094-9caa-ee9d42052317", "ea98c5d7-3cf9-4f9b-8ad3-366b58e0fcae", "930fdb3b-11a8-46fe-9bac-577332e2640e", "276cc624-87ea-4f08-ab93-f770e3790175", diff --git a/evaluation_examples/test_small.json b/evaluation_examples/test_small.json index aec99fc..7a072f5 100644 --- a/evaluation_examples/test_small.json +++ b/evaluation_examples/test_small.json @@ -37,48 +37,7 @@ "eb303e01-261e-4972-8c07-c9b4e7a4922a", "d1acdb87-bb67-4f30-84aa-990e56a09c92", "deec51c9-3b1e-4b9e-993c-4776f20e8bb2", - "8e116af7-7db7-4e35-a68b-b0939c066c78", - "185f29bd-5da0-40a6-b69c-ba7f4e0324ef", - "2c1ebcd7-9c6d-4c9a-afad-900e381ecd5e", - "3a93cae4-ad3e-403e-8c12-65303b271818", - "1f18aa87-af6f-41ef-9853-cdb8f32ebdea", - "26150609-0da3-4a7d-8868-0faf9c5f01bb", - "7e287123-70ca-47b9-8521-47db09b69b14", - "e2392362-125e-4f76-a2ee-524b183a3412", - "26660ad1-6ebb-4f59-8cba-a8432dfe8d38", - "a82b78bb-7fde-4cb3-94a4-035baf10bcf0", - "36037439-2044-4b50-b9d1-875b5a332143", - "716a6079-22da-47f1-ba73-c9d58f986a38", - "a74b607e-6bb5-4ea8-8a7c-5d97c7bbcd2a", - "6f4073b8-d8ea-4ade-8a18-c5d1d5d5aa9a", - "da922383-bfa4-4cd3-bbad-6bebab3d7742", - "2373b66a-092d-44cb-bfd7-82e86e7a3b4d", - "81c425f5-78f3-4771-afd6-3d2973825947", - "227d2f97-562b-4ccb-ae47-a5ec9e142fbb", - "20236825-b5df-46e7-89bf-62e1d640a897", - "02ce9a50-7af2-47ed-8596-af0c230501f8", - "4c26e3f3-3a14-4d86-b44a-d3cedebbb487", - "09a37c51-e625-49f4-a514-20a773797a8a", - "3e3fc409-bff3-4905-bf16-c968eee3f807", - "415ef462-bed3-493a-ac36-ca8c6d23bf1b", - "9f3bb592-209d-43bc-bb47-d77d9df56504", - "dd60633f-2c72-42ba-8547-6f2c8cb0fdb0", - "3f05f3b9-29ba-4b6b-95aa-2204697ffc06", - "f8369178-fafe-40c2-adc4-b9b08a125456", - "778efd0a-153f-4842-9214-f05fc176b877", - "47f7c0ce-a5fb-4100-a5e6-65cd0e7429e5", - "c2751594-0cd5-4088-be1b-b5f2f9ec97c4", - "48c46dc7-fe04-4505-ade7-723cba1aa6f6", - "42d25c08-fb87-4927-8b65-93631280a26f", - "3c8f201a-009d-4bbe-8b65-a6f8b35bb57f", - "d68204bf-11c1-4b13-b48b-d303c73d4bf6", - "91190194-f406-4cd6-b3f9-c43fac942b22", - "7f35355e-02a6-45b5-b140-f0be698bcf85", - "98e8e339-5f91-4ed2-b2b2-12647cb134f4", - "df67aebb-fb3a-44fd-b75b-51b6012df509", - "5df7b33a-9f77-4101-823e-02f863e1c1ae", - "22a4636f-8179-4357-8e87-d1743ece1f81", - "236833a3-5704-47fc-888c-4f298f09f799" + "8e116af7-7db7-4e35-a68b-b0939c066c78" ], "os": [ "5ea617a3-0e86-4ba6-aab2-dac9aa2e8d57", diff --git a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py index 934d8fd..e2845f3 100644 --- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py +++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py @@ -80,9 +80,11 @@ def filter_nodes(root: ET, platform="ubuntu", check_image=False): return filtered_nodes -def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): +def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sampling_ratio=1.0): # Load the screenshot image image = Image.open(image_file_path) + if float(down_sampling_ratio) != 1.0: + image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio))) draw = ImageDraw.Draw(image) marks = [] drew_nodes = [] @@ -108,6 +110,11 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path): coords = tuple(map(int, coords_str.strip('()').split(', '))) size = tuple(map(int, size_str.strip('()').split(', '))) + if float(down_sampling_ratio) != 1.0: + # Downsample the coordinates and size + coords = tuple(int(coord * down_sampling_ratio) for coord in coords) + size = tuple(int(s * down_sampling_ratio) for s in size) + # Check for negative sizes if size[0] <= 0 or size[1] <= 0: raise ValueError(f"Size must be positive, got: {size}") diff --git a/mm_agents/agent.py b/mm_agents/agent.py index d7a5586..4600628 100644 --- a/mm_agents/agent.py +++ b/mm_agents/agent.py @@ -6,17 +6,14 @@ import re import time import uuid import xml.etree.ElementTree as ET -import numpy as np from http import HTTPStatus from io import BytesIO from typing import Dict, List, Tuple, Union - import backoff import dashscope import google.generativeai as genai import openai import requests -import cv2 from PIL import Image from google.api_core.exceptions import InvalidArgument @@ -28,14 +25,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S logger = logging.getLogger("desktopenv.agent") -def downsample_image(img: Union[str, np.ndarray], ratio: Tuple[float, float]): - fx, fy = ratio - if isinstance(img, str): - img = cv2.imread(img) - - resized = cv2.resize(img, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA) - return resized - # Function to encode the image def encode_image(image_path):