Fix one multi_app example; remove some broken examples; support downsampling

2024-03-21 22:05:16 +08:00
parent 7ca91ca8c9
commit 3ce7636abd
5 changed files with 11 additions and 62 deletions
--- a/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
+++ b/mm_agents/accessibility_tree_wrap/heuristic_retrieve.py
@@ -80,9 +80,11 @@ def filter_nodes(root: ET, platform="ubuntu", check_image=False):
    return filtered_nodes


-def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
+def draw_bounding_boxes(nodes, image_file_path, output_image_file_path, down_sampling_ratio=1.0):
    # Load the screenshot image
    image = Image.open(image_file_path)
+    if float(down_sampling_ratio) != 1.0:
+        image = image.resize((int(image.size[0] * down_sampling_ratio), int(image.size[1] * down_sampling_ratio)))
    draw = ImageDraw.Draw(image)
    marks = []
    drew_nodes = []
@@ -108,6 +110,11 @@ def draw_bounding_boxes(nodes, image_file_path, output_image_file_path):
                coords = tuple(map(int, coords_str.strip('()').split(', ')))
                size = tuple(map(int, size_str.strip('()').split(', ')))

+                if float(down_sampling_ratio) != 1.0:
+                    # Downsample the coordinates and size
+                    coords = tuple(int(coord * down_sampling_ratio) for coord in coords)
+                    size = tuple(int(s * down_sampling_ratio) for s in size)
+
                # Check for negative sizes
                if size[0] <= 0 or size[1] <= 0:
                    raise ValueError(f"Size must be positive, got: {size}")
--- a/mm_agents/agent.py
+++ b/mm_agents/agent.py
@@ -6,17 +6,14 @@ import re
 import time
 import uuid
 import xml.etree.ElementTree as ET
-import numpy as np
 from http import HTTPStatus
 from io import BytesIO
 from typing import Dict, List, Tuple, Union
-
 import backoff
 import dashscope
 import google.generativeai as genai
 import openai
 import requests
-import cv2
 from PIL import Image
 from google.api_core.exceptions import InvalidArgument

@@ -28,14 +25,6 @@ from mm_agents.prompts import SYS_PROMPT_IN_SCREENSHOT_OUT_CODE, SYS_PROMPT_IN_S

 logger = logging.getLogger("desktopenv.agent")

-def downsample_image(img: Union[str, np.ndarray], ratio: Tuple[float, float]):
-    fx, fy = ratio
-    if isinstance(img, str):
-        img = cv2.imread(img)
-
-    resized = cv2.resize(img, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA)
-    return resized
-

 # Function to encode the image
 def encode_image(image_path):