def downsample_image(img: Union[str, np.ndarray], ratio: Tuple[float, float]) -> np.ndarray:
    """Resize an image by independent horizontal and vertical scale factors.

    Args:
        img: Either a path to an image file (loaded with ``cv2.imread``) or
            an already-decoded image array.
        ratio: ``(fx, fy)`` scale factors forwarded to ``cv2.resize`` for the
            horizontal and vertical axes respectively.

    Returns:
        The resized image produced by ``cv2.resize``.

    Raises:
        OSError: If ``img`` is a path and ``cv2.imread`` could not load it.
    """
    fx, fy = ratio

    if isinstance(img, str):
        image = cv2.imread(img)
        # cv2.imread reports an unreadable file by returning None instead of
        # raising; fail here with the path rather than inside cv2.resize.
        if image is None:
            raise OSError(f"Could not read image from path: {img!r}")
    else:
        image = img

    # dsize=None makes cv2.resize derive the output size from fx/fy.
    # INTER_AREA is the interpolation OpenCV's docs suggest for shrinking.
    return cv2.resize(image, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA)