fix(vllm_eval): add image compression to prevent 413 error with large max_steps

Author: cui0711
Date:   2026-02-09 14:24:59 +08:00
Parent: 9bc54c0a66
Commit: 3890ee5fc3


@@ -4,11 +4,67 @@ from dotenv import load_dotenv
 import logging
 import base64
 import glob
+from io import BytesIO
+from PIL import Image
 
 logger = logging.getLogger("desktopenv.vllm_eval")
 load_dotenv()
 
+
+def _compress_image(img_b64: str, max_size: int = 800, quality: int = 85) -> str:
+    """
+    Compress base64 encoded image to reduce size
+
+    Args:
+        img_b64: Base64 encoded image string
+        max_size: Maximum dimension (width or height) in pixels
+        quality: JPEG quality (1-100), lower means smaller file size
+
+    Returns:
+        Compressed base64 encoded image string
+    """
+    try:
+        # Decode base64 to image
+        img_data = base64.b64decode(img_b64)
+        img = Image.open(BytesIO(img_data))
+
+        # Convert to RGB if necessary (for PNG with transparency)
+        if img.mode in ('RGBA', 'LA', 'P'):
+            background = Image.new('RGB', img.size, (255, 255, 255))
+            if img.mode == 'P':
+                img = img.convert('RGBA')
+            background.paste(img, mask=img.split()[-1] if img.mode in ('RGBA', 'LA') else None)
+            img = background
+
+        # Resize if image is too large
+        original_size = img.size
+        if max(img.size) > max_size:
+            ratio = max_size / max(img.size)
+            new_size = tuple(int(dim * ratio) for dim in img.size)
+            img = img.resize(new_size, Image.Resampling.LANCZOS)
+            logger.info(f"Resized image from {original_size} to {new_size}")
+
+        # Compress to JPEG
+        buffer = BytesIO()
+        img.save(buffer, format='JPEG', quality=quality, optimize=True)
+        compressed_data = buffer.getvalue()
+
+        # Encode back to base64
+        compressed_b64 = base64.b64encode(compressed_data).decode('utf-8')
+
+        # Log compression ratio
+        original_size_kb = len(img_b64) * 3 / 4 / 1024  # base64 to bytes to KB
+        compressed_size_kb = len(compressed_b64) * 3 / 4 / 1024
+        compression_ratio = (1 - compressed_size_kb / original_size_kb) * 100
+        logger.info(f"Compressed image: {original_size_kb:.1f}KB -> {compressed_size_kb:.1f}KB ({compression_ratio:.1f}% reduction)")
+
+        return compressed_b64
+    except Exception as e:
+        logger.warning(f"Failed to compress image: {e}, using original")
+        return img_b64
+
+
 class UnifiedLLM:
     def __init__(self, model: str):
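Why this addresses the 413: a full-resolution PNG screenshot is typically hundreds of KB to a few MB, base64 inflates it by roughly 4/3, and a run with a large max_steps multiplies that by the number of screenshots, so the JSON body can exceed the server's request-size limit. A self-contained sketch of the same resize-plus-JPEG re-encode that _compress_image performs (a synthetic noise image stands in for a real screenshot; exact sizes will vary):

    import base64
    from io import BytesIO
    from PIL import Image

    # Synthetic 1920x1080 "screenshot" (noise compresses poorly, so this is a
    # pessimistic stand-in; real UI screenshots usually shrink even further)
    img = Image.effect_noise((1920, 1080), 64).convert("RGB")

    png_buf = BytesIO()
    img.save(png_buf, format="PNG")
    png_b64 = base64.b64encode(png_buf.getvalue()).decode("utf-8")

    # Same transform as the commit: longest side capped at 800 px, JPEG q=85
    ratio = 800 / max(img.size)
    small = img.resize(tuple(int(d * ratio) for d in img.size), Image.Resampling.LANCZOS)
    jpg_buf = BytesIO()
    small.save(jpg_buf, format="JPEG", quality=85, optimize=True)
    jpg_b64 = base64.b64encode(jpg_buf.getvalue()).decode("utf-8")

    # len(b64) * 3/4 estimates decoded bytes -- the same arithmetic the new
    # logging uses to report the compression ratio
    print(f"PNG payload:  {len(png_b64) * 3 / 4 / 1024:.0f} KB")
    print(f"JPEG payload: {len(jpg_b64) * 3 / 4 / 1024:.0f} KB")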
@@ -131,276 +187,115 @@ class UnifiedLLM:
         self,
         prompt: str,
         images_b64: List[str],
-        batch_size: int = 3,
         temperature: float = 0.7,
         max_tokens: int = 16384,
         top_p: float = 1.0,
         **kwargs
     ) -> str:
         """
-        Generate with multiple images by batching
+        Generate with multiple images in a single request
 
         Args:
-            prompt: Base instruction prompt
+            prompt: Instruction prompt
             images_b64: List of base64 encoded images
-            batch_size: Number of images per batch
             temperature: Temperature (0.0-2.0)
             max_tokens: Maximum number of tokens
             top_p: Top-p sampling (0.0-1.0)
 
         Returns:
-            Final generated text
+            Generated text
         """
         if not images_b64:
             logger.warning("No images provided, falling back to text-only generation")
             return self.generate(prompt, temperature, max_tokens, top_p, **kwargs)
 
         params = self._get_supported_params(temperature, max_tokens, top_p)
-        total_batches = (len(images_b64) + batch_size - 1) // batch_size
 
         if self.provider == "openai":
-            return self._generate_with_images_openai(
-                prompt, images_b64, batch_size, total_batches, params
-            )
-        elif self.provider == "anthropic":
-            return self._generate_with_images_anthropic(
-                prompt, images_b64, batch_size, total_batches, params
-            )
-        elif self.provider == "gemini":
-            return self._generate_with_images_gemini(
-                prompt, images_b64, batch_size, total_batches, params
-            )
-        else:
-            raise ValueError(f"Unsupported provider: {self.provider}")
-
-    def _generate_with_images_openai(
-        self,
-        prompt: str,
-        images_b64: List[str],
-        batch_size: int,
-        total_batches: int,
-        params: Dict[str, Any]
-    ) -> str:
-        """OpenAI implementation for batched image generation"""
-        messages = []
-
-        for batch_idx in range(total_batches):
-            start_idx = batch_idx * batch_size
-            end_idx = min(start_idx + batch_size, len(images_b64))
-            batch_images = images_b64[start_idx:end_idx]
-
-            # Build content for this batch
-            content = []
-            if batch_idx == 0:
-                # First batch: include the main instruction
-                content.append({
-                    "type": "text",
-                    "text": f"""{prompt}
-
-I will send you images in {total_batches} batch(es). Please acknowledge each batch but DO NOT provide your final evaluation until I explicitly say "ALL IMAGES SENT. Please provide your evaluation now."
-
-This is batch {batch_idx + 1}/{total_batches}."""
-                })
-            else:
-                content.append({
-                    "type": "text",
-                    "text": f"This is batch {batch_idx + 1}/{total_batches}. Please acknowledge receipt."
-                })
-
-            # Add images
-            for img_b64 in batch_images:
-                content.append({
-                    "type": "image_url",
-                    "image_url": {
-                        "url": f"data:image/png;base64,{img_b64}"
-                    }
-                })
-
-            messages.append({"role": "user", "content": content})
-
-            # Get acknowledgment (except for last batch)
-            if batch_idx < total_batches - 1:
-                try:
-                    response = self.client.chat.completions.create(
-                        model=self.model,
-                        messages=messages,
-                        **params
-                    )
-                    assistant_msg = response.choices[0].message.content
-                    messages.append({"role": "assistant", "content": assistant_msg})
-                    logger.info(f"Batch {batch_idx + 1}/{total_batches} acknowledged")
-                except Exception as e:
-                    logger.error(f"Error sending batch {batch_idx + 1}: {e}")
-                    raise e
-
-        # Send final prompt
-        messages.append({
-            "role": "user",
-            "content": "ALL IMAGES SENT. Please provide your evaluation now."
-        })
-
-        try:
-            response = self.client.chat.completions.create(
-                model=self.model,
-                messages=messages,
-                **params
-            )
-            return response.choices[0].message.content
-        except Exception as e:
-            logger.error(f"Error getting final evaluation: {e}")
-            raise e
-
-    def _generate_with_images_anthropic(
-        self,
-        prompt: str,
-        images_b64: List[str],
-        batch_size: int,
-        total_batches: int,
-        params: Dict[str, Any]
-    ) -> str:
-        """Anthropic implementation for batched image generation"""
-        messages = []
-
-        for batch_idx in range(total_batches):
-            start_idx = batch_idx * batch_size
-            end_idx = min(start_idx + batch_size, len(images_b64))
-            batch_images = images_b64[start_idx:end_idx]
-
-            # Build content for this batch
-            content = []
-            if batch_idx == 0:
-                content.append({
-                    "type": "text",
-                    "text": f"""{prompt}
-
-I will send you images in {total_batches} batch(es). Please acknowledge each batch but DO NOT provide your final evaluation until I explicitly say "ALL IMAGES SENT. Please provide your evaluation now."
-
-This is batch {batch_idx + 1}/{total_batches}."""
-                })
-            else:
-                content.append({
-                    "type": "text",
-                    "text": f"This is batch {batch_idx + 1}/{total_batches}. Please acknowledge receipt."
-                })
-
-            # Add images
-            for img_b64 in batch_images:
-                content.append({
-                    "type": "image",
-                    "source": {
-                        "type": "base64",
-                        "media_type": "image/png",
-                        "data": img_b64
-                    }
-                })
-
-            messages.append({"role": "user", "content": content})
-
-            # Get acknowledgment (except for last batch)
-            if batch_idx < total_batches - 1:
-                try:
-                    response = self.client.messages.create(
-                        model=self.model,
-                        messages=messages,
-                        **params
-                    )
-                    assistant_msg = response.content[0].text
-                    messages.append({"role": "assistant", "content": assistant_msg})
-                    logger.info(f"Batch {batch_idx + 1}/{total_batches} acknowledged")
-                except Exception as e:
-                    logger.error(f"Error sending batch {batch_idx + 1}: {e}")
-                    raise e
-
-        # Send final prompt
-        messages.append({
-            "role": "user",
-            "content": "ALL IMAGES SENT. Please provide your evaluation now."
-        })
-
-        try:
-            response = self.client.messages.create(
-                model=self.model,
-                messages=messages,
-                **params
-            )
-            return response.content[0].text
-        except Exception as e:
-            logger.error(f"Error getting final evaluation: {e}")
-            raise e
-
-    def _generate_with_images_gemini(
-        self,
-        prompt: str,
-        images_b64: List[str],
-        batch_size: int,
-        total_batches: int,
-        params: Dict[str, Any]
-    ) -> str:
-        """Gemini implementation for batched image generation"""
-        import google.generativeai as genai
-        from PIL import Image
-        import io
-
-        config = genai.GenerationConfig(
-            temperature=params["temperature"],
-            max_output_tokens=params["max_tokens"],
-            top_p=params.get("top_p", 1.0)
-        )
-
-        chat = self.client.start_chat()
-
-        for batch_idx in range(total_batches):
-            start_idx = batch_idx * batch_size
-            end_idx = min(start_idx + batch_size, len(images_b64))
-            batch_images = images_b64[start_idx:end_idx]
-
-            # Build content for this batch
-            content_parts = []
-            if batch_idx == 0:
-                content_parts.append(f"""{prompt}
-
-I will send you images in {total_batches} batch(es). Please acknowledge each batch but DO NOT provide your final evaluation until I explicitly say "ALL IMAGES SENT. Please provide your evaluation now."
-
-This is batch {batch_idx + 1}/{total_batches}.""")
-            else:
-                content_parts.append(f"This is batch {batch_idx + 1}/{total_batches}. Please acknowledge receipt.")
-
-            # Add images
-            for img_b64 in batch_images:
-                img_data = base64.b64decode(img_b64)
-                img = Image.open(io.BytesIO(img_data))
-                content_parts.append(img)
-
-            # Get acknowledgment (except for last batch)
-            if batch_idx < total_batches - 1:
-                try:
-                    response = chat.send_message(content_parts, generation_config=config)
-                    logger.info(f"Batch {batch_idx + 1}/{total_batches} acknowledged")
-                except Exception as e:
-                    logger.error(f"Error sending batch {batch_idx + 1}: {e}")
-                    raise e
-
-        # Send final prompt
-        try:
-            response = chat.send_message(
-                "ALL IMAGES SENT. Please provide your evaluation now.",
-                generation_config=config
-            )
-            return response.text
-        except Exception as e:
-            logger.error(f"Error getting final evaluation: {e}")
-            raise e
+            # Build content with text and all images
+            content = [{"type": "text", "text": prompt}]
+            for img_b64 in images_b64:
+                content.append({
+                    "type": "image_url",
+                    "image_url": {
+                        "url": f"data:image/jpeg;base64,{img_b64}"
+                    }
+                })
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=[{"role": "user", "content": content}],
+                    **params
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                logger.error(f"OpenAI API error: {e}")
+                raise e
+
+        elif self.provider == "anthropic":
+            # Build content with text and all images
+            content = [{"type": "text", "text": prompt}]
+            for img_b64 in images_b64:
+                content.append({
+                    "type": "image",
+                    "source": {
+                        "type": "base64",
+                        "media_type": "image/jpeg",
+                        "data": img_b64
+                    }
+                })
+            try:
+                response = self.client.messages.create(
+                    model=self.model,
+                    messages=[{"role": "user", "content": content}],
+                    **params
+                )
+                return response.content[0].text
+            except Exception as e:
+                logger.error(f"Anthropic API error: {e}")
+                raise e
+
+        elif self.provider == "gemini":
+            import google.generativeai as genai
+
+            config = genai.GenerationConfig(
+                temperature=params["temperature"],
+                max_output_tokens=params["max_tokens"],
+                top_p=params.get("top_p", 1.0)
+            )
+
+            # Build content parts
+            content_parts = [prompt]
+            for img_b64 in images_b64:
+                img_data = base64.b64decode(img_b64)
+                img = Image.open(BytesIO(img_data))
+                content_parts.append(img)
+
+            try:
+                response = self.client.generate_content(content_parts, generation_config=config)
+                return response.text
+            except Exception as e:
+                logger.error(f"Gemini API error: {e}")
+                raise e
+
+        else:
+            raise ValueError(f"Unsupported provider: {self.provider}")
 
-def _load_screenshots_from_dir(result_dir: str) -> List[str]:
+def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> List[str]:
     """
     Load all step screenshots from result directory and convert to base64
 
     Args:
         result_dir: Path to result directory containing step_*.png files
+        compress: Whether to compress images (default: True)
+        max_size: Maximum dimension for compression (default: 800)
+        quality: JPEG quality for compression (default: 85)
 
     Returns:
         List of base64 encoded screenshot strings
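Worth noting about the removed code: batching never reduced the peak request size. Each acknowledgment round re-sent the entire accumulated messages list, so the final "ALL IMAGES SENT" call still carried every image, plus the chat scaffolding; compressing the images and sending one request does shrink the body. A rough pre-flight size check (hypothetical helper, not part of this commit; the per-image figures below are illustrative assumptions) makes the budget explicit:

    from typing import List

    def estimate_request_kb(prompt: str, images_b64: List[str]) -> float:
        """Lower-bound estimate of the JSON body size in KB: prompt text plus
        every base64 image string (JSON framing overhead ignored)."""
        return (len(prompt) + sum(len(b) for b in images_b64)) / 1024

    # Example: 50 steps at ~900 KB of base64 PNG each vs ~60 KB after compression
    # (both sizes are assumed, typical-order-of-magnitude values)
    raw = estimate_request_kb("Evaluate this trajectory.", ["x" * 900_000] * 50)
    compressed = estimate_request_kb("Evaluate this trajectory.", ["x" * 60_000] * 50)
    print(f"raw: {raw:,.0f} KB, compressed: {compressed:,.0f} KB")  # ~44,000 vs ~3,000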
@@ -420,6 +315,11 @@ def _load_screenshots_from_dir(result_dir: str) -> List[str]:
             with open(filepath, "rb") as f:
                 img_data = f.read()
                 img_b64 = base64.b64encode(img_data).decode('utf-8')
+
+                # Compress if enabled
+                if compress:
+                    img_b64 = _compress_image(img_b64, max_size=max_size, quality=quality)
+
                 screenshots.append(img_b64)
         except Exception as e:
             logger.error(f"Error loading screenshot {filepath}: {e}")
@@ -436,10 +336,12 @@ def vllm_eval(result_state, **options) -> float:
         result_state: Current state description
         **options: Additional options including:
             - result_dir: Path to result directory containing step screenshots (recommended)
             - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
             - instruction: Task instruction
             - eval_model: Model name to use
-            - batch_size: Number of images per batch (default: 3)
+            - compress_images: Whether to compress images (default: True)
+            - max_image_size: Maximum image dimension for compression (default: 800)
+            - image_quality: JPEG quality for compression (default: 85)
             - temperature: Temperature parameter
             - max_tokens: Maximum tokens
             - top_p: Top-p parameter
@@ -451,15 +353,28 @@ def vllm_eval(result_state, **options) -> float:
     result_dir = options.get("result_dir", None)
     screenshots = options.get("screenshots", [])
 
+    # Image compression options
+    compress_images = options.get("compress_images", True)
+    max_image_size = options.get("max_image_size", 800)
+    image_quality = options.get("image_quality", 85)
+
     if result_dir and not screenshots:
-        screenshots = _load_screenshots_from_dir(result_dir)
+        screenshots = _load_screenshots_from_dir(
+            result_dir,
+            compress=compress_images,
+            max_size=max_image_size,
+            quality=image_quality
+        )
         logger.info(f"Loaded {len(screenshots)} screenshots from result_dir: {result_dir}")
     elif screenshots:
         logger.info(f"Using {len(screenshots)} screenshots from options")
+        # Compress screenshots if needed
+        if compress_images:
+            logger.info("Compressing provided screenshots...")
+            screenshots = [_compress_image(img, max_size=max_image_size, quality=image_quality) for img in screenshots]
 
     instruction = options.get("instruction", "")
     eval_model = options.get("eval_model", "gpt-4-vision-preview")
-    batch_size = options.get("batch_size", 3)
 
     params = {
         "temperature": options.get("temperature", 0.7),
@@ -500,7 +415,6 @@ Remember: Return ONLY the JSON object, no additional text."""
 
     result = llm.generate_with_images(
         prompt=prompt,
         images_b64=screenshots,
-        batch_size=batch_size,
         **params
     )
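For callers, the new knobs ride through **options. A minimal usage sketch (the path, instruction, and model name are placeholders, not values from this commit):

    score = vllm_eval(
        result_state="",                       # current state description
        result_dir="./results/example_task",   # placeholder; holds step_*.png files
        instruction="Open the settings panel and enable dark mode",
        eval_model="gpt-4o",                   # placeholder model name
        compress_images=True,                  # default; False sends original PNGs
        max_image_size=800,                    # longest side after resize, in px
        image_quality=85,                      # JPEG quality, 1-100
    )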