Files
sci-gui-agent-benchmark/reeval.py
lizhanyuan 252d2f79ce fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序
- 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST)
- 评估temperature从0.7降至0.2提升一致性
- 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹
- 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation)
- test_final.json新增avogadro(11题)和origin(8题)
2026-03-27 14:34:32 +08:00

310 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
Re-evaluation script for vllm_eval tasks.
Usage:
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10
"""
import argparse
import json
import os
import sys
import glob
import base64
import re
import logging
from io import BytesIO
from PIL import Image
# Module-level logging: INFO to stderr with a minimal "LEVEL message" format.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("reeval")
# Maximum screenshots sent per evaluation request.
MAX_IMAGES = 10 # hard API limit
# Defaults matching run_proxmox.sh
# SECURITY NOTE(review): a live-looking API key is committed to source control.
# Rotate this credential and rely on the OPENAI_API_KEY env var / .env instead.
_DEFAULT_API_KEY = "sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17"
_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
_DEFAULT_MODEL = "gemini-3.1-pro-preview"
# ── image helpers ──────────────────────────────────────────────────────────────
def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
    """Re-encode a base64 image as a base64 JPEG.

    Transparent modes are flattened onto a white background (JPEG has no
    alpha channel) and the longest edge is downscaled to at most
    ``max_size`` pixels. On any failure the input is returned unchanged.
    """
    try:
        img = Image.open(BytesIO(base64.b64decode(img_b64)))
        if img.mode in ("RGBA", "LA", "P"):
            # Palette images may carry transparency; promote to RGBA first.
            if img.mode == "P":
                img = img.convert("RGBA")
            canvas = Image.new("RGB", img.size, (255, 255, 255))
            canvas.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None)
            img = canvas
        longest = max(img.size)
        if longest > max_size:
            scale = max_size / longest
            img = img.resize(tuple(int(d * scale) for d in img.size), Image.Resampling.LANCZOS)
        out = BytesIO()
        img.save(out, format="JPEG", quality=quality, optimize=True)
        return base64.b64encode(out.getvalue()).decode()
    except Exception as e:
        logger.warning(f"Compression failed: {e}")
        return img_b64
def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
    """Collect up to ``max_images`` ``step_*.png`` frames from ``result_dir``.

    Frames are sorted numerically (string sort would place step_9 after
    step_10), the FIRST and LAST frames are always retained, and interior
    frames are sampled evenly. Returns ``(b64_list, name_list)``.

    Raises:
        FileNotFoundError: if no ``step_*.png`` files exist in the directory.
    """
    candidates = glob.glob(os.path.join(result_dir, "step_*.png"))
    candidates.sort(key=lambda p: int(re.search(r"step_(\d+)", p).group(1)))
    if not candidates:
        raise FileNotFoundError(f"No step_*.png found in {result_dir}")
    total = len(candidates)
    logger.info(f"Found {total} screenshots in {result_dir}")
    if total <= max_images:
        chosen = candidates
    else:
        # First + last are mandatory; the remaining slots are spread evenly
        # across the interior frames.
        slots = max_images - 2
        stride = (total - 2) / (slots + 1)
        idxs = sorted({0, total - 1, *(int(round(k * stride)) for k in range(1, slots + 1))})
        chosen = [candidates[i] for i in idxs]
    b64_list, name_list = [], []
    for path in chosen:
        with open(path, "rb") as fh:
            encoded = base64.b64encode(fh.read()).decode()
        b64_list.append(_compress_image(encoded) if compress else encoded)
        match = re.match(r"(step_\d+)", os.path.basename(path))
        name_list.append(match.group(1) if match else os.path.basename(path))
    logger.info(f"Selected {len(name_list)} frames: {name_list}")
    return b64_list, name_list
# ── LLM call ──────────────────────────────────────────────────────────────────
def _call_llm(model: str, prompt: str, images_b64: list) -> str:
    """Send the prompt plus screenshots to the eval model; return raw reply text.

    ``gpt-*`` and ``gemini-*`` models go through the OpenAI-compatible proxy;
    ``claude-*`` uses the Anthropic SDK.

    Raises:
        ValueError: if the model name has an unrecognized prefix.
    """
    from dotenv import load_dotenv
    load_dotenv()
    api_key = os.getenv("OPENAI_API_KEY", _DEFAULT_API_KEY)
    base_url = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL)
    if model.startswith(("gpt", "gemini")):
        # Both families are served via the OpenAI-compatible proxy (vip.apiyi.com).
        from openai import OpenAI
        client = OpenAI(api_key=api_key, base_url=base_url)
        parts = [{"type": "text", "text": prompt}]
        parts.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            for b64 in images_b64
        )
        completion = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.2,
            max_tokens=4096,
        )
        return completion.choices[0].message.content
    if model.startswith("claude"):
        from anthropic import Anthropic
        client = Anthropic(
            base_url=os.getenv("ANTHROPIC_BASE_URL"),
            api_key=os.getenv("ANTHROPIC_API_KEY"),
        )
        parts = [{"type": "text", "text": prompt}]
        parts.extend(
            {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}}
            for b64 in images_b64
        )
        reply = client.messages.create(
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.2,
            max_tokens=4096,
        )
        return reply.content[0].text
    raise ValueError(f"Unknown model prefix: {model}")
# ── prompt builder ─────────────────────────────────────────────────────────────
def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
# Pre-config section
preconfig_lines = []
for cfg in config:
if cfg.get("type") == "launch":
cmds = cfg.get("parameters", {}).get("command", [])
if cmds:
preconfig_lines.append(f" - '{os.path.basename(cmds[0])}' was auto-launched (NOT agent's work).")
elif cfg.get("type") == "open":
path = cfg.get("parameters", {}).get("path", "")
preconfig_lines.append(f" - '{path}' was auto-opened (NOT agent's work).")
preconfig_section = (
"PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n" +
"\n".join(preconfig_lines)
) if preconfig_lines else ""
expected_steps = metadata.get("steps", "")
expected_section = (
f"EXPECTED STEPS (reference only):\n{expected_steps}"
) if expected_steps else ""
final_name = name_list[-1]
img_list_str = ", ".join(name_list)
prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
Task: {instruction}
{preconfig_section}
{expected_section}
You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}
════════════════════════════════════════════════════
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
Look at this image carefully NOW before anything else. Ask yourself:
"Does this final screenshot show the task is complete?"
Only after answering that, use earlier screenshots to understand HOW the agent got there.
════════════════════════════════════════════════════
SCORING RULES:
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
3. Require VISIBLE evidence in a specific screenshot for each step.
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.
SCORE GUIDE (010):
- 0: No agent progress; only pre-configured state visible.
- 13: Minor actions taken, far from goal.
- 46: Meaningful progress, roughly half done.
- 78: Most steps done, minor issues.
- 9: Essentially complete, cosmetic differences only.
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.
Respond with ONLY valid JSON (no extra text):
{{
"final_screenshot": "{final_name}",
"final_screenshot_description": "Describe exactly what you see in {final_name}",
"task_complete_in_final": true/false,
"steps_analysis": [
{{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
],
"final_completion": "True/False",
"score": 0-10
}}"""
return prompt
# ── parse response ─────────────────────────────────────────────────────────────
def _parse_response(text: str) -> dict:
text = text.strip()
m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
if m:
text = m.group(1)
else:
text = re.sub(r'^```(?:json)?\s*', '', text)
text = re.sub(r'\s*```$', '', text)
try:
return json.loads(text)
except json.JSONDecodeError:
# try extracting bare JSON object
m2 = re.search(r'\{.*\}', text, re.DOTALL)
if m2:
try:
return json.loads(m2.group(0))
except Exception:
pass
logger.error(f"Could not parse JSON from:\n{text[:500]}")
return {"score": 0, "final_completion": "False", "steps_analysis": []}
# ── main ──────────────────────────────────────────────────────────────────────
def main():
    """CLI entry point: load a task definition, sample its screenshots,
    query the eval model for a verdict, then save and print the result."""
    parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
    parser.add_argument("--task", required=True, help="Path to task JSON file")
    parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
    parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
    parser.add_argument("--max_images", type=int, default=MAX_IMAGES, help="Max screenshots to send (default: 10)")
    parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
    parser.add_argument("--output", default=None, help="Output JSON path (default: <result_dir>/reeval_result.json)")
    args = parser.parse_args()
    # Load task definition (instruction + pre-config + reference metadata).
    with open(args.task, "r", encoding="utf-8") as f:
        task = json.load(f)
    instruction = task.get("instruction", "")
    config = task.get("config", [])
    metadata = task.get("metadata", {})
    logger.info(f"Task: {task.get('id', '?')}")
    logger.info(f"Instruction: {instruction}")
    # Load screenshots (first/last always kept, middle frames sampled).
    images_b64, name_list = _load_screenshots(
        args.result_dir,
        max_images=args.max_images,
        compress=not args.no_compress,
    )
    # Build prompt and query the eval model.
    prompt = _build_prompt(instruction, config, metadata, name_list)
    logger.info(f"Calling {args.model} with {len(images_b64)} images...")
    raw = _call_llm(args.model, prompt, images_b64)
    logger.info(f"Raw response:\n{raw}")
    # Parse verdict; clamp score into [0, 10] then normalize to [0, 1].
    result = _parse_response(raw)
    score_raw = float(result.get("score", 0))
    score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)
    # Attach run metadata (underscore-prefixed to avoid clashing with model keys).
    result["_task_id"] = task.get("id", "")
    result["_model"] = args.model
    result["_frames_used"] = name_list
    result["_score_normalized"] = score_norm
    # Persist result next to the screenshots unless --output overrides.
    out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)
    # Human-readable summary.
    print("\n" + "="*60)
    print(f"Task: {task.get('id', '?')}")
    print(f"Frames used: {name_list}")
    print(f"Final screenshot: {name_list[-1]}")
    if "final_screenshot_description" in result:
        print(f"Final state desc: {result['final_screenshot_description']}")
    print(f"Task complete: {result.get('task_complete_in_final', '?')}")
    print(f"final_completion: {result.get('final_completion', '?')}")
    print(f"Score (0-10): {score_raw} → normalized: {score_norm}")
    print(f"Result saved to: {out_path}")
    print("="*60 + "\n")
    if result.get("steps_analysis"):
        print("Steps analysis:")
        for s in result["steps_analysis"]:
            # FIX: both icon branches were empty strings (characters lost in a
            # Unicode mangle), making Success and Fail visually identical.
            status_icon = "✓" if s.get("status") == "Success" else "✗"
            print(f"  [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")


if __name__ == "__main__":
    main()