fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序
- 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST)
- 评估temperature从0.7降至0.2提升一致性
- 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹
- 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation)
- test_final.json新增avogadro(11题)和origin(8题)
This commit is contained in:
309
reeval.py
Normal file
309
reeval.py
Normal file
@@ -0,0 +1,309 @@
|
||||
"""
|
||||
Re-evaluation script for vllm_eval tasks.
|
||||
|
||||
Usage:
|
||||
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
|
||||
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import glob
|
||||
import base64
|
||||
import re
|
||||
import logging
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
|
||||
# Module-wide logging: level + message, shared by all helpers below.
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("reeval")

# Upper bound on screenshots sent in a single evaluation request.
MAX_IMAGES = 10  # hard API limit

# Defaults matching run_proxmox.sh
# SECURITY NOTE(review): a live-looking API key is committed here as a
# fallback — rotate it and rely on the OPENAI_API_KEY env var instead.
_DEFAULT_API_KEY = "sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17"
_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
_DEFAULT_MODEL = "gemini-3.1-pro-preview"
|
||||
|
||||
|
||||
# ── image helpers ──────────────────────────────────────────────────────────────
|
||||
|
||||
def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
    """Re-encode a base64 image as a size-bounded JPEG, best-effort.

    Transparency (RGBA/LA/P modes) is flattened onto a white background,
    and the longest edge is capped at ``max_size`` pixels. On any failure
    the original base64 string is returned unchanged.
    """
    try:
        img = Image.open(BytesIO(base64.b64decode(img_b64)))

        # JPEG has no alpha channel: composite transparent modes onto white.
        if img.mode in ("RGBA", "LA", "P"):
            canvas = Image.new("RGB", img.size, (255, 255, 255))
            if img.mode == "P":
                img = img.convert("RGBA")
            mask = img.split()[-1] if img.mode in ("RGBA", "LA") else None
            canvas.paste(img, mask=mask)
            img = canvas

        # Downscale proportionally so the longest edge fits max_size.
        longest = max(img.size)
        if longest > max_size:
            scale = max_size / longest
            img = img.resize(tuple(int(d * scale) for d in img.size), Image.Resampling.LANCZOS)

        out = BytesIO()
        img.save(out, format="JPEG", quality=quality, optimize=True)
        return base64.b64encode(out.getvalue()).decode()
    except Exception as e:
        logger.warning(f"Compression failed: {e}")
        return img_b64
|
||||
|
||||
|
||||
def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
|
||||
"""
|
||||
Load up to max_images screenshots, always keeping FIRST and LAST,
|
||||
sampling middle frames evenly. Returns (b64_list, name_list).
|
||||
"""
|
||||
pattern = os.path.join(result_dir, "step_*.png")
|
||||
files = sorted(glob.glob(pattern), key=lambda p: int(re.search(r"step_(\d+)", p).group(1)))
|
||||
|
||||
if not files:
|
||||
raise FileNotFoundError(f"No step_*.png found in {result_dir}")
|
||||
|
||||
n = len(files)
|
||||
logger.info(f"Found {n} screenshots in {result_dir}")
|
||||
|
||||
if n <= max_images:
|
||||
selected = files
|
||||
else:
|
||||
# Always keep first + last; fill rest with evenly-spaced middle frames
|
||||
middle_slots = max_images - 2
|
||||
step = (n - 2) / (middle_slots + 1)
|
||||
indices = [0] + [int(round(i * step)) for i in range(1, middle_slots + 1)] + [n - 1]
|
||||
indices = sorted(set(indices))
|
||||
selected = [files[i] for i in indices]
|
||||
|
||||
b64_list, name_list = [], []
|
||||
for fp in selected:
|
||||
with open(fp, "rb") as f:
|
||||
b64 = base64.b64encode(f.read()).decode()
|
||||
if compress:
|
||||
b64 = _compress_image(b64)
|
||||
b64_list.append(b64)
|
||||
m = re.match(r"(step_\d+)", os.path.basename(fp))
|
||||
name_list.append(m.group(1) if m else os.path.basename(fp))
|
||||
|
||||
logger.info(f"Selected {len(name_list)} frames: {name_list}")
|
||||
return b64_list, name_list
|
||||
|
||||
|
||||
# ── LLM call ──────────────────────────────────────────────────────────────────
|
||||
|
||||
def _call_llm(model: str, prompt: str, images_b64: list) -> str:
    """Send the evaluation prompt plus screenshots to the chosen model.

    Routes by model-name prefix: ``gpt*`` / ``gemini*`` go through the
    OpenAI-compatible proxy, ``claude*`` through the Anthropic SDK.

    Args:
        model: Model identifier; its prefix selects the client.
        prompt: Full evaluation prompt text.
        images_b64: Base64-encoded JPEG frames in chronological order.

    Returns:
        The raw text of the model's reply.

    Raises:
        ValueError: If the model name has an unrecognized prefix.
    """
    from dotenv import load_dotenv
    load_dotenv()

    api_key = os.getenv("OPENAI_API_KEY", _DEFAULT_API_KEY)
    base_url = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL)

    # gpt-* and gemini-* both go through the OpenAI-compatible proxy (vip.apiyi.com)
    if model.startswith(("gpt", "gemini")):
        from openai import OpenAI

        client = OpenAI(api_key=api_key, base_url=base_url)
        parts = [{"type": "text", "text": prompt}]
        parts.extend(
            {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}}
            for b64 in images_b64
        )
        reply = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.2,
            max_tokens=4096,
        )
        return reply.choices[0].message.content

    if model.startswith("claude"):
        from anthropic import Anthropic

        client = Anthropic(
            base_url=os.getenv("ANTHROPIC_BASE_URL"),
            api_key=os.getenv("ANTHROPIC_API_KEY"),
        )
        parts = [{"type": "text", "text": prompt}]
        parts.extend(
            {"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}}
            for b64 in images_b64
        )
        reply = client.messages.create(
            model=model,
            messages=[{"role": "user", "content": parts}],
            temperature=0.2,
            max_tokens=4096,
        )
        return reply.content[0].text

    raise ValueError(f"Unknown model prefix: {model}")
|
||||
|
||||
|
||||
# ── prompt builder ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
|
||||
# Pre-config section
|
||||
preconfig_lines = []
|
||||
for cfg in config:
|
||||
if cfg.get("type") == "launch":
|
||||
cmds = cfg.get("parameters", {}).get("command", [])
|
||||
if cmds:
|
||||
preconfig_lines.append(f" - '{os.path.basename(cmds[0])}' was auto-launched (NOT agent's work).")
|
||||
elif cfg.get("type") == "open":
|
||||
path = cfg.get("parameters", {}).get("path", "")
|
||||
preconfig_lines.append(f" - '{path}' was auto-opened (NOT agent's work).")
|
||||
preconfig_section = (
|
||||
"PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n" +
|
||||
"\n".join(preconfig_lines)
|
||||
) if preconfig_lines else ""
|
||||
|
||||
expected_steps = metadata.get("steps", "")
|
||||
expected_section = (
|
||||
f"EXPECTED STEPS (reference only):\n{expected_steps}"
|
||||
) if expected_steps else ""
|
||||
|
||||
final_name = name_list[-1]
|
||||
img_list_str = ", ".join(name_list)
|
||||
|
||||
prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
|
||||
|
||||
Task: {instruction}
|
||||
|
||||
{preconfig_section}
|
||||
|
||||
{expected_section}
|
||||
|
||||
You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}
|
||||
|
||||
════════════════════════════════════════════════════
|
||||
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
|
||||
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
|
||||
Look at this image carefully NOW before anything else. Ask yourself:
|
||||
"Does this final screenshot show the task is complete?"
|
||||
Only after answering that, use earlier screenshots to understand HOW the agent got there.
|
||||
════════════════════════════════════════════════════
|
||||
|
||||
SCORING RULES:
|
||||
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
|
||||
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
|
||||
3. Require VISIBLE evidence in a specific screenshot for each step.
|
||||
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.
|
||||
|
||||
SCORE GUIDE (0–10):
|
||||
- 0: No agent progress; only pre-configured state visible.
|
||||
- 1–3: Minor actions taken, far from goal.
|
||||
- 4–6: Meaningful progress, roughly half done.
|
||||
- 7–8: Most steps done, minor issues.
|
||||
- 9: Essentially complete, cosmetic differences only.
|
||||
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.
|
||||
|
||||
Respond with ONLY valid JSON (no extra text):
|
||||
|
||||
{{
|
||||
"final_screenshot": "{final_name}",
|
||||
"final_screenshot_description": "Describe exactly what you see in {final_name}",
|
||||
"task_complete_in_final": true/false,
|
||||
"steps_analysis": [
|
||||
{{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
|
||||
],
|
||||
"final_completion": "True/False",
|
||||
"score": 0-10
|
||||
}}"""
|
||||
return prompt
|
||||
|
||||
|
||||
# ── parse response ─────────────────────────────────────────────────────────────
|
||||
|
||||
def _parse_response(text: str) -> dict:
|
||||
text = text.strip()
|
||||
m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
|
||||
if m:
|
||||
text = m.group(1)
|
||||
else:
|
||||
text = re.sub(r'^```(?:json)?\s*', '', text)
|
||||
text = re.sub(r'\s*```$', '', text)
|
||||
try:
|
||||
return json.loads(text)
|
||||
except json.JSONDecodeError:
|
||||
# try extracting bare JSON object
|
||||
m2 = re.search(r'\{.*\}', text, re.DOTALL)
|
||||
if m2:
|
||||
try:
|
||||
return json.loads(m2.group(0))
|
||||
except Exception:
|
||||
pass
|
||||
logger.error(f"Could not parse JSON from:\n{text[:500]}")
|
||||
return {"score": 0, "final_completion": "False", "steps_analysis": []}
|
||||
|
||||
|
||||
# ── main ──────────────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """CLI entry point: re-evaluate a single task from saved screenshots.

    Loads the task definition, samples screenshots from the result
    directory, asks the evaluation model for a final-frame-anchored
    verdict, then saves and prints the parsed result.
    """
    parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
    parser.add_argument("--task", required=True, help="Path to task JSON file")
    parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
    parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
    parser.add_argument("--max_images", type=int, default=MAX_IMAGES, help="Max screenshots to send (default: 10)")
    parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
    parser.add_argument("--output", default=None, help="Output JSON path (default: <result_dir>/reeval_result.json)")
    args = parser.parse_args()

    # Load task
    with open(args.task, "r", encoding="utf-8") as f:
        task = json.load(f)

    instruction = task.get("instruction", "")
    config = task.get("config", [])
    metadata = task.get("metadata", {})

    logger.info(f"Task: {task.get('id', '?')}")
    logger.info(f"Instruction: {instruction}")

    # Load screenshots
    images_b64, name_list = _load_screenshots(
        args.result_dir,
        max_images=args.max_images,
        compress=not args.no_compress,
    )

    # Build prompt
    prompt = _build_prompt(instruction, config, metadata, name_list)

    # Call LLM
    logger.info(f"Calling {args.model} with {len(images_b64)} images...")
    raw = _call_llm(args.model, prompt, images_b64)
    logger.info(f"Raw response:\n{raw}")

    # Parse. Fix: the model may return a non-numeric score (None, "N/A",
    # "8/10"); treat anything unparsable as 0 instead of crashing after a
    # paid API call has already been made.
    result = _parse_response(raw)
    try:
        score_raw = float(result.get("score", 0) or 0)
    except (TypeError, ValueError):
        logger.warning(f"Non-numeric score in response: {result.get('score')!r}; treating as 0")
        score_raw = 0.0
    score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)

    # Add metadata to output
    result["_task_id"] = task.get("id", "")
    result["_model"] = args.model
    result["_frames_used"] = name_list
    result["_score_normalized"] = score_norm

    # Save
    out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Print summary
    print("\n" + "="*60)
    print(f"Task: {task.get('id', '?')}")
    print(f"Frames used: {name_list}")
    print(f"Final screenshot: {name_list[-1]}")
    if "final_screenshot_description" in result:
        print(f"Final state desc: {result['final_screenshot_description']}")
    print(f"Task complete: {result.get('task_complete_in_final', '?')}")
    print(f"final_completion: {result.get('final_completion', '?')}")
    print(f"Score (0-10): {score_raw} → normalized: {score_norm}")
    print(f"Result saved to: {out_path}")
    print("="*60 + "\n")

    if result.get("steps_analysis"):
        print("Steps analysis:")
        for s in result["steps_analysis"]:
            status_icon = "✓" if s.get("status") == "Success" else "✗"
            print(f"  [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user