fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑

- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序
- 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST)
- 评估temperature从0.7降至0.2提升一致性
- 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹
- 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation)
- test_final.json新增avogadro(11题)和origin(8题)
This commit is contained in:
2026-03-27 14:25:45 +08:00
parent 4e192cf013
commit 252d2f79ce
5 changed files with 434 additions and 94 deletions

87
batch_reeval.py Normal file
View File

@@ -0,0 +1,87 @@
"""
Batch re-evaluation based on test_final.json.
Usage:
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
python3 batch_reeval.py --results_dir <path> --force # re-run even if already done
"""
import argparse
import json
import os
import subprocess
import sys
import glob
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--results_dir", required=True,
help="e.g. /Volumes/.../screenshot/gpt-5.4")
parser.add_argument("--force", action="store_true",
help="Re-evaluate even if reeval_result.json already exists")
args = parser.parse_args()
with open(TEST_FINAL) as f:
test_final = json.load(f)
# Build task list
tasks = []
skipped = []
for sw, task_ids in test_final.items():
for task_id in task_ids:
result_dir = os.path.join(args.results_dir, sw, task_id)
task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
reeval_out = os.path.join(result_dir, "reeval_result.json")
if not os.path.isdir(result_dir):
skipped.append(f" NO result_dir: {sw}/{task_id}")
continue
if not os.path.exists(task_json):
skipped.append(f" NO task JSON: {sw}/{task_id}")
continue
if not glob.glob(os.path.join(result_dir, "step_*.png")):
skipped.append(f" NO screenshots: {sw}/{task_id}")
continue
if os.path.exists(reeval_out) and not args.force:
skipped.append(f" already done: {sw}/{task_id}")
continue
tasks.append((task_json, result_dir, sw, task_id))
print(f"Tasks to evaluate: {len(tasks)}")
if skipped:
print(f"Skipped ({len(skipped)}):")
for s in skipped:
print(s)
print()
ok, failed = 0, []
for i, (task_json, result_dir, sw, task_id) in enumerate(tasks):
print(f"[{i+1}/{len(tasks)}] {sw}/{task_id}")
r = subprocess.run(
[sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
capture_output=True, text=True
)
if r.returncode != 0:
print(f" ERROR: {r.stderr[-300:]}")
failed.append(f"{sw}/{task_id}")
else:
for line in r.stdout.splitlines():
if "normalized" in line or "Task complete" in line or "Score" in line:
print(f" {line.strip()}")
ok += 1
print(f"\nDone: {ok} succeeded, {len(failed)} failed")
if failed:
print("Failed tasks:", failed)
if __name__ == "__main__":
main()

View File

@@ -347,8 +347,13 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = False, max_size
filenames = []
# Find all step screenshot files (e.g., step_1_20240101@120000.png)
# Sort numerically by step number to avoid lexicographic issues (step_10 < step_2 in string sort)
import re as _re_sort
pattern = os.path.join(result_dir, "step_*.png")
screenshot_files = sorted(glob.glob(pattern))
screenshot_files = sorted(
glob.glob(pattern),
key=lambda p: int(_re_sort.search(r"step_(\d+)", os.path.basename(p)).group(1))
)
if not screenshot_files:
logger.warning(f"No screenshot files found in {result_dir}")
@@ -446,7 +451,7 @@ def vllm_eval(result_state, **options) -> float:
metadata = options.get("metadata", {})
params = {
"temperature": options.get("temperature", 0.7),
"temperature": options.get("temperature", 0.2),
"max_tokens": options.get("max_tokens", 16384),
"top_p": options.get("top_p", 1.0)
}
@@ -493,51 +498,49 @@ IMPORTANT: Only reference screenshots from the list above. Do NOT reference any
else:
img_info = "\nNo screenshots were provided."
prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots.
final_name = screenshot_filenames[-1] if screenshot_filenames else "N/A"
Task Instruction: {instruction}
prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
Task: {instruction}
{preconfig_section}
{expected_steps_section}
{img_info}
Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context.
════════════════════════════════════════════════════
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
Look at this image carefully NOW before anything else. Ask yourself:
"Does this final screenshot show the task is complete?"
Only after answering that, use earlier screenshots to understand HOW the agent got there.
════════════════════════════════════════════════════
CRITICAL SCORING RULES:
1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points.
2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress.
3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0.
4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress.
5. Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal.
SCORING RULES:
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
3. Require VISIBLE evidence in a specific screenshot for each step.
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.
SCORING GUIDE (0-10):
- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0.
- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal.
- 3-4: Some initial steps toward the task have been taken but the task is far from complete.
- 5-6: Significant progress - about half the required steps are completed with visible evidence.
- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues.
- 9: The task is essentially complete with very minor cosmetic differences.
- 10: The task is perfectly and completely finished with clear evidence in the final screenshot.
SCORE GUIDE (0-10):
- 0: No agent progress; only pre-configured state visible.
- 1-3: Minor actions taken, far from goal.
- 4-6: Meaningful progress, roughly half done.
- 7-8: Most steps done, minor issues.
- 9: Essentially complete, cosmetic differences only.
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.
IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
Respond with ONLY valid JSON (no extra text):
{{
"final_screenshot": "{final_name}",
"final_screenshot_description": "Describe exactly what you see in {final_name}",
"task_complete_in_final": true/false,
"steps_analysis": [
{{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}},
{{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}}
{{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
],
"final_completion": "True/False",
"score": 0-10
}}
Where:
- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps.
- "status": Either "Success" or "Fail" for each step
- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence
- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise
- "score": Integer from 0 to 10, following the strict scoring guide above
Remember: Return ONLY the JSON object, no additional text. Be STRICT - when in doubt, score LOWER."""
}}"""
try:
result = llm.generate_with_images(

View File

@@ -1,59 +0,0 @@
{
"id": "Origin_User_Guide_2025b_E_task1",
"snapshot": "origin",
"instruction": "在 Origin 中通过 Data → Connect to File 导入一个本地 Excel 文件 example.xlsx",
"source": "custom",
"config": [
{
"type": "upload_file",
"parameters": {
"files": [
{
"local_path": "evaluation_examples/data/origin/example.xlsx",
"path": "C:\\Users\\user\\Desktop\\example.xlsx"
}
]
}
},
{
"type": "launch",
"parameters": {
"command": [
"C:\\Program Files\\OriginLab\\Origin2025b\\Origin64.exe",
"C:\\Users\\user\\Desktop\\example.xlsx"
]
}
},
{
"type": "sleep",
"parameters": {
"seconds": 5
}
}
],
"trajectory": "trajectories/",
"related_apps": [
"origin"
],
"evaluator": {
"postconfig": [
{
"type": "sleep",
"parameters": {
"seconds": 3
}
}
],
"func": "vllm_eval"
},
"proxy": false,
"fixed_ip": false,
"possibility_of_env_change": "low",
"metadata": {
"input_files": [
"example.xlsx"
],
"steps": "1. 单击顶部主菜单栏中的 \"Data\" 菜单。\n2. 在展开的下拉菜单中,将鼠标悬停或单击 \"Connect to File\" 菜单项以展开子菜单。\n3. 在展开的子菜单中,单击选中 \"Excel...\" 选项。\n4. 在弹出的文件选择对话框中,单击选中文件名输入框将光标定位至此。\n5. 在文件名输入框中,输入文字 \"example.xlsx\"。\n6. 单击对话框右下角的 \"Open\"(或\"打开\")按钮。 \n7. 单击新弹出对话框中的 \"OK\" 按钮。",
"steps_original": "1. 在 Origin 的主菜单中选择 Data → Connect to File。\n2. 点击 Connect to File 菜单中的按钮。\n3. 选择文件 example.xlsx 并点击 Open。\n4. 数据将被加载到当前的工作表中。"
}
}

View File

@@ -26,7 +26,7 @@
],
"origin": [
"Origin_User_Guide_2025b_E_task2",
"Origin_User_Guide_2025b_E_task3",
"Origin_User_Guide_2025b_E_task3",
"Origin_User_Guide_2025b_E_task4",
"Origin_User_Guide_2025b_E_task5",
"Origin_User_Guide_2025b_E_task8",
@@ -70,4 +70,4 @@
"viewports_task10",
"viewports_task11"
]
}
}

309
reeval.py Normal file
View File

@@ -0,0 +1,309 @@
"""
Re-evaluation script for vllm_eval tasks.
Usage:
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10
"""
import argparse
import json
import os
import sys
import glob
import base64
import re
import logging
from io import BytesIO
from PIL import Image
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("reeval")
MAX_IMAGES = 10 # hard API limit
# Defaults matching run_proxmox.sh
_DEFAULT_API_KEY = "sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17"
_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
_DEFAULT_MODEL = "gemini-3.1-pro-preview"
# ── image helpers ──────────────────────────────────────────────────────────────
def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
try:
img_data = base64.b64decode(img_b64)
img = Image.open(BytesIO(img_data))
if img.mode in ("RGBA", "LA", "P"):
bg = Image.new("RGB", img.size, (255, 255, 255))
if img.mode == "P":
img = img.convert("RGBA")
bg.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None)
img = bg
if max(img.size) > max_size:
ratio = max_size / max(img.size)
img = img.resize(tuple(int(d * ratio) for d in img.size), Image.Resampling.LANCZOS)
buf = BytesIO()
img.save(buf, format="JPEG", quality=quality, optimize=True)
return base64.b64encode(buf.getvalue()).decode()
except Exception as e:
logger.warning(f"Compression failed: {e}")
return img_b64
def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
"""
Load up to max_images screenshots, always keeping FIRST and LAST,
sampling middle frames evenly. Returns (b64_list, name_list).
"""
pattern = os.path.join(result_dir, "step_*.png")
files = sorted(glob.glob(pattern), key=lambda p: int(re.search(r"step_(\d+)", p).group(1)))
if not files:
raise FileNotFoundError(f"No step_*.png found in {result_dir}")
n = len(files)
logger.info(f"Found {n} screenshots in {result_dir}")
if n <= max_images:
selected = files
else:
# Always keep first + last; fill rest with evenly-spaced middle frames
middle_slots = max_images - 2
step = (n - 2) / (middle_slots + 1)
indices = [0] + [int(round(i * step)) for i in range(1, middle_slots + 1)] + [n - 1]
indices = sorted(set(indices))
selected = [files[i] for i in indices]
b64_list, name_list = [], []
for fp in selected:
with open(fp, "rb") as f:
b64 = base64.b64encode(f.read()).decode()
if compress:
b64 = _compress_image(b64)
b64_list.append(b64)
m = re.match(r"(step_\d+)", os.path.basename(fp))
name_list.append(m.group(1) if m else os.path.basename(fp))
logger.info(f"Selected {len(name_list)} frames: {name_list}")
return b64_list, name_list
# ── LLM call ──────────────────────────────────────────────────────────────────
def _call_llm(model: str, prompt: str, images_b64: list) -> str:
from dotenv import load_dotenv
load_dotenv()
api_key = os.getenv("OPENAI_API_KEY", _DEFAULT_API_KEY)
base_url = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL)
# gpt-* and gemini-* both go through the OpenAI-compatible proxy (vip.apiyi.com)
if model.startswith("gpt") or model.startswith("gemini"):
from openai import OpenAI
client = OpenAI(api_key=api_key, base_url=base_url)
content = [{"type": "text", "text": prompt}]
for b64 in images_b64:
content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}})
resp = client.chat.completions.create(
model=model,
messages=[{"role": "user", "content": content}],
temperature=0.2,
max_tokens=4096,
)
return resp.choices[0].message.content
elif model.startswith("claude"):
from anthropic import Anthropic
client = Anthropic(
base_url=os.getenv("ANTHROPIC_BASE_URL"),
api_key=os.getenv("ANTHROPIC_API_KEY"),
)
content = [{"type": "text", "text": prompt}]
for b64 in images_b64:
content.append({"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}})
resp = client.messages.create(
model=model,
messages=[{"role": "user", "content": content}],
temperature=0.2,
max_tokens=4096,
)
return resp.content[0].text
else:
raise ValueError(f"Unknown model prefix: {model}")
# ── prompt builder ─────────────────────────────────────────────────────────────
def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
# Pre-config section
preconfig_lines = []
for cfg in config:
if cfg.get("type") == "launch":
cmds = cfg.get("parameters", {}).get("command", [])
if cmds:
preconfig_lines.append(f" - '{os.path.basename(cmds[0])}' was auto-launched (NOT agent's work).")
elif cfg.get("type") == "open":
path = cfg.get("parameters", {}).get("path", "")
preconfig_lines.append(f" - '{path}' was auto-opened (NOT agent's work).")
preconfig_section = (
"PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n" +
"\n".join(preconfig_lines)
) if preconfig_lines else ""
expected_steps = metadata.get("steps", "")
expected_section = (
f"EXPECTED STEPS (reference only):\n{expected_steps}"
) if expected_steps else ""
final_name = name_list[-1]
img_list_str = ", ".join(name_list)
prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
Task: {instruction}
{preconfig_section}
{expected_section}
You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}
════════════════════════════════════════════════════
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
Look at this image carefully NOW before anything else. Ask yourself:
"Does this final screenshot show the task is complete?"
Only after answering that, use earlier screenshots to understand HOW the agent got there.
════════════════════════════════════════════════════
SCORING RULES:
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
3. Require VISIBLE evidence in a specific screenshot for each step.
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.
SCORE GUIDE (010):
- 0: No agent progress; only pre-configured state visible.
- 13: Minor actions taken, far from goal.
- 46: Meaningful progress, roughly half done.
- 78: Most steps done, minor issues.
- 9: Essentially complete, cosmetic differences only.
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.
Respond with ONLY valid JSON (no extra text):
{{
"final_screenshot": "{final_name}",
"final_screenshot_description": "Describe exactly what you see in {final_name}",
"task_complete_in_final": true/false,
"steps_analysis": [
{{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
],
"final_completion": "True/False",
"score": 0-10
}}"""
return prompt
# ── parse response ─────────────────────────────────────────────────────────────
def _parse_response(text: str) -> dict:
text = text.strip()
m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
if m:
text = m.group(1)
else:
text = re.sub(r'^```(?:json)?\s*', '', text)
text = re.sub(r'\s*```$', '', text)
try:
return json.loads(text)
except json.JSONDecodeError:
# try extracting bare JSON object
m2 = re.search(r'\{.*\}', text, re.DOTALL)
if m2:
try:
return json.loads(m2.group(0))
except Exception:
pass
logger.error(f"Could not parse JSON from:\n{text[:500]}")
return {"score": 0, "final_completion": "False", "steps_analysis": []}
# ── main ──────────────────────────────────────────────────────────────────────
def main():
parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
parser.add_argument("--task", required=True, help="Path to task JSON file")
parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
parser.add_argument("--max_images", type=int, default=MAX_IMAGES, help="Max screenshots to send (default: 10)")
parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
parser.add_argument("--output", default=None, help="Output JSON path (default: <result_dir>/reeval_result.json)")
args = parser.parse_args()
# Load task
with open(args.task, "r", encoding="utf-8") as f:
task = json.load(f)
instruction = task.get("instruction", "")
config = task.get("config", [])
metadata = task.get("metadata", {})
logger.info(f"Task: {task.get('id', '?')}")
logger.info(f"Instruction: {instruction}")
# Load screenshots
images_b64, name_list = _load_screenshots(
args.result_dir,
max_images=args.max_images,
compress=not args.no_compress,
)
# Build prompt
prompt = _build_prompt(instruction, config, metadata, name_list)
# Call LLM
logger.info(f"Calling {args.model} with {len(images_b64)} images...")
raw = _call_llm(args.model, prompt, images_b64)
logger.info(f"Raw response:\n{raw}")
# Parse
result = _parse_response(raw)
score_raw = float(result.get("score", 0))
score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)
# Add metadata to output
result["_task_id"] = task.get("id", "")
result["_model"] = args.model
result["_frames_used"] = name_list
result["_score_normalized"] = score_norm
# Save
out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
with open(out_path, "w", encoding="utf-8") as f:
json.dump(result, f, ensure_ascii=False, indent=2)
# Print summary
print("\n" + "="*60)
print(f"Task: {task.get('id', '?')}")
print(f"Frames used: {name_list}")
print(f"Final screenshot: {name_list[-1]}")
if "final_screenshot_description" in result:
print(f"Final state desc: {result['final_screenshot_description']}")
print(f"Task complete: {result.get('task_complete_in_final', '?')}")
print(f"final_completion: {result.get('final_completion', '?')}")
print(f"Score (0-10): {score_raw} → normalized: {score_norm}")
print(f"Result saved to: {out_path}")
print("="*60 + "\n")
if result.get("steps_analysis"):
print("Steps analysis:")
for s in result["steps_analysis"]:
status_icon = "" if s.get("status") == "Success" else ""
print(f" [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")
if __name__ == "__main__":
main()