fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序 - 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST) - 评估temperature从0.7降至0.2提升一致性 - 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹 - 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation) - test_final.json新增avogadro(11题)和origin(8题)
This commit is contained in:
87
batch_reeval.py
Normal file
87
batch_reeval.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
Batch re-evaluation based on test_final.json.

Usage:
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
    python3 batch_reeval.py --results_dir <path> --force  # re-run even if already done
"""

import argparse
import json
import os
import subprocess
import sys
import glob

# Directory containing this script; the data paths below are resolved
# relative to it so the script works regardless of the caller's CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# JSON mapping of software name -> list of task ids to (re-)evaluate.
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")

# Per-task configuration JSONs, laid out as <software>/<task_id>.json.
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")

# Single-task re-evaluation script, invoked once per task as a subprocess.
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
||||
def main():
    """Re-run reeval.py over every task listed in test_final.json.

    For each (software, task_id) pair, locate the trajectory directory
    under --results_dir and the task-config JSON under TASK_CONFIG_DIR.
    Tasks with a missing directory, missing config, no step_*.png
    screenshots, or an existing reeval_result.json (unless --force) are
    skipped with a printed reason.  The remaining tasks are evaluated
    one at a time by running reeval.py as a subprocess; a summary of
    successes and failures is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    args = parser.parse_args()

    # test_final.json can reference non-ASCII paths/ids (see the usage
    # examples); read it as UTF-8 explicitly rather than trusting the
    # platform default encoding.
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    tasks, skipped = _collect_tasks(test_final, args.results_dir, args.force)

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] {sw}/{task_id}")
        if _run_one(task_json, result_dir):
            ok += 1
        else:
            failed.append(f"{sw}/{task_id}")

    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)


def _collect_tasks(test_final, results_dir, force):
    """Build the runnable task list from the test_final mapping.

    Returns (tasks, skipped): tasks is a list of
    (task_json, result_dir, software, task_id) tuples ready to run;
    skipped is a list of human-readable strings explaining why a
    listed task was not queued.
    """
    tasks, skipped = [], []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(results_dir, sw, task_id)
            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")

            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
                continue
            if not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
                continue
            # A trajectory with no screenshot frames cannot be re-scored.
            if not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
                continue
            if os.path.exists(reeval_out) and not force:
                skipped.append(f" already done: {sw}/{task_id}")
                continue

            tasks.append((task_json, result_dir, sw, task_id))
    return tasks, skipped


def _run_one(task_json, result_dir):
    """Run reeval.py for one task and echo its key output lines.

    Returns True when the subprocess exits 0, False otherwise (the
    tail of stderr is printed so the batch log stays readable).
    """
    r = subprocess.run(
        [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
        capture_output=True, text=True
    )
    if r.returncode != 0:
        print(f" ERROR: {r.stderr[-300:]}")
        return False
    for line in r.stdout.splitlines():
        # Surface only the score/verdict lines emitted by reeval.py.
        if "normalized" in line or "Task complete" in line or "Score" in line:
            print(f" {line.strip()}")
    return True
|
||||
Reference in New Issue
Block a user