Files
sci-gui-agent-benchmark/batch_reeval.py
lizhanyuan 252d2f79ce fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序
- 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST)
- 评估temperature从0.7降至0.2提升一致性
- 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹
- 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation)
- test_final.json新增avogadro(11题)和origin(8题)
2026-03-27 14:34:32 +08:00

88 lines
3.1 KiB
Python

"""
Batch re-evaluation based on test_final.json.
Usage:
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
python3 batch_reeval.py --results_dir <path> --force # re-run even if already done
"""
import argparse
import json
import os
import subprocess
import sys
import glob
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
def _collect_tasks(results_dir, force):
    """Build the list of tasks eligible for re-evaluation.

    Reads TEST_FINAL (software -> [task_id, ...]) and, for each task,
    checks that the result directory, task config JSON, and at least one
    step_*.png screenshot exist. Tasks with an existing reeval_result.json
    are skipped unless *force* is true.

    Returns:
        (tasks, skipped): tasks is a list of
        (task_json_path, result_dir, software, task_id) tuples;
        skipped is a list of human-readable reason strings.
    """
    # JSON is UTF-8 by spec; be explicit so reading does not depend on the
    # platform's locale encoding (paths here contain non-ASCII characters).
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    tasks, skipped = [], []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(results_dir, sw, task_id)
            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")
            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
                continue
            if not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
                continue
            # A trajectory with no screenshots cannot be re-scored.
            if not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
                continue
            if os.path.exists(reeval_out) and not force:
                skipped.append(f" already done: {sw}/{task_id}")
                continue
            tasks.append((task_json, result_dir, sw, task_id))
    return tasks, skipped


def _run_tasks(tasks):
    """Run REEVAL on each task via subprocess; return (ok_count, failed_ids).

    On success, echoes the score-related lines from the child's stdout;
    on failure, prints the tail of its stderr.
    """
    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks):
        print(f"[{i+1}/{len(tasks)}] {sw}/{task_id}")
        # List-form argv (shell=False): task ids / paths are passed verbatim.
        r = subprocess.run(
            [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
            capture_output=True, text=True
        )
        if r.returncode != 0:
            # Keep the log compact: only the last 300 chars of stderr.
            print(f" ERROR: {r.stderr[-300:]}")
            failed.append(f"{sw}/{task_id}")
        else:
            # Surface only the interesting summary lines from reeval.py.
            for line in r.stdout.splitlines():
                if "normalized" in line or "Task complete" in line or "Score" in line:
                    print(f" {line.strip()}")
            ok += 1
    return ok, failed


def main():
    """CLI entry point: collect eligible tasks, report skips, run re-evaluation."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    args = parser.parse_args()

    tasks, skipped = _collect_tasks(args.results_dir, args.force)

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = _run_tasks(tasks)
    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)
if __name__ == "__main__":
main()