# Changelog (translated from the original commit notes):
# - Fix bug in _load_screenshots_from_dir where string sorting of screenshots
#   caused step_9 to be misjudged as the final frame; switched to numeric sorting.
# - Align prompt logic with reeval.py: explicitly require the model to examine
#   the final screenshot first (STEP 1 EXAMINE FINAL SCREENSHOT FIRST).
# - Lower evaluation temperature from 0.7 to 0.2 for better consistency.
# - Add batch_reeval.py: batch re-evaluation of existing trajectories based on test_final.json.
# - Add reeval.py: single-task re-evaluation script (final-frame-anchored evaluation).
# - test_final.json: add avogadro (11 tasks) and origin (8 tasks).
# (File stats from the source listing: 88 lines, 3.1 KiB, Python)
"""
|
|
Batch re-evaluation based on test_final.json.
|
|
|
|
Usage:
|
|
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
|
|
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
|
|
python3 batch_reeval.py --results_dir <path> --force # re-run even if already done
|
|
"""
|
|
|
|
import argparse
import glob
import json
import os
import subprocess
import sys
# Directory containing this script; all paths below are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Master task list: JSON mapping {software_name: [task_id, ...]}.
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
# Per-task config JSONs live at <TASK_CONFIG_DIR>/<software>/<task_id>.json.
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
# Single-task re-evaluation script invoked once per task by main().
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
def main():
    """Re-evaluate every finished task under --results_dir using reeval.py.

    Reads the task list from test_final.json, skips tasks whose inputs are
    missing (no result dir, no task config, no screenshots) or that already
    have a reeval_result.json (unless --force), then runs reeval.py as a
    subprocess for each remaining task and prints a success/failure summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    args = parser.parse_args()

    # Explicit encoding: the surrounding paths/config may contain non-ASCII
    # characters, so don't rely on the platform default.
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    # Build the worklist; record a human-readable reason for every skip.
    tasks = []
    skipped = []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(args.results_dir, sw, task_id)
            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")

            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
                continue
            if not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
                continue
            # A trajectory with no step_*.png screenshots cannot be re-scored.
            if not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
                continue
            if os.path.exists(reeval_out) and not args.force:
                skipped.append(f" already done: {sw}/{task_id}")
                continue

            tasks.append((task_json, result_dir, sw, task_id))

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] {sw}/{task_id}")
        # Run reeval.py with the same interpreter; subprocess.run does not
        # raise on a nonzero exit, so failures are collected, not fatal.
        r = subprocess.run(
            [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
            capture_output=True, text=True
        )
        if r.returncode != 0:
            # Only show the tail of stderr to keep the log readable.
            print(f" ERROR: {r.stderr[-300:]}")
            failed.append(f"{sw}/{task_id}")
        else:
            # Surface just the scoring lines from reeval.py's stdout.
            for line in r.stdout.splitlines():
                if "normalized" in line or "Task complete" in line or "Score" in line:
                    print(f" {line.strip()}")
            ok += 1

    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)
if __name__ == "__main__":
|
|
main()
|