# Changelog (translated from the original commit notes):
# - Fix bug in _load_screenshots_from_dir where string sorting of screenshots
#   caused step_9 to be misjudged as the final frame; switched to numeric sorting.
# - Align prompt logic with reeval.py: explicitly require the model to examine
#   the final screenshot first (STEP 1 EXAMINE FINAL SCREENSHOT FIRST).
# - Lower evaluation temperature from 0.7 to 0.2 for better consistency.
# - Add batch_reeval.py: batch re-evaluation of existing trajectories based on test_final.json.
# - Add reeval.py: single-task re-evaluation script (final-frame-anchored evaluation).
# - test_final.json: add avogadro (11 tasks) and origin (8 tasks).
# (File stats from the source listing: 88 lines, 3.1 KiB, Python)
"""
|
|
Batch re-evaluation based on test_final.json.
|
|
|
|
Usage:
|
|
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
|
|
python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
|
|
python3 batch_reeval.py --results_dir <path> --force # re-run even if already done
|
|
"""
|
|
|
|
import argparse
import glob
import json
import os
import subprocess
import sys
# Directory containing this script; all paths below are resolved relative to it.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# Master task list: JSON mapping {software_name: [task_id, ...]}.
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
# Per-task config JSONs live at <TASK_CONFIG_DIR>/<software>/<task_id>.json.
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
# Single-task re-evaluation script invoked once per task by main().
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
def main():
    """Re-evaluate every finished task under --results_dir using reeval.py.

    Reads the task list from test_final.json, skips tasks whose inputs are
    missing (no result dir, no task config, no screenshots) or that already
    have a reeval_result.json (unless --force), then runs reeval.py as a
    subprocess for each remaining task and prints a success/failure summary.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    args = parser.parse_args()

    # Explicit encoding: the surrounding paths/config may contain non-ASCII
    # characters, so don't rely on the platform default.
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    # Build the worklist; record a human-readable reason for every skip.
    tasks = []
    skipped = []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(args.results_dir, sw, task_id)
            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")

            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
                continue
            if not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
                continue
            # A trajectory with no step_*.png screenshots cannot be re-scored.
            if not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
                continue
            if os.path.exists(reeval_out) and not args.force:
                skipped.append(f" already done: {sw}/{task_id}")
                continue

            tasks.append((task_json, result_dir, sw, task_id))

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] {sw}/{task_id}")
        # Run reeval.py with the same interpreter; subprocess.run does not
        # raise on a nonzero exit, so failures are collected, not fatal.
        r = subprocess.run(
            [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
            capture_output=True, text=True
        )
        if r.returncode != 0:
            # Only show the tail of stderr to keep the log readable.
            print(f" ERROR: {r.stderr[-300:]}")
            failed.append(f"{sw}/{task_id}")
        else:
            # Surface just the scoring lines from reeval.py's stdout.
            for line in r.stdout.splitlines():
                if "normalized" in line or "Task complete" in line or "Score" in line:
                    print(f" {line.strip()}")
            ok += 1

    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)
if __name__ == "__main__":
|
|
main()
|