""" Batch re-evaluation based on test_final.json. Usage: python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4 python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4 python3 batch_reeval.py --results_dir --force # re-run even if already done """ import argparse import json import os import subprocess import sys import glob SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__)) TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json") TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples") REEVAL = os.path.join(SCRIPT_DIR, "reeval.py") def main(): parser = argparse.ArgumentParser() parser.add_argument("--results_dir", required=True, help="e.g. /Volumes/.../screenshot/gpt-5.4") parser.add_argument("--force", action="store_true", help="Re-evaluate even if reeval_result.json already exists") args = parser.parse_args() with open(TEST_FINAL) as f: test_final = json.load(f) # Build task list tasks = [] skipped = [] for sw, task_ids in test_final.items(): for task_id in task_ids: result_dir = os.path.join(args.results_dir, sw, task_id) task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json") reeval_out = os.path.join(result_dir, "reeval_result.json") if not os.path.isdir(result_dir): skipped.append(f" NO result_dir: {sw}/{task_id}") continue if not os.path.exists(task_json): skipped.append(f" NO task JSON: {sw}/{task_id}") continue if not glob.glob(os.path.join(result_dir, "step_*.png")): skipped.append(f" NO screenshots: {sw}/{task_id}") continue if os.path.exists(reeval_out) and not args.force: skipped.append(f" already done: {sw}/{task_id}") continue tasks.append((task_json, result_dir, sw, task_id)) print(f"Tasks to evaluate: {len(tasks)}") if skipped: print(f"Skipped ({len(skipped)}):") for s in skipped: print(s) print() ok, failed = 0, [] for i, (task_json, result_dir, sw, task_id) in enumerate(tasks): print(f"[{i+1}/{len(tasks)}] {sw}/{task_id}") r = subprocess.run( [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir], capture_output=True, text=True ) if r.returncode != 0: print(f" ERROR: {r.stderr[-300:]}") failed.append(f"{sw}/{task_id}") else: for line in r.stdout.splitlines(): if "normalized" in line or "Task complete" in line or "Score" in line: print(f" {line.strip()}") ok += 1 print(f"\nDone: {ok} succeeded, {len(failed)} failed") if failed: print("Failed tasks:", failed) if __name__ == "__main__": main()