fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑
- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug,改为数字排序 - 对齐reeval.py的prompt逻辑:明确要求模型优先检查最终截图(STEP 1 EXAMINE FINAL SCREENSHOT FIRST) - 评估temperature从0.7降至0.2提升一致性 - 新增batch_reeval.py:基于test_final.json批量重评测已有轨迹 - 新增reeval.py:单任务重评测脚本(final-frame-anchored evaluation) - test_final.json新增avogadro(11题)和origin(8题)
This commit is contained in:
87
batch_reeval.py
Normal file
87
batch_reeval.py
Normal file
@@ -0,0 +1,87 @@
|
||||
"""
Batch re-evaluation based on test_final.json.

Usage:
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
    python3 batch_reeval.py --results_dir <path> --force  # re-run even if already done
"""

import argparse
import json
import os
import subprocess
import sys
import glob

# Directory containing this script; the data paths below are resolved
# relative to it so the script works regardless of the caller's CWD.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))

# JSON mapping of software name -> list of task ids to (re-)evaluate.
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")

# Per-task configuration JSONs, laid out as <software>/<task_id>.json.
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")

# Single-task re-evaluation script, invoked once per task as a subprocess.
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
||||
def main():
    """Re-run reeval.py over every task listed in test_final.json.

    For each (software, task_id) pair, locate the trajectory directory
    under --results_dir and the task-config JSON under TASK_CONFIG_DIR.
    Tasks with a missing directory, missing config, no step_*.png
    screenshots, or an existing reeval_result.json (unless --force) are
    skipped with a printed reason.  The remaining tasks are evaluated
    one at a time by running reeval.py as a subprocess; a summary of
    successes and failures is printed at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    args = parser.parse_args()

    # test_final.json can reference non-ASCII paths/ids (see the usage
    # examples); read it as UTF-8 explicitly rather than trusting the
    # platform default encoding.
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    tasks, skipped = _collect_tasks(test_final, args.results_dir, args.force)

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks, 1):
        print(f"[{i}/{len(tasks)}] {sw}/{task_id}")
        if _run_one(task_json, result_dir):
            ok += 1
        else:
            failed.append(f"{sw}/{task_id}")

    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)


def _collect_tasks(test_final, results_dir, force):
    """Build the runnable task list from the test_final mapping.

    Returns (tasks, skipped): tasks is a list of
    (task_json, result_dir, software, task_id) tuples ready to run;
    skipped is a list of human-readable strings explaining why a
    listed task was not queued.
    """
    tasks, skipped = [], []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(results_dir, sw, task_id)
            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")

            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
                continue
            if not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
                continue
            # A trajectory with no screenshot frames cannot be re-scored.
            if not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
                continue
            if os.path.exists(reeval_out) and not force:
                skipped.append(f" already done: {sw}/{task_id}")
                continue

            tasks.append((task_json, result_dir, sw, task_id))
    return tasks, skipped


def _run_one(task_json, result_dir):
    """Run reeval.py for one task and echo its key output lines.

    Returns True when the subprocess exits 0, False otherwise (the
    tail of stderr is printed so the batch log stays readable).
    """
    r = subprocess.run(
        [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
        capture_output=True, text=True
    )
    if r.returncode != 0:
        print(f" ERROR: {r.stderr[-300:]}")
        return False
    for line in r.stdout.splitlines():
        # Surface only the score/verdict lines emitted by reeval.py.
        if "normalized" in line or "Task complete" in line or "Score" in line:
            print(f" {line.strip()}")
    return True
|
||||
Reference in New Issue
Block a user