fix(eval): 修复vllm_eval截图排序bug并对齐reeval逻辑

- 修复_load_screenshots_from_dir中截图按字符串排序导致step_9被误判为最终帧的bug，改为数字排序 - 对齐reeval.py的prompt逻辑：明确要求模型优先检查最终截图（STEP 1 EXAMINE FINAL SCREENSHOT FIRST） - 评估temperature从0.7降至0.2提升一致性 - 新增batch_reeval.py：基于test_final.json批量重评测已有轨迹 - 新增reeval.py：单任务重评测脚本（final-frame-anchored evaluation） - test_final.json新增avogadro(11题)和origin(8题)
2026-03-27 14:25:45 +08:00
parent 4e192cf013
commit 252d2f79ce
5 changed files with 434 additions and 94 deletions
--- a/batch_reeval.py
+++ b/batch_reeval.py
@@ -0,0 +1,87 @@
+"""
+Batch re-evaluation based on test_final.json.
+
+Usage:
+    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
+    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
+    python3 batch_reeval.py --results_dir <path> --force   # re-run even if already done
+"""
+
+import argparse
+import json
+import os
+import subprocess
+import sys
+import glob
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
+TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
+TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
+REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--results_dir", required=True,
+                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
+    parser.add_argument("--force", action="store_true",
+                        help="Re-evaluate even if reeval_result.json already exists")
+    args = parser.parse_args()
+
+    with open(TEST_FINAL) as f:
+        test_final = json.load(f)
+
+    # Build task list
+    tasks = []
+    skipped = []
+    for sw, task_ids in test_final.items():
+        for task_id in task_ids:
+            result_dir = os.path.join(args.results_dir, sw, task_id)
+            task_json = os.path.join(TASK_CONFIG_DIR, sw, f"{task_id}.json")
+            reeval_out = os.path.join(result_dir, "reeval_result.json")
+
+            if not os.path.isdir(result_dir):
+                skipped.append(f"  NO result_dir:  {sw}/{task_id}")
+                continue
+            if not os.path.exists(task_json):
+                skipped.append(f"  NO task JSON:   {sw}/{task_id}")
+                continue
+            if not glob.glob(os.path.join(result_dir, "step_*.png")):
+                skipped.append(f"  NO screenshots: {sw}/{task_id}")
+                continue
+            if os.path.exists(reeval_out) and not args.force:
+                skipped.append(f"  already done:   {sw}/{task_id}")
+                continue
+
+            tasks.append((task_json, result_dir, sw, task_id))
+
+    print(f"Tasks to evaluate: {len(tasks)}")
+    if skipped:
+        print(f"Skipped ({len(skipped)}):")
+        for s in skipped:
+            print(s)
+    print()
+
+    ok, failed = 0, []
+    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks):
+        print(f"[{i+1}/{len(tasks)}] {sw}/{task_id}")
+        r = subprocess.run(
+            [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir],
+            capture_output=True, text=True
+        )
+        if r.returncode != 0:
+            print(f"  ERROR: {r.stderr[-300:]}")
+            failed.append(f"{sw}/{task_id}")
+        else:
+            for line in r.stdout.splitlines():
+                if "normalized" in line or "Task complete" in line or "Score" in line:
+                    print(f"  {line.strip()}")
+            ok += 1
+
+    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
+    if failed:
+        print("Failed tasks:", failed)
+
+
+if __name__ == "__main__":
+    main()
--- a/desktop_env/evaluators/metrics/vllm_eval.py
+++ b/desktop_env/evaluators/metrics/vllm_eval.py
@@ -347,8 +347,13 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = False, max_size
    filenames = []

    # Find all step screenshot files (e.g., step_1_20240101@120000.png)
+    # Sort numerically by step number to avoid lexicographic issues (step_10 < step_2 in string sort)
+    import re as _re_sort
    pattern = os.path.join(result_dir, "step_*.png")
-    screenshot_files = sorted(glob.glob(pattern))
+    screenshot_files = sorted(
+        glob.glob(pattern),
+        key=lambda p: int(_re_sort.search(r"step_(\d+)", os.path.basename(p)).group(1))
+    )

    if not screenshot_files:
        logger.warning(f"No screenshot files found in {result_dir}")
@@ -446,7 +451,7 @@ def vllm_eval(result_state, **options) -> float:
    metadata = options.get("metadata", {})

    params = {
-        "temperature": options.get("temperature", 0.7),
+        "temperature": options.get("temperature", 0.2),
        "max_tokens": options.get("max_tokens", 16384),
        "top_p": options.get("top_p", 1.0)
    }
@@ -493,51 +498,49 @@ IMPORTANT: Only reference screenshots from the list above. Do NOT reference any
    else:
        img_info = "\nNo screenshots were provided."

-    prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots.
+    final_name = screenshot_filenames[-1] if screenshot_filenames else "N/A"

-Task Instruction: {instruction}
+    prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
+
+Task: {instruction}
 {preconfig_section}
 {expected_steps_section}
 {img_info}

-Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context.
+════════════════════════════════════════════════════
+STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
+The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
+Look at this image carefully NOW before anything else. Ask yourself:
+  "Does this final screenshot show the task is complete?"
+Only after answering that, use earlier screenshots to understand HOW the agent got there.
+════════════════════════════════════════════════════

-CRITICAL SCORING RULES:
-1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points.
-2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress.
-3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0.
-4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress.
-5. Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal.
+SCORING RULES:
+1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
+2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
+3. Require VISIBLE evidence in a specific screenshot for each step.
+4. If the final screenshot shows the task is done, score high even if earlier steps were messy.

-SCORING GUIDE (0-10):
- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0.
- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal.
- 3-4: Some initial steps toward the task have been taken but the task is far from complete.
- 5-6: Significant progress - about half the required steps are completed with visible evidence.
- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues.
- 9: The task is essentially complete with very minor cosmetic differences.
- 10: The task is perfectly and completely finished with clear evidence in the final screenshot.
+SCORE GUIDE (0-10):
+- 0: No agent progress; only pre-configured state visible.
+- 1-3: Minor actions taken, far from goal.
+- 4-6: Meaningful progress, roughly half done.
+- 7-8: Most steps done, minor issues.
+- 9: Essentially complete, cosmetic differences only.
+- 10: Fully and perfectly complete with clear visual proof in the final screenshot.

-IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
+Respond with ONLY valid JSON (no extra text):

 {{
+  "final_screenshot": "{final_name}",
+  "final_screenshot_description": "Describe exactly what you see in {final_name}",
+  "task_complete_in_final": true/false,
  "steps_analysis": [
-    {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}},
-    {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}}
+    {{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
  ],
  "final_completion": "True/False",
  "score": 0-10
-}}
-
-Where:
- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps.
- "status": Either "Success" or "Fail" for each step
- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence
- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise
- "score": Integer from 0 to 10, following the strict scoring guide above
-
-Remember: Return ONLY the JSON object, no additional text. Be STRICT - when in doubt, score LOWER."""
+}}"""

    try:
        result = llm.generate_with_images(
--- a/evaluation_examples/examples/origin/Origin_User_Guide_2025b_E_task1.json
+++ b/evaluation_examples/examples/origin/Origin_User_Guide_2025b_E_task1.json
@@ -1,59 +0,0 @@
-{
-  "id": "Origin_User_Guide_2025b_E_task1",
-  "snapshot": "origin",
-  "instruction": "在 Origin 中通过 Data → Connect to File 导入一个本地 Excel 文件 example.xlsx",
-  "source": "custom",
-  "config": [
-    {
-      "type": "upload_file",
-      "parameters": {
-        "files": [
-          {
-            "local_path": "evaluation_examples/data/origin/example.xlsx",
-            "path": "C:\\Users\\user\\Desktop\\example.xlsx"
-          }
-        ]
-      }
-    },
-    {
-      "type": "launch",
-      "parameters": {
-        "command": [
-          "C:\\Program Files\\OriginLab\\Origin2025b\\Origin64.exe",
-          "C:\\Users\\user\\Desktop\\example.xlsx"
-        ]
-      }
-    },
-    {
-      "type": "sleep",
-      "parameters": {
-        "seconds": 5
-      }
-    }
-  ],
-  "trajectory": "trajectories/",
-  "related_apps": [
-    "origin"
-  ],
-  "evaluator": {
-    "postconfig": [
-      {
-        "type": "sleep",
-        "parameters": {
-          "seconds": 3
-        }
-      }
-    ],
-    "func": "vllm_eval"
-  },
-  "proxy": false,
-  "fixed_ip": false,
-  "possibility_of_env_change": "low",
-  "metadata": {
-    "input_files": [
-      "example.xlsx"
-    ],
-    "steps": "1. 单击顶部主菜单栏中的 \"Data\" 菜单。\n2. 在展开的下拉菜单中，将鼠标悬停或单击 \"Connect to File\" 菜单项以展开子菜单。\n3. 在展开的子菜单中，单击选中 \"Excel...\" 选项。\n4. 在弹出的文件选择对话框中，单击选中文件名输入框将光标定位至此。\n5. 在文件名输入框中，输入文字 \"example.xlsx\"。\n6. 单击对话框右下角的 \"Open\"（或\"打开\"）按钮。 \n7. 单击新弹出对话框中的 \"OK\" 按钮。",
-    "steps_original": "1. 在 Origin 的主菜单中选择 Data → Connect to File。\n2. 点击 Connect to File 菜单中的按钮。\n3. 选择文件 example.xlsx 并点击 Open。\n4. 数据将被加载到当前的工作表中。"
-  }
-}
--- a/evaluation_examples/test_final.json
+++ b/evaluation_examples/test_final.json
@@ -26,7 +26,7 @@
  ],
  "origin": [
    "Origin_User_Guide_2025b_E_task2",
-    "Origin_User_Guide_2025b_E_task3", 
+    "Origin_User_Guide_2025b_E_task3",
    "Origin_User_Guide_2025b_E_task4",
    "Origin_User_Guide_2025b_E_task5",
    "Origin_User_Guide_2025b_E_task8",
@@ -70,4 +70,4 @@
    "viewports_task10",
    "viewports_task11"
  ]
-}
+}
--- a/reeval.py
+++ b/reeval.py
@@ -0,0 +1,309 @@
+"""
+Re-evaluation script for vllm_eval tasks.
+
+Usage:
+    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
+    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10
+"""
+
+import argparse
+import json
+import os
+import sys
+import glob
+import base64
+import re
+import logging
+from io import BytesIO
+from PIL import Image
+
+logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+logger = logging.getLogger("reeval")
+
+MAX_IMAGES = 10  # hard API limit
+
+# Defaults matching run_proxmox.sh
+_DEFAULT_API_KEY  = "sk-EQGuvk0rS7EG4Cu22cF6D5Cc3a324c88B2E2D432Bc59Bb17"
+_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
+_DEFAULT_MODEL    = "gemini-3.1-pro-preview"
+
+
+# ── image helpers ──────────────────────────────────────────────────────────────
+
+def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
+    try:
+        img_data = base64.b64decode(img_b64)
+        img = Image.open(BytesIO(img_data))
+        if img.mode in ("RGBA", "LA", "P"):
+            bg = Image.new("RGB", img.size, (255, 255, 255))
+            if img.mode == "P":
+                img = img.convert("RGBA")
+            bg.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None)
+            img = bg
+        if max(img.size) > max_size:
+            ratio = max_size / max(img.size)
+            img = img.resize(tuple(int(d * ratio) for d in img.size), Image.Resampling.LANCZOS)
+        buf = BytesIO()
+        img.save(buf, format="JPEG", quality=quality, optimize=True)
+        return base64.b64encode(buf.getvalue()).decode()
+    except Exception as e:
+        logger.warning(f"Compression failed: {e}")
+        return img_b64
+
+
+def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
+    """
+    Load up to max_images screenshots, always keeping FIRST and LAST,
+    sampling middle frames evenly. Returns (b64_list, name_list).
+    """
+    pattern = os.path.join(result_dir, "step_*.png")
+    files = sorted(glob.glob(pattern), key=lambda p: int(re.search(r"step_(\d+)", p).group(1)))
+
+    if not files:
+        raise FileNotFoundError(f"No step_*.png found in {result_dir}")
+
+    n = len(files)
+    logger.info(f"Found {n} screenshots in {result_dir}")
+
+    if n <= max_images:
+        selected = files
+    else:
+        # Always keep first + last; fill rest with evenly-spaced middle frames
+        middle_slots = max_images - 2
+        step = (n - 2) / (middle_slots + 1)
+        indices = [0] + [int(round(i * step)) for i in range(1, middle_slots + 1)] + [n - 1]
+        indices = sorted(set(indices))
+        selected = [files[i] for i in indices]
+
+    b64_list, name_list = [], []
+    for fp in selected:
+        with open(fp, "rb") as f:
+            b64 = base64.b64encode(f.read()).decode()
+        if compress:
+            b64 = _compress_image(b64)
+        b64_list.append(b64)
+        m = re.match(r"(step_\d+)", os.path.basename(fp))
+        name_list.append(m.group(1) if m else os.path.basename(fp))
+
+    logger.info(f"Selected {len(name_list)} frames: {name_list}")
+    return b64_list, name_list
+
+
+# ── LLM call ──────────────────────────────────────────────────────────────────
+
+def _call_llm(model: str, prompt: str, images_b64: list) -> str:
+    from dotenv import load_dotenv
+    load_dotenv()
+
+    api_key  = os.getenv("OPENAI_API_KEY",  _DEFAULT_API_KEY)
+    base_url = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL)
+
+    # gpt-* and gemini-* both go through the OpenAI-compatible proxy (vip.apiyi.com)
+    if model.startswith("gpt") or model.startswith("gemini"):
+        from openai import OpenAI
+        client = OpenAI(api_key=api_key, base_url=base_url)
+        content = [{"type": "text", "text": prompt}]
+        for b64 in images_b64:
+            content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{b64}"}})
+        resp = client.chat.completions.create(
+            model=model,
+            messages=[{"role": "user", "content": content}],
+            temperature=0.2,
+            max_tokens=4096,
+        )
+        return resp.choices[0].message.content
+
+    elif model.startswith("claude"):
+        from anthropic import Anthropic
+        client = Anthropic(
+            base_url=os.getenv("ANTHROPIC_BASE_URL"),
+            api_key=os.getenv("ANTHROPIC_API_KEY"),
+        )
+        content = [{"type": "text", "text": prompt}]
+        for b64 in images_b64:
+            content.append({"type": "image", "source": {"type": "base64", "media_type": "image/jpeg", "data": b64}})
+        resp = client.messages.create(
+            model=model,
+            messages=[{"role": "user", "content": content}],
+            temperature=0.2,
+            max_tokens=4096,
+        )
+        return resp.content[0].text
+
+    else:
+        raise ValueError(f"Unknown model prefix: {model}")
+
+
+# ── prompt builder ─────────────────────────────────────────────────────────────
+
+def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
+    # Pre-config section
+    preconfig_lines = []
+    for cfg in config:
+        if cfg.get("type") == "launch":
+            cmds = cfg.get("parameters", {}).get("command", [])
+            if cmds:
+                preconfig_lines.append(f"  - '{os.path.basename(cmds[0])}' was auto-launched (NOT agent's work).")
+        elif cfg.get("type") == "open":
+            path = cfg.get("parameters", {}).get("path", "")
+            preconfig_lines.append(f"  - '{path}' was auto-opened (NOT agent's work).")
+    preconfig_section = (
+        "PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n" +
+        "\n".join(preconfig_lines)
+    ) if preconfig_lines else ""
+
+    expected_steps = metadata.get("steps", "")
+    expected_section = (
+        f"EXPECTED STEPS (reference only):\n{expected_steps}"
+    ) if expected_steps else ""
+
+    final_name = name_list[-1]
+    img_list_str = ", ".join(name_list)
+
+    prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.
+
+Task: {instruction}
+
+{preconfig_section}
+
+{expected_section}
+
+You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}
+
+════════════════════════════════════════════════════
+STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
+The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
+Look at this image carefully NOW before anything else. Ask yourself:
+  "Does this final screenshot show the task is complete?"
+Only after answering that, use earlier screenshots to understand HOW the agent got there.
+════════════════════════════════════════════════════
+
+SCORING RULES:
+1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
+2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
+3. Require VISIBLE evidence in a specific screenshot for each step.
+4. If the final screenshot shows the task is done, score high even if earlier steps were messy.
+
+SCORE GUIDE (0–10):
+- 0: No agent progress; only pre-configured state visible.
+- 1–3: Minor actions taken, far from goal.
+- 4–6: Meaningful progress, roughly half done.
+- 7–8: Most steps done, minor issues.
+- 9: Essentially complete, cosmetic differences only.
+- 10: Fully and perfectly complete with clear visual proof in the final screenshot.
+
+Respond with ONLY valid JSON (no extra text):
+
+{{
+  "final_screenshot": "{final_name}",
+  "final_screenshot_description": "Describe exactly what you see in {final_name}",
+  "task_complete_in_final": true/false,
+  "steps_analysis": [
+    {{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
+  ],
+  "final_completion": "True/False",
+  "score": 0-10
+}}"""
+    return prompt
+
+
+# ── parse response ─────────────────────────────────────────────────────────────
+
+def _parse_response(text: str) -> dict:
+    text = text.strip()
+    m = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
+    if m:
+        text = m.group(1)
+    else:
+        text = re.sub(r'^```(?:json)?\s*', '', text)
+        text = re.sub(r'\s*```$', '', text)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        # try extracting bare JSON object
+        m2 = re.search(r'\{.*\}', text, re.DOTALL)
+        if m2:
+            try:
+                return json.loads(m2.group(0))
+            except Exception:
+                pass
+        logger.error(f"Could not parse JSON from:\n{text[:500]}")
+        return {"score": 0, "final_completion": "False", "steps_analysis": []}
+
+
+# ── main ──────────────────────────────────────────────────────────────────────
+
+def main():
+    parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
+    parser.add_argument("--task", required=True, help="Path to task JSON file")
+    parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
+    parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
+    parser.add_argument("--max_images", type=int, default=MAX_IMAGES, help="Max screenshots to send (default: 10)")
+    parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
+    parser.add_argument("--output", default=None, help="Output JSON path (default: <result_dir>/reeval_result.json)")
+    args = parser.parse_args()
+
+    # Load task
+    with open(args.task, "r", encoding="utf-8") as f:
+        task = json.load(f)
+
+    instruction = task.get("instruction", "")
+    config = task.get("config", [])
+    metadata = task.get("metadata", {})
+
+    logger.info(f"Task: {task.get('id', '?')}")
+    logger.info(f"Instruction: {instruction}")
+
+    # Load screenshots
+    images_b64, name_list = _load_screenshots(
+        args.result_dir,
+        max_images=args.max_images,
+        compress=not args.no_compress,
+    )
+
+    # Build prompt
+    prompt = _build_prompt(instruction, config, metadata, name_list)
+
+    # Call LLM
+    logger.info(f"Calling {args.model} with {len(images_b64)} images...")
+    raw = _call_llm(args.model, prompt, images_b64)
+    logger.info(f"Raw response:\n{raw}")
+
+    # Parse
+    result = _parse_response(raw)
+    score_raw = float(result.get("score", 0))
+    score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)
+
+    # Add metadata to output
+    result["_task_id"] = task.get("id", "")
+    result["_model"] = args.model
+    result["_frames_used"] = name_list
+    result["_score_normalized"] = score_norm
+
+    # Save
+    out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
+    with open(out_path, "w", encoding="utf-8") as f:
+        json.dump(result, f, ensure_ascii=False, indent=2)
+
+    # Print summary
+    print("\n" + "="*60)
+    print(f"Task:             {task.get('id', '?')}")
+    print(f"Frames used:      {name_list}")
+    print(f"Final screenshot: {name_list[-1]}")
+    if "final_screenshot_description" in result:
+        print(f"Final state desc: {result['final_screenshot_description']}")
+    print(f"Task complete:    {result.get('task_complete_in_final', '?')}")
+    print(f"final_completion: {result.get('final_completion', '?')}")
+    print(f"Score (0-10):     {score_raw}  →  normalized: {score_norm}")
+    print(f"Result saved to:  {out_path}")
+    print("="*60 + "\n")
+
+    if result.get("steps_analysis"):
+        print("Steps analysis:")
+        for s in result["steps_analysis"]:
+            status_icon = "✓" if s.get("status") == "Success" else "✗"
+            print(f"  [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")
+
+
+if __name__ == "__main__":
+    main()