"""
Re-evaluation script for vllm_eval tasks.

Usage:
    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10

Credentials are read from the environment (or a ``.env`` file):
    OPENAI_API_KEY / OPENAI_BASE_URL        for gpt-* and gemini-* models
    ANTHROPIC_API_KEY / ANTHROPIC_BASE_URL  for claude-* models
"""

import argparse
import json
import os
import sys
import glob
import base64
import re
import logging
from io import BytesIO

try:
    from PIL import Image
except ImportError:  # Pillow is only needed when compression is enabled
    Image = None

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("reeval")

MAX_IMAGES = 10  # hard API limit

# Defaults matching run_proxmox.sh.
# SECURITY: never embed an API key in source control. The key must be supplied
# through the OPENAI_API_KEY environment variable or a .env file.
_DEFAULT_API_KEY = None
_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
_DEFAULT_MODEL = "gemini-3.1-pro-preview"

# Matches the "step_<n>" prefix of a screenshot file name.
_STEP_RE = re.compile(r"step_(\d+)")

# File signature used to label image payloads with the correct MIME type.
_PNG_MAGIC = b"\x89PNG\r\n\x1a\n"


# ── image helpers ──────────────────────────────────────────────────────────────

def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
    """Re-encode a base64 image as a downscaled JPEG.

    Transparent images are flattened onto a white background, the longest side
    is limited to ``max_size`` pixels, and the result is saved as JPEG with the
    given ``quality``.

    This is deliberately best-effort: on any failure (including Pillow not
    being installed) a warning is logged and ``img_b64`` is returned unchanged.
    """
    if Image is None:
        logger.warning("Compression failed: Pillow is not installed")
        return img_b64
    try:
        img = Image.open(BytesIO(base64.b64decode(img_b64)))
        if img.mode in ("RGBA", "LA", "P"):
            if img.mode == "P":
                img = img.convert("RGBA")
            # Flatten alpha onto white so the JPEG has no black background.
            bg = Image.new("RGB", img.size, (255, 255, 255))
            bg.paste(img, mask=img.split()[-1])
            img = bg
        elif img.mode not in ("RGB", "L"):
            # Modes such as CMYK or I;16 are converted before the JPEG save.
            img = img.convert("RGB")
        if max(img.size) > max_size:
            ratio = max_size / max(img.size)
            # Clamp to 1px so extreme aspect ratios never yield a 0-sized side.
            new_size = tuple(max(1, int(d * ratio)) for d in img.size)
            img = img.resize(new_size, Image.Resampling.LANCZOS)
        buf = BytesIO()
        img.save(buf, format="JPEG", quality=quality, optimize=True)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:  # best-effort: fall back to the original bytes
        logger.warning("Compression failed: %s", e)
        return img_b64


def _detect_media_type(img_b64: str) -> str:
    """Return the MIME type of a base64 image, judged by its file signature.

    Only PNG is detected explicitly; anything else (including undecodable
    input) is reported as ``image/jpeg``, which is what the compressor emits.
    """
    try:
        head = base64.b64decode(img_b64[:16])  # 16 chars -> 12 bytes
    except ValueError:  # binascii.Error is a ValueError subclass
        return "image/jpeg"
    if head.startswith(_PNG_MAGIC):
        return "image/png"
    return "image/jpeg"


def _select_frame_indices(n: int, max_images: int) -> list:
    """Pick which of ``n`` chronological frames to keep.

    Returns sorted, unique indices. All frames are kept when ``n`` fits within
    ``max_images``; otherwise the first and last frames are always kept and the
    remaining slots are filled with evenly spaced middle frames. With
    ``max_images == 1`` only the last frame is kept.

    Raises:
        ValueError: if ``max_images`` is smaller than 1.
    """
    if max_images < 1:
        raise ValueError(f"max_images must be >= 1, got {max_images}")
    if n <= max_images:
        return list(range(max(n, 0)))
    if max_images == 1:
        return [n - 1]
    middle_slots = max_images - 2
    step = (n - 2) / (middle_slots + 1)
    middle = [int(round(i * step)) for i in range(1, middle_slots + 1)]
    return sorted({0, *middle, n - 1})


def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
    """
    Load up to max_images screenshots, always keeping FIRST and LAST,
    sampling middle frames evenly.

    Files are ordered by the numeric step index in their file name; files that
    match ``step_*.png`` but carry no numeric index are skipped with a warning.

    Returns (b64_list, name_list).

    Raises:
        FileNotFoundError: if no usable ``step_<n>*.png`` file exists.
        ValueError: if ``max_images`` is smaller than 1.
    """
    # Escape the directory so characters like "[" are not treated as wildcards.
    pattern = os.path.join(glob.escape(result_dir), "step_*.png")
    frames = []
    for path in glob.glob(pattern):
        # Match on the base name only: a parent directory called "step_3_x"
        # must not influence the ordering.
        m = _STEP_RE.match(os.path.basename(path))
        if m is None:
            logger.warning("Skipping file without a numeric step index: %s", path)
            continue
        frames.append((int(m.group(1)), path, m.group(0)))
    if not frames:
        raise FileNotFoundError(f"No step_*.png found in {result_dir}")
    frames.sort()

    n = len(frames)
    logger.info("Found %d screenshots in %s", n, result_dir)
    selected = [frames[i] for i in _select_frame_indices(n, max_images)]

    b64_list, name_list = [], []
    for _, fp, name in selected:
        with open(fp, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()
        if compress:
            b64 = _compress_image(b64)
        b64_list.append(b64)
        name_list.append(name)

    logger.info("Selected %d frames: %s", len(name_list), name_list)
    return b64_list, name_list


# ── LLM call ──────────────────────────────────────────────────────────────────

def _build_openai_content(prompt: str, images_b64: list) -> list:
    """Build an OpenAI chat ``content`` list: the prompt followed by the images."""
    content = [{"type": "text", "text": prompt}]
    for b64 in images_b64:
        url = f"data:{_detect_media_type(b64)};base64,{b64}"
        content.append({"type": "image_url", "image_url": {"url": url}})
    return content


def _build_anthropic_content(prompt: str, images_b64: list) -> list:
    """Build an Anthropic ``content`` list: the prompt followed by the images."""
    content = [{"type": "text", "text": prompt}]
    for b64 in images_b64:
        content.append({
            "type": "image",
            "source": {
                "type": "base64",
                "media_type": _detect_media_type(b64),
                "data": b64,
            },
        })
    return content


def _call_llm(model: str, prompt: str, images_b64: list) -> str:
    """Send the prompt and screenshots to ``model`` and return its text reply.

    ``gpt*`` and ``gemini*`` models go through the OpenAI-compatible endpoint;
    ``claude*`` models go through the Anthropic SDK.

    Raises:
        ValueError: if the model prefix is not recognised.
        RuntimeError: if OPENAI_API_KEY is required but not configured.
    """
    # Fail fast on a bad model name, before importing any optional SDK.
    if not model.startswith(("gpt", "gemini", "claude")):
        raise ValueError(f"Unknown model prefix: {model}")

    from dotenv import load_dotenv
    load_dotenv()

    # gpt-* and gemini-* both go through the OpenAI-compatible proxy (vip.apiyi.com)
    if model.startswith(("gpt", "gemini")):
        api_key = os.getenv("OPENAI_API_KEY") or _DEFAULT_API_KEY
        if not api_key:
            raise RuntimeError(
                "OPENAI_API_KEY is not set; export it or add it to a .env file."
            )
        base_url = os.getenv("OPENAI_BASE_URL") or _DEFAULT_BASE_URL

        from openai import OpenAI
        client = OpenAI(api_key=api_key, base_url=base_url)
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": _build_openai_content(prompt, images_b64)}],
            temperature=0.2,
            max_tokens=4096,
        )
        # The API may return no content (e.g. a filtered reply).
        return resp.choices[0].message.content or ""

    from anthropic import Anthropic
    client = Anthropic(
        base_url=os.getenv("ANTHROPIC_BASE_URL"),
        api_key=os.getenv("ANTHROPIC_API_KEY"),
    )
    resp = client.messages.create(
        model=model,
        messages=[{"role": "user", "content": _build_anthropic_content(prompt, images_b64)}],
        temperature=0.2,
        max_tokens=4096,
    )
    # Concatenate text blocks rather than assuming the first block is text.
    return "".join(
        block.text for block in resp.content if getattr(block, "type", None) == "text"
    )


# ── prompt builder ─────────────────────────────────────────────────────────────

def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
    """Build the evaluation prompt.

    Args:
        instruction: the task text shown to the agent.
        config: task setup entries; ``launch`` and ``open`` entries are listed
            as pre-configured state that must not be credited to the agent.
        metadata: task metadata; ``steps`` (if present) is included as a
            reference of the expected steps.
        name_list: names of the attached screenshots in chronological order;
            the last one is treated as the final state.

    Raises:
        ValueError: if ``name_list`` is empty.
    """
    if not name_list:
        raise ValueError("name_list must contain at least one screenshot name")

    # Pre-config section
    preconfig_lines = []
    for cfg in config or []:
        params = cfg.get("parameters") or {}
        if cfg.get("type") == "launch":
            cmds = params.get("command", [])
            if isinstance(cmds, str):
                # A command given as one string: the program is its first token.
                cmds = cmds.split()
            if cmds:
                preconfig_lines.append(
                    f" - '{os.path.basename(cmds[0])}' was auto-launched (NOT agent's work)."
                )
        elif cfg.get("type") == "open":
            path = params.get("path", "")
            preconfig_lines.append(f" - '{path}' was auto-opened (NOT agent's work).")

    preconfig_section = (
        "PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n"
        + "\n".join(preconfig_lines)
    ) if preconfig_lines else ""

    expected_steps = (metadata or {}).get("steps", "")
    expected_section = (
        f"EXPECTED STEPS (reference only):\n{expected_steps}"
    ) if expected_steps else ""

    final_name = name_list[-1]
    img_list_str = ", ".join(name_list)

    prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.

Task: {instruction}

{preconfig_section}

{expected_section}

You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}

════════════════════════════════════════════════════
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
Look at this image carefully NOW before anything else.
Ask yourself: "Does this final screenshot show the task is complete?"
Only after answering that, use earlier screenshots to understand HOW the agent got there.
════════════════════════════════════════════════════

SCORING RULES:
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
3. Require VISIBLE evidence in a specific screenshot for each step.
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.

SCORE GUIDE (0–10):
- 0: No agent progress; only pre-configured state visible.
- 1–3: Minor actions taken, far from goal.
- 4–6: Meaningful progress, roughly half done.
- 7–8: Most steps done, minor issues.
- 9: Essentially complete, cosmetic differences only.
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.

Respond with ONLY valid JSON (no extra text):
{{
  "final_screenshot": "{final_name}",
  "final_screenshot_description": "Describe exactly what you see in {final_name}",
  "task_complete_in_final": true/false,
  "steps_analysis": [
    {{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
  ],
  "final_completion": "True/False",
  "score": 0-10
}}"""
    return prompt


# ── parse response ─────────────────────────────────────────────────────────────

def _parse_response(text: str) -> dict:
    """Extract the JSON object from an LLM reply.

    Tries, in order: a fenced ```json block, the text with stray fences
    stripped, and the outermost ``{...}`` span. Only a JSON *object* is
    accepted. If nothing parses (or ``text`` is empty/None) an error is logged
    and a zero-score fallback dict is returned.
    """
    fallback = {"score": 0, "final_completion": "False", "steps_analysis": []}
    if not isinstance(text, str) or not text.strip():
        logger.error("Could not parse JSON from:\n%s", text)
        return fallback

    text = text.strip()
    candidates = []
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", text, re.DOTALL)
    if fenced:
        candidates.append(fenced.group(1))
    else:
        unfenced = re.sub(r"^```(?:json)?\s*", "", text)
        candidates.append(re.sub(r"\s*```$", "", unfenced))
    # Last resort: a bare JSON object surrounded by prose.
    bare = re.search(r"\{.*\}", text, re.DOTALL)
    if bare:
        candidates.append(bare.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict):
            return parsed

    logger.error("Could not parse JSON from:\n%s", text[:500])
    return fallback


def _coerce_score(value) -> float:
    """Convert a model-reported score to a float, tolerating sloppy output.

    Numbers and numeric strings convert directly; strings such as ``"8/10"``
    yield their first number; anything unusable (None, NaN, free text) is 0.0.
    """
    try:
        score = float(value)
    except (TypeError, ValueError):
        found = re.search(r"-?\d+(?:\.\d+)?", value) if isinstance(value, str) else None
        if found is None:
            logger.warning("Unusable score %r; treating it as 0", value)
            return 0.0
        score = float(found.group(0))
    if score != score:  # NaN would otherwise normalise to 1.0
        return 0.0
    return score


# ── main ──────────────────────────────────────────────────────────────────────

def main(argv=None):
    """Command-line entry point.

    Loads the task and screenshots, queries the evaluation model, writes the
    annotated result as JSON and prints a summary.

    Args:
        argv: argument list to parse; defaults to ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
    parser.add_argument("--task", required=True, help="Path to task JSON file")
    parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
    parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
    parser.add_argument("--max_images", type=int, default=MAX_IMAGES, help="Max screenshots to send (default: 10)")
    parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
    parser.add_argument("--output", default=None, help="Output JSON path (default: <result_dir>/reeval_result.json)")
    args = parser.parse_args(argv)
    if args.max_images < 1:
        parser.error("--max_images must be at least 1")

    # Load task
    with open(args.task, "r", encoding="utf-8") as f:
        task = json.load(f)

    instruction = task.get("instruction", "")
    config = task.get("config") or []
    metadata = task.get("metadata") or {}
    logger.info("Task: %s", task.get("id", "?"))
    logger.info("Instruction: %s", instruction)

    # Load screenshots
    images_b64, name_list = _load_screenshots(
        args.result_dir,
        max_images=args.max_images,
        compress=not args.no_compress,
    )

    # Build prompt
    prompt = _build_prompt(instruction, config, metadata, name_list)

    # Call LLM
    logger.info("Calling %s with %d images...", args.model, len(images_b64))
    raw = _call_llm(args.model, prompt, images_b64)
    logger.info("Raw response:\n%s", raw)

    # Parse
    result = _parse_response(raw)
    score_raw = _coerce_score(result.get("score", 0))
    score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)

    # Add metadata to output
    result["_task_id"] = task.get("id", "")
    result["_model"] = args.model
    result["_frames_used"] = name_list
    result["_score_normalized"] = score_norm

    # Save
    out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
    os.makedirs(os.path.dirname(out_path) or ".", exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Print summary
    print("\n" + "=" * 60)
    print(f"Task: {task.get('id', '?')}")
    print(f"Frames used: {name_list}")
    print(f"Final screenshot: {name_list[-1]}")
    if "final_screenshot_description" in result:
        print(f"Final state desc: {result['final_screenshot_description']}")
    print(f"Task complete: {result.get('task_complete_in_final', '?')}")
    print(f"final_completion: {result.get('final_completion', '?')}")
    print(f"Score (0-10): {score_raw} → normalized: {score_norm}")
    print(f"Result saved to: {out_path}")
    print("=" * 60 + "\n")

    steps = result.get("steps_analysis")
    if steps and isinstance(steps, list):
        print("Steps analysis:")
        for s in steps:
            if not isinstance(s, dict):
                # Models occasionally emit plain strings here; show them as-is.
                print(f" [?] {s}")
                continue
            status_icon = "✓" if s.get("status") == "Success" else "✗"
            print(f" [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")


if __name__ == "__main__":
    main()