"""
Batch re-evaluation based on test_final.json.

Runs reeval.py once per task listed in evaluation_examples/test_final.json,
skipping tasks that have no result directory, no task JSON, no screenshots,
or (unless --force is given) an existing reeval_result.json.

Usage:
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot/gpt-5.4
    python3 batch_reeval.py --results_dir /Volumes/Castor/课题/results_baseline_50steps/pyautogui/screenshot_a11y_tree/gpt-5.4
    python3 batch_reeval.py --results_dir <results_dir> --force   # re-run even if already done
"""

import argparse
import json
import os
import subprocess
import sys
import glob

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
TEST_FINAL = os.path.join(SCRIPT_DIR, "evaluation_examples", "test_final.json")
TASK_CONFIG_DIR = os.path.join(SCRIPT_DIR, "evaluation_examples", "examples")
REEVAL = os.path.join(SCRIPT_DIR, "reeval.py")


def _collect_tasks(test_final, results_dir, force, task_config_dir=TASK_CONFIG_DIR):
    """Split the tasks of ``test_final`` into runnable ones and skipped ones.

    Args:
        test_final: Mapping of software name -> list of task ids.
        results_dir: Root directory holding ``<software>/<task_id>/`` results.
        force: When true, tasks that already have a reeval_result.json are
            re-evaluated instead of skipped.
        task_config_dir: Root directory holding ``<software>/<task_id>.json``.

    Returns:
        ``(tasks, skipped)`` where ``tasks`` is a list of
        ``(task_json, result_dir, software, task_id)`` tuples and ``skipped``
        is a list of human-readable reasons, one per skipped task.
    """
    tasks = []
    skipped = []
    for sw, task_ids in test_final.items():
        for task_id in task_ids:
            result_dir = os.path.join(results_dir, sw, task_id)
            task_json = os.path.join(task_config_dir, sw, f"{task_id}.json")
            reeval_out = os.path.join(result_dir, "reeval_result.json")

            # Guard clauses: the first missing prerequisite decides the reason.
            if not os.path.isdir(result_dir):
                skipped.append(f" NO result_dir: {sw}/{task_id}")
            elif not os.path.exists(task_json):
                skipped.append(f" NO task JSON: {sw}/{task_id}")
            elif not glob.glob(os.path.join(result_dir, "step_*.png")):
                skipped.append(f" NO screenshots: {sw}/{task_id}")
            elif os.path.exists(reeval_out) and not force:
                skipped.append(f" already done: {sw}/{task_id}")
            else:
                tasks.append((task_json, result_dir, sw, task_id))
    return tasks, skipped


def main():
    """Parse CLI arguments and re-evaluate every eligible task via reeval.py."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--results_dir", required=True,
                        help="e.g. /Volumes/.../screenshot/gpt-5.4")
    parser.add_argument("--force", action="store_true",
                        help="Re-evaluate even if reeval_result.json already exists")
    parser.add_argument("--model", default=None,
                        help="Eval model forwarded to reeval.py (default: reeval.py's own default)")
    args = parser.parse_args()

    # Explicit encoding: the locale default is not UTF-8 on every platform.
    with open(TEST_FINAL, encoding="utf-8") as f:
        test_final = json.load(f)

    tasks, skipped = _collect_tasks(test_final, args.results_dir, args.force)

    print(f"Tasks to evaluate: {len(tasks)}")
    if skipped:
        print(f"Skipped ({len(skipped)}):")
        for s in skipped:
            print(s)
        print()

    ok, failed = 0, []
    for i, (task_json, result_dir, sw, task_id) in enumerate(tasks, start=1):
        print(f"[{i}/{len(tasks)}] {sw}/{task_id}")
        cmd = [sys.executable, REEVAL, "--task", task_json, "--result_dir", result_dir]
        if args.model:
            cmd += ["--model", args.model]
        # One child process per task so a crash in one evaluation cannot
        # abort the rest of the batch.
        r = subprocess.run(cmd, capture_output=True, text=True)
        if r.returncode != 0:
            # Fall back to stdout when the child died without writing stderr.
            detail = r.stderr or r.stdout or ""
            print(f" ERROR: {detail[-300:]}")
            failed.append(f"{sw}/{task_id}")
        else:
            # Echo only the summary lines printed by reeval.py.
            for line in r.stdout.splitlines():
                if "normalized" in line or "Task complete" in line or "Score" in line:
                    print(f" {line.strip()}")
            ok += 1

    print(f"\nDone: {ok} succeeded, {len(failed)} failed")
    if failed:
        print("Failed tasks:", failed)


if __name__ == "__main__":
    main()
"top_p": options.get("top_p", 1.0) } @@ -493,51 +498,49 @@ IMPORTANT: Only reference screenshots from the list above. Do NOT reference any else: img_info = "\nNo screenshots were provided." - prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots. + final_name = screenshot_filenames[-1] if screenshot_filenames else "N/A" -Task Instruction: {instruction} + prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks. + +Task: {instruction} {preconfig_section} {expected_steps_section} {img_info} -Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context. +════════════════════════════════════════════════════ +STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST +The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session. +Look at this image carefully NOW before anything else. Ask yourself: + "Does this final screenshot show the task is complete?" +Only after answering that, use earlier screenshots to understand HOW the agent got there. +════════════════════════════════════════════════════ -CRITICAL SCORING RULES: -1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points. -2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress. -3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0. -4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress. -5. 
Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal. +SCORING RULES: +1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image). +2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup). +3. Require VISIBLE evidence in a specific screenshot for each step. +4. If the final screenshot shows the task is done, score high even if earlier steps were messy. -SCORING GUIDE (0-10): -- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0. -- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal. -- 3-4: Some initial steps toward the task have been taken but the task is far from complete. -- 5-6: Significant progress - about half the required steps are completed with visible evidence. -- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues. -- 9: The task is essentially complete with very minor cosmetic differences. -- 10: The task is perfectly and completely finished with clear evidence in the final screenshot. +SCORE GUIDE (0-10): +- 0: No agent progress; only pre-configured state visible. +- 1-3: Minor actions taken, far from goal. +- 4-6: Meaningful progress, roughly half done. +- 7-8: Most steps done, minor issues. +- 9: Essentially complete, cosmetic differences only. +- 10: Fully and perfectly complete with clear visual proof in the final screenshot. -IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). 
Use the following exact format: +Respond with ONLY valid JSON (no extra text): {{ + "final_screenshot": "{final_name}", + "final_screenshot_description": "Describe exactly what you see in {final_name}", + "task_complete_in_final": true/false, "steps_analysis": [ - {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}}, - {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}} + {{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}} ], "final_completion": "True/False", "score": 0-10 -}} - -Where: -- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps. -- "status": Either "Success" or "Fail" for each step -- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png") -- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence -- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise -- "score": Integer from 0 to 10, following the strict scoring guide above - -Remember: Return ONLY the JSON object, no additional text. 
Be STRICT - when in doubt, score LOWER.""" +}}""" try: result = llm.generate_with_images( diff --git a/evaluation_examples/examples/origin/Origin_User_Guide_2025b_E_task1.json b/evaluation_examples/examples/origin/Origin_User_Guide_2025b_E_task1.json deleted file mode 100644 index a04e242..0000000 --- a/evaluation_examples/examples/origin/Origin_User_Guide_2025b_E_task1.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "id": "Origin_User_Guide_2025b_E_task1", - "snapshot": "origin", - "instruction": "在 Origin 中通过 Data → Connect to File 导入一个本地 Excel 文件 example.xlsx", - "source": "custom", - "config": [ - { - "type": "upload_file", - "parameters": { - "files": [ - { - "local_path": "evaluation_examples/data/origin/example.xlsx", - "path": "C:\\Users\\user\\Desktop\\example.xlsx" - } - ] - } - }, - { - "type": "launch", - "parameters": { - "command": [ - "C:\\Program Files\\OriginLab\\Origin2025b\\Origin64.exe", - "C:\\Users\\user\\Desktop\\example.xlsx" - ] - } - }, - { - "type": "sleep", - "parameters": { - "seconds": 5 - } - } - ], - "trajectory": "trajectories/", - "related_apps": [ - "origin" - ], - "evaluator": { - "postconfig": [ - { - "type": "sleep", - "parameters": { - "seconds": 3 - } - } - ], - "func": "vllm_eval" - }, - "proxy": false, - "fixed_ip": false, - "possibility_of_env_change": "low", - "metadata": { - "input_files": [ - "example.xlsx" - ], - "steps": "1. 单击顶部主菜单栏中的 \"Data\" 菜单。\n2. 在展开的下拉菜单中,将鼠标悬停或单击 \"Connect to File\" 菜单项以展开子菜单。\n3. 在展开的子菜单中,单击选中 \"Excel...\" 选项。\n4. 在弹出的文件选择对话框中,单击选中文件名输入框将光标定位至此。\n5. 在文件名输入框中,输入文字 \"example.xlsx\"。\n6. 单击对话框右下角的 \"Open\"(或\"打开\")按钮。 \n7. 单击新弹出对话框中的 \"OK\" 按钮。", - "steps_original": "1. 在 Origin 的主菜单中选择 Data → Connect to File。\n2. 点击 Connect to File 菜单中的按钮。\n3. 选择文件 example.xlsx 并点击 Open。\n4. 
"""
Re-evaluation script for vllm_eval tasks.

Usage:
    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder
    python reeval.py --task path/to/task.json --result_dir path/to/screenshot/folder --model gpt-5.4 --max_images 10

Credentials are read from the environment (optionally through a .env file):
OPENAI_API_KEY / OPENAI_BASE_URL for gpt-* and gemini-* models,
ANTHROPIC_API_KEY / ANTHROPIC_BASE_URL for claude-* models.
"""

import argparse
import json
import os
import sys
import glob
import base64
import math
import ntpath
import re
import logging
from io import BytesIO

logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
logger = logging.getLogger("reeval")

MAX_IMAGES = 10 # hard API limit

# Defaults matching run_proxmox.sh.
# NOTE: the API key is deliberately NOT defaulted here. A secret committed to
# source control must be treated as leaked; supply OPENAI_API_KEY through the
# environment or a .env file instead.
_DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
_DEFAULT_MODEL = "gemini-3.1-pro-preview"

_STEP_RE = re.compile(r"step_(\d+)")
_NUMBER_RE = re.compile(r"-?\d+(?:\.\d+)?")


# ── image helpers ──────────────────────────────────────────────────────────────

def _compress_image(img_b64: str, max_size: int = 1024, quality: int = 85) -> str:
    """Re-encode a base64 image as a (possibly downscaled) base64 JPEG.

    Transparent images are flattened onto a white background, and the longest
    side is limited to ``max_size`` pixels. If the image cannot be processed,
    a warning is logged and the input is returned unchanged.
    """
    # Imported lazily, like the API clients in _call_llm, so that --no_compress
    # runs do not need Pillow. Kept outside the try block: a missing dependency
    # should fail loudly instead of silently disabling compression.
    from PIL import Image

    try:
        img_data = base64.b64decode(img_b64)
        img = Image.open(BytesIO(img_data))
        if img.mode in ("RGBA", "LA", "P"):
            # JPEG has no alpha channel: composite onto white first.
            bg = Image.new("RGB", img.size, (255, 255, 255))
            if img.mode == "P":
                img = img.convert("RGBA")
            bg.paste(img, mask=img.split()[-1] if img.mode in ("RGBA", "LA") else None)
            img = bg
        if max(img.size) > max_size:
            ratio = max_size / max(img.size)
            img = img.resize(tuple(int(d * ratio) for d in img.size), Image.Resampling.LANCZOS)
        buf = BytesIO()
        img.save(buf, format="JPEG", quality=quality, optimize=True)
        return base64.b64encode(buf.getvalue()).decode()
    except Exception as e:
        logger.warning(f"Compression failed: {e}")
        return img_b64


def _sniff_media_type(img_b64: str) -> str:
    """Return the MIME type of a base64 image based on its magic bytes.

    Uncompressed screenshots are PNG while compressed ones are JPEG, so the
    declared type has to follow the actual payload. Unknown payloads keep the
    historic ``image/jpeg`` label.
    """
    try:
        head = base64.b64decode(img_b64[:16])  # 16 base64 chars -> 12 bytes
    except ValueError:
        return "image/jpeg"
    if head.startswith(b"\x89PNG\r\n\x1a\n"):
        return "image/png"
    return "image/jpeg"


def _select_indices(n: int, max_images: int) -> list:
    """Pick at most ``max_images`` indices out of ``range(n)``.

    The first and last index are always kept and the middle ones are spaced
    evenly. With ``max_images == 1`` only the last index (the final state) is
    kept.
    """
    if n <= max_images:
        return list(range(n))
    if max_images == 1:
        # Only one slot: the final screenshot is the one the prompt relies on.
        return [n - 1]
    middle_slots = max_images - 2
    step = (n - 2) / (middle_slots + 1)
    indices = [0] + [int(round(i * step)) for i in range(1, middle_slots + 1)] + [n - 1]
    return sorted(set(indices))


def _load_screenshots(result_dir: str, max_images: int = MAX_IMAGES, compress: bool = True):
    """
    Load up to max_images screenshots, always keeping FIRST and LAST,
    sampling middle frames evenly. Returns (b64_list, name_list).

    Files are ordered numerically by the step number in their file name;
    step_*.png files without a numeric step are skipped with a warning.

    Raises:
        ValueError: if max_images is smaller than 1.
        FileNotFoundError: if no usable step_*.png exists in result_dir.
    """
    if max_images < 1:
        raise ValueError(f"max_images must be >= 1, got {max_images}")

    pattern = os.path.join(result_dir, "step_*.png")
    numbered = []
    for fp in glob.glob(pattern):
        # Match on the file name only, so a directory called e.g. "step_3"
        # cannot influence the ordering.
        m = _STEP_RE.search(os.path.basename(fp))
        if m is None:
            logger.warning(f"Skipping screenshot without step number: {fp}")
            continue
        numbered.append((int(m.group(1)), fp))
    files = [fp for _, fp in sorted(numbered)]

    if not files:
        raise FileNotFoundError(f"No step_*.png found in {result_dir}")

    n = len(files)
    logger.info(f"Found {n} screenshots in {result_dir}")

    selected = [files[i] for i in _select_indices(n, max_images)]

    b64_list, name_list = [], []
    for fp in selected:
        with open(fp, "rb") as f:
            b64 = base64.b64encode(f.read()).decode()
        if compress:
            b64 = _compress_image(b64)
        b64_list.append(b64)
        m = re.match(r"(step_\d+)", os.path.basename(fp))
        name_list.append(m.group(1) if m else os.path.basename(fp))

    logger.info(f"Selected {len(name_list)} frames: {name_list}")
    return b64_list, name_list


# ── LLM call ──────────────────────────────────────────────────────────────────

def _call_llm(model: str, prompt: str, images_b64: list) -> str:
    """Send the prompt plus images to ``model`` and return the text reply.

    Raises:
        RuntimeError: if OPENAI_API_KEY is missing for a gpt-*/gemini-* model.
        ValueError: if the model name has an unsupported prefix.
    """
    from dotenv import load_dotenv
    load_dotenv()

    # gpt-* and gemini-* both go through the OpenAI-compatible proxy (vip.apiyi.com)
    if model.startswith(("gpt", "gemini")):
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError(
                "OPENAI_API_KEY is not set; export it or add it to a .env file."
            )
        base_url = os.getenv("OPENAI_BASE_URL", _DEFAULT_BASE_URL)

        from openai import OpenAI
        client = OpenAI(api_key=api_key, base_url=base_url)
        content = [{"type": "text", "text": prompt}]
        for b64 in images_b64:
            url = f"data:{_sniff_media_type(b64)};base64,{b64}"
            content.append({"type": "image_url", "image_url": {"url": url}})
        resp = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            temperature=0.2,
            max_tokens=4096,
        )
        # content may be None (e.g. filtered reply); normalise to a string.
        return resp.choices[0].message.content or ""

    elif model.startswith("claude"):
        from anthropic import Anthropic
        client = Anthropic(
            base_url=os.getenv("ANTHROPIC_BASE_URL"),
            api_key=os.getenv("ANTHROPIC_API_KEY"),
        )
        content = [{"type": "text", "text": prompt}]
        for b64 in images_b64:
            source = {"type": "base64", "media_type": _sniff_media_type(b64), "data": b64}
            content.append({"type": "image", "source": source})
        resp = client.messages.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            temperature=0.2,
            max_tokens=4096,
        )
        return resp.content[0].text

    else:
        raise ValueError(f"Unknown model prefix: {model}")


# ── prompt builder ─────────────────────────────────────────────────────────────

def _build_prompt(instruction: str, config: list, metadata: dict, name_list: list) -> str:
    """Build the evaluation prompt.

    Args:
        instruction: The task instruction given to the agent.
        config: Task "config" entries; "launch" and "open" entries are listed
            as pre-configured state that must not be credited to the agent.
        metadata: Task metadata; its "steps" text is quoted as a reference.
        name_list: Screenshot names in chronological order; the last one is
            presented as the final state.
    """
    # Pre-config section
    preconfig_lines = []
    for cfg in config:
        if cfg.get("type") == "launch":
            cmds = cfg.get("parameters", {}).get("command", [])
            if cmds:
                # ntpath splits on both "\\" and "/", so Windows launch
                # commands are shortened correctly on a POSIX host too.
                preconfig_lines.append(f" - '{ntpath.basename(cmds[0])}' was auto-launched (NOT agent's work).")
        elif cfg.get("type") == "open":
            path = cfg.get("parameters", {}).get("path", "")
            preconfig_lines.append(f" - '{path}' was auto-opened (NOT agent's work).")
    preconfig_section = (
        "PRE-CONFIGURED ENVIRONMENT (done BEFORE agent started — do NOT credit these):\n"
        + "\n".join(preconfig_lines)
    ) if preconfig_lines else ""

    expected_steps = metadata.get("steps", "")
    expected_section = (
        f"EXPECTED STEPS (reference only):\n{expected_steps}"
    ) if expected_steps else ""

    final_name = name_list[-1] if name_list else "N/A"
    img_list_str = ", ".join(name_list)

    prompt = f"""You are a STRICT evaluator for desktop GUI agent tasks.

Task: {instruction}

{preconfig_section}

{expected_section}

You are provided with {len(name_list)} screenshots in chronological order: {img_list_str}

════════════════════════════════════════════════════
STEP 1 — EXAMINE THE FINAL SCREENSHOT FIRST
The LAST image provided is "{final_name}" — this is the FINAL STATE of the agent's session.
Look at this image carefully NOW before anything else. Ask yourself:
 "Does this final screenshot show the task is complete?"
Only after answering that, use earlier screenshots to understand HOW the agent got there.
════════════════════════════════════════════════════

SCORING RULES:
1. Base your final_completion and score PRIMARILY on "{final_name}" (the last image).
2. Credit ONLY actions performed BY THE AGENT (not pre-configured setup).
3. Require VISIBLE evidence in a specific screenshot for each step.
4. If the final screenshot shows the task is done, score high even if earlier steps were messy.

SCORE GUIDE (0–10):
- 0: No agent progress; only pre-configured state visible.
- 1–3: Minor actions taken, far from goal.
- 4–6: Meaningful progress, roughly half done.
- 7–8: Most steps done, minor issues.
- 9: Essentially complete, cosmetic differences only.
- 10: Fully and perfectly complete with clear visual proof in the final screenshot.

Respond with ONLY valid JSON (no extra text):

{{
 "final_screenshot": "{final_name}",
 "final_screenshot_description": "Describe exactly what you see in {final_name}",
 "task_complete_in_final": true/false,
 "steps_analysis": [
 {{"step": "...", "status": "Success/Fail", "evidence_img": "step_X", "reason": "..."}}
 ],
 "final_completion": "True/False",
 "score": 0-10
}}"""
    return prompt


# ── parse response ─────────────────────────────────────────────────────────────

def _parse_response(text: str) -> dict:
    """Extract the JSON object from a model reply.

    Accepts a fenced ```json block, a bare JSON object, or a JSON object
    embedded in surrounding text. Returns a zero-score fallback dict when no
    JSON object can be parsed (including an empty or missing reply).
    """
    text = (text or "").strip()
    fenced = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
    if fenced:
        text = fenced.group(1)
    else:
        text = re.sub(r'^```(?:json)?\s*', '', text)
        text = re.sub(r'\s*```$', '', text)

    candidates = [text]
    # Second chance: the outermost {...} span inside free text.
    bare = re.search(r'\{.*\}', text, re.DOTALL)
    if bare:
        candidates.append(bare.group(0))

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers index the result by key, so only a JSON object is usable.
        if isinstance(parsed, dict):
            return parsed

    logger.error(f"Could not parse JSON from:\n{text[:500]}")
    return {"score": 0, "final_completion": "False", "steps_analysis": []}


def _coerce_score(value) -> float:
    """Convert the model's "score" field to a float, tolerating sloppy output.

    Numbers and numeric strings convert directly; otherwise the first number
    found in the text is used (e.g. "8/10" -> 8.0). Anything unusable, as
    well as NaN, yields 0.0.
    """
    try:
        score = float(value)
    except (TypeError, ValueError):
        m = _NUMBER_RE.search(str(value))
        score = float(m.group(0)) if m else 0.0
    # NaN would slip through the min/max clamp as a perfect score.
    return 0.0 if math.isnan(score) else score


# ── main ──────────────────────────────────────────────────────────────────────

def main():
    """CLI entry point: evaluate one task from its screenshots and save the result."""
    parser = argparse.ArgumentParser(description="Re-evaluate a vllm_eval task from screenshots.")
    parser.add_argument("--task", required=True, help="Path to task JSON file")
    parser.add_argument("--result_dir", required=True, help="Directory containing step_*.png screenshots")
    parser.add_argument("--model", default=_DEFAULT_MODEL, help=f"Eval model (default: {_DEFAULT_MODEL})")
    parser.add_argument("--max_images", type=int, default=MAX_IMAGES,
                        help=f"Max screenshots to send (default: {MAX_IMAGES})")
    parser.add_argument("--no_compress", action="store_true", help="Disable image compression")
    parser.add_argument("--output", default=None,
                        help="Output JSON path (default: <result_dir>/reeval_result.json)")
    args = parser.parse_args()

    # Load task
    with open(args.task, "r", encoding="utf-8") as f:
        task = json.load(f)

    instruction = task.get("instruction", "")
    config = task.get("config", [])
    metadata = task.get("metadata", {})

    logger.info(f"Task: {task.get('id', '?')}")
    logger.info(f"Instruction: {instruction}")

    # Load screenshots
    images_b64, name_list = _load_screenshots(
        args.result_dir,
        max_images=args.max_images,
        compress=not args.no_compress,
    )

    # Build prompt
    prompt = _build_prompt(instruction, config, metadata, name_list)

    # Call LLM
    logger.info(f"Calling {args.model} with {len(images_b64)} images...")
    raw = _call_llm(args.model, prompt, images_b64)
    logger.info(f"Raw response:\n{raw}")

    # Parse
    result = _parse_response(raw)
    score_raw = _coerce_score(result.get("score", 0))
    score_norm = round(max(0.0, min(10.0, score_raw)) / 10.0, 2)

    # Add metadata to output
    result["_task_id"] = task.get("id", "")
    result["_model"] = args.model
    result["_frames_used"] = name_list
    result["_score_normalized"] = score_norm

    # Save
    out_path = args.output or os.path.join(args.result_dir, "reeval_result.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(result, f, ensure_ascii=False, indent=2)

    # Print summary (batch_reeval.py greps these lines for "Task complete",
    # "Score" and "normalized", so keep the labels stable).
    print("\n" + "="*60)
    print(f"Task: {task.get('id', '?')}")
    print(f"Frames used: {name_list}")
    print(f"Final screenshot: {name_list[-1]}")
    if "final_screenshot_description" in result:
        print(f"Final state desc: {result['final_screenshot_description']}")
    print(f"Task complete: {result.get('task_complete_in_final', '?')}")
    print(f"final_completion: {result.get('final_completion', '?')}")
    print(f"Score (0-10): {score_raw} → normalized: {score_norm}")
    print(f"Result saved to: {out_path}")
    print("="*60 + "\n")

    if result.get("steps_analysis"):
        print("Steps analysis:")
        for s in result["steps_analysis"]:
            status_icon = "✓" if s.get("status") == "Success" else "✗"
            print(f" [{status_icon}] {s.get('step','')} | evidence: {s.get('evidence_img','')} | {s.get('reason','')}")


if __name__ == "__main__":
    main()