feat: add benchmark task data for scientific research software

- Add task JSONs for scientific software such as avogadro/imagej/jade/origin/ovito/pymol/vesta
- Update vllm_eval.py: refer to screenshot files by short per-step names like step_x in the evaluation prompt
- Add extra task-data parameters config and metadata to desktop_env.py
2026-02-25 15:19:36 +08:00
parent 613f55f0da
commit 9899d4a0c7
85 changed files with 4703 additions and 71 deletions

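At the call-site level, the change means vllm_eval can now receive the task's setup actions and reference steps and fold them into the judge prompt. A hypothetical invocation is sketched below; the argument values are illustrative, and the role of result_state is not visible in this diff:

    score = vllm_eval(
        None,  # result_state; its use is outside the visible hunks
        result_dir="results/pymol/task_001",  # directory holding step_*.png screenshots
        instruction="Render the loaded protein as cartoon and save a PNG.",
        eval_model="gpt-4-vision-preview",
        config=[{"type": "launch", "parameters": {"command": ["/usr/bin/pymol"]}}],
        metadata={"steps": "1. Apply cartoon representation\n2. Save the image"},
    )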

@@ -287,7 +287,7 @@ class UnifiedLLM:
             raise ValueError(f"Unsupported provider: {self.provider}")
 
-def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> List[str]:
+def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> tuple:
     """
     Load all step screenshots from result directory and convert to base64
@@ -298,9 +298,10 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
         quality: JPEG quality for compression (default: 85)
 
     Returns:
-        List of base64 encoded screenshot strings
+        Tuple of (list of base64 encoded screenshot strings, list of short filenames like 'step_1', 'step_2', ...)
     """
     screenshots = []
+    filenames = []
 
     # Find all step screenshot files (e.g., step_1_20240101@120000.png)
     pattern = os.path.join(result_dir, "step_*.png")
@@ -308,8 +309,9 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
     if not screenshot_files:
         logger.warning(f"No screenshot files found in {result_dir}")
-        return screenshots
+        return screenshots, filenames
 
+    import re as _re
     for filepath in screenshot_files:
         try:
             with open(filepath, "rb") as f:
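The optional downscaling in the next hunk goes through _compress_image, whose body is outside this diff. As a rough sketch of what such a helper typically does, assuming a Pillow-based implementation (the name, signature, and behavior here are assumptions, not the repository's actual code):

    import base64
    import io

    from PIL import Image

    def compress_image_sketch(img_b64: str, max_size: int = 800, quality: int = 85) -> str:
        """Shrink a base64 PNG so its longest side is <= max_size, re-encoded as JPEG."""
        raw = base64.b64decode(img_b64)
        img = Image.open(io.BytesIO(raw)).convert("RGB")  # drop alpha so JPEG can encode it
        img.thumbnail((max_size, max_size))  # resizes in place, preserving aspect ratio
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=quality)
        return base64.b64encode(buf.getvalue()).decode("ascii")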
@@ -321,11 +323,16 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
                 img_b64 = _compress_image(img_b64, max_size=max_size, quality=quality)
             screenshots.append(img_b64)
+            # Extract short name like 'step_1' from 'step_1_20240101@120000.png'
+            basename = os.path.basename(filepath)
+            match = _re.match(r'(step_\d+)', basename)
+            short_name = match.group(1) if match else basename
+            filenames.append(short_name)
         except Exception as e:
             logger.error(f"Error loading screenshot {filepath}: {e}")
 
-    logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}")
-    return screenshots
+    logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}: {filenames}")
+    return screenshots, filenames
 
 def vllm_eval(result_state, **options) -> float:
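One caveat worth noting: the short names stay aligned with chronological order only if the glob results are sorted numerically, and a plain lexicographic sort puts step_10 before step_2. The sorting call itself is outside the visible hunks, so the numeric key below is a suggestion, not necessarily what the file already does; the extraction regex matches the committed code:

    import os
    import re

    files = ["step_1_20240101@120000.png", "step_10_20240101@120500.png", "step_2_20240101@120100.png"]

    def step_number(path: str) -> int:
        # 'step_12_...' -> 12; anything unmatched sorts to the end
        m = re.match(r"step_(\d+)", os.path.basename(path))
        return int(m.group(1)) if m else 10**9

    for f in sorted(files, key=step_number):
        m = re.match(r"(step_\d+)", os.path.basename(f))
        print(m.group(1) if m else f)  # prints step_1, step_2, step_10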
@@ -358,8 +365,10 @@ def vllm_eval(result_state, **options) -> float:
     max_image_size = options.get("max_image_size", 800)
     image_quality = options.get("image_quality", 85)
 
+    screenshot_filenames = []  # Short names like 'step_1', 'step_2', ...
     if result_dir and not screenshots:
-        screenshots = _load_screenshots_from_dir(
+        screenshots, screenshot_filenames = _load_screenshots_from_dir(
             result_dir,
             compress=compress_images,
             max_size=max_image_size,
@@ -368,6 +377,7 @@ def vllm_eval(result_state, **options) -> float:
         logger.info(f"Loaded {len(screenshots)} screenshots from result_dir: {result_dir}")
     elif screenshots:
         logger.info(f"Using {len(screenshots)} screenshots from options")
+        screenshot_filenames = [f"step_{i+1}" for i in range(len(screenshots))]
         # Compress screenshots if needed
         if compress_images:
             logger.info("Compressing provided screenshots...")
@@ -375,6 +385,8 @@ def vllm_eval(result_state, **options) -> float:
     instruction = options.get("instruction", "")
     eval_model = options.get("eval_model", "gpt-4-vision-preview")
+    config = options.get("config", [])
+    metadata = options.get("metadata", {})
 
     params = {
         "temperature": options.get("temperature", 0.7),
@@ -384,32 +396,91 @@ def vllm_eval(result_state, **options) -> float:
     llm = UnifiedLLM(eval_model)
 
-    prompt = f"""You are an expert evaluator for desktop environment tasks.
+    # Build pre-configured environment description from config
+    preconfig_items = []
+    for cfg in config:
+        if cfg.get("type") == "launch":
+            cmds = cfg.get("parameters", {}).get("command", [])
+            if cmds:
+                app_name = os.path.basename(cmds[0]) if cmds else "unknown"
+                preconfig_items.append(f"Application '{app_name}' was automatically launched before the agent started.")
+        elif cfg.get("type") == "sleep":
+            pass  # not relevant to scoring
+        elif cfg.get("type") == "open":
+            path = cfg.get("parameters", {}).get("path", "")
+            preconfig_items.append(f"File/URL '{path}' was automatically opened before the agent started.")
+
+    preconfig_section = ""
+    if preconfig_items:
+        preconfig_desc = "\n".join(f" - {item}" for item in preconfig_items)
+        preconfig_section = f"""
+PRE-CONFIGURED ENVIRONMENT (done BEFORE the agent started, NOT the agent's work):
+{preconfig_desc}
+IMPORTANT: The above actions were performed automatically as part of environment setup. The agent did NOT perform these actions. Do NOT give ANY credit for them. For example, if the application was pre-launched, the agent merely having the application open is worth 0 points - that was the starting state."""
+
+    # Build expected steps section from metadata
+    expected_steps_section = ""
+    if metadata.get("steps"):
+        expected_steps_section = f"""
+EXPECTED STEPS for this task (use as reference for what the agent should have done):
+{metadata['steps']}
+NOTE: Evaluate the screenshots against these expected steps. Only give credit for steps that show VISIBLE evidence of completion BEYOND the pre-configured starting state."""
+
+    # Build image list description for the prompt
+    if screenshot_filenames:
+        img_list_str = ", ".join(screenshot_filenames)
+        img_info = f"""\nYou are provided with exactly {len(screenshot_filenames)} screenshots in chronological order: {img_list_str}
+The FIRST screenshot is: {screenshot_filenames[0]}
+The LAST screenshot (final state): {screenshot_filenames[-1]}
+IMPORTANT: Only reference screenshots from the list above. Do NOT reference any screenshot that is not listed."""
+    else:
+        img_info = "\nNo screenshots were provided."
+
+    prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots.
 
 Task Instruction: {instruction}
+{preconfig_section}
+{expected_steps_section}
+{img_info}
 
 I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
+Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context.
+
+CRITICAL SCORING RULES:
+1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points.
+2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress.
+3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0.
+4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress.
+5. Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal.
+
+SCORING GUIDE (0-10):
+- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0.
+- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal.
+- 3-4: Some initial steps toward the task have been taken but the task is far from complete.
+- 5-6: Significant progress - about half the required steps are completed with visible evidence.
+- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues.
+- 9: The task is essentially complete with very minor cosmetic differences.
+- 10: The task is perfectly and completely finished with clear evidence in the final screenshot.
 
 IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
 {{
     "steps_analysis": [
-        {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
-        {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
+        {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}},
+        {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}}
     ],
     "final_completion": "True/False",
     "score": 0-10
 }}
 
 Where:
-- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
+- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps.
 - "status": Either "Success" or "Fail" for each step
 - "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
-- "reason": Brief explanation of why this step succeeded or failed
-- "final_completion": "True" if the overall task is completed, "False" otherwise
-- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
+- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence
+- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise
+- "score": Integer from 0 to 10, following the strict scoring guide above
 
-Remember: Return ONLY the JSON object, no additional text."""
+Remember: Return ONLY the JSON object, no additional text. Be STRICT - when in doubt, score LOWER."""
 
     try:
         result = llm.generate_with_images(
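Even with the "JSON only" instruction, vision models sometimes wrap replies in Markdown fences or add stray prose, so the downstream parsing of this response usually wants to be defensive. A minimal sketch of one tolerant approach, not part of this commit:

    import json
    import re

    def parse_eval_response(text: str) -> dict:
        """Pull the JSON object out of a model reply, tolerating ```json fences."""
        text = text.strip()
        fenced = re.match(r"^```(?:json)?\s*(.*?)\s*```$", text, re.DOTALL)
        if fenced:
            text = fenced.group(1)
        # Fall back to the outermost {...} span if extra prose slipped in
        start, end = text.find("{"), text.rfind("}")
        if start != -1 and end > start:
            text = text[start:end + 1]
        return json.loads(text)

    # e.g. parse_eval_response('```json\n{"score": 7, "final_completion": "False"}\n```')["score"] == 7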