feat: 新增科研软件 benchmark 任务数据

- 新增 avogadro/imagej/jade/origin/ovito/pymol/vesta 等科研软件任务 JSON
- 修改 vllm_eval.py，修改图片文件名称为第x步
- desktop_env.py 添加额外数据参数 config 和 metadata
2026-02-25 15:19:36 +08:00
parent 613f55f0da
commit 9899d4a0c7
85 changed files with 4703 additions and 71 deletions
--- a/desktop_env/desktop_env.py
+++ b/desktop_env/desktop_env.py
@@ -20,42 +20,42 @@ Metric = Callable[[Any, Any], float]
 Getter = Callable[[gym.Env, Dict[str, Any]], Any]

 MAX_RETRIES = 5 # Maximum retries for environment setup
-            
+


 def _fix_pyautogui_less_than_bug(command: str) -> str:
    """
    Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
-    
+
    This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
    References:
    - https://github.com/asweigart/pyautogui/issues/198
    - https://github.com/xlang-ai/OSWorld/issues/257
-    
+
    Args:
        command (str): The original pyautogui command
-        
+
    Returns:
        str: The fixed command with '<' characters handled properly
    """
-    # Pattern to match press('<') or press('\u003c') calls  
+    # Pattern to match press('<') or press('\u003c') calls
    press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'

    # Handle press('<') calls
    def replace_press_less_than(match):
        return 'pyautogui.hotkey("shift", ",")'
-    
+
    # First handle press('<') calls
    command = re.sub(press_pattern, replace_press_less_than, command)

    # Pattern to match typewrite calls with quoted strings
    typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
-    
+
    # Then handle typewrite calls
    def process_typewrite_match(match):
        quote_char = match.group(1)
        content = match.group(2)
-        
+
        # Preprocess: Try to decode Unicode escapes like \u003c to actual '<'
        # This handles cases where '<' is represented as escaped Unicode
        try:
@@ -65,15 +65,15 @@ def _fix_pyautogui_less_than_bug(command: str) -> str:
        except UnicodeDecodeError:
            # If decoding fails, proceed with original content to avoid breaking existing logic
            pass  # English comment: Graceful degradation - fall back to original content if decoding fails
-        
+
        # Check if content contains '<'
        if '<' not in content:
            return match.group(0)
-        
+
        # Split by '<' and rebuild
        parts = content.split('<')
        result_parts = []
-        
+
        for i, part in enumerate(parts):
            if i == 0:
                # First part
@@ -84,11 +84,11 @@ def _fix_pyautogui_less_than_bug(command: str) -> str:
                result_parts.append('pyautogui.hotkey("shift", ",")')
                if part:
                    result_parts.append(f"pyautogui.typewrite({quote_char}{part}{quote_char})")
-        
+
        return '; '.join(result_parts)
-    
+
    command = re.sub(typewrite_pattern, process_typewrite_match, command)
-    
+
    return command


@@ -145,12 +145,12 @@ class DesktopEnv(gym.Env):
        self.screen_width = screen_size[0]
        self.screen_height = screen_size[1]

-        # Default 
+        # Default
        self.server_port = 5000
        self.chromium_port = 9222
        self.vnc_port = 8006
        self.vlc_port = 8080
-        
+
        # Initialize with default (no proxy) provider
        self.current_use_proxy = False
        self.manager, self.provider = create_vm_manager_and_provider(provider_name, region, use_proxy=False)
@@ -173,7 +173,7 @@ class DesktopEnv(gym.Env):
                if provider_name in {"vmware", "virtualbox"} else path_to_vm
        else:
            self.path_to_vm = self.manager.get_vm_path(os_type=self.os_type, region=region, screen_size=(self.screen_width, self.screen_height))
-        
+
        self.snapshot_name = snapshot_name
        self.cache_dir_base: str = cache_dir
        # todo: add the logic to get the screen size from the VM
@@ -229,8 +229,8 @@ class DesktopEnv(gym.Env):
        # due to the fact it could be changed when implemented by cloud services
        path_to_vm = self.provider.revert_to_snapshot(self.path_to_vm, self.snapshot_name)
        if path_to_vm and not path_to_vm == self.path_to_vm:
-            # path_to_vm has to be a new path 
-            
+            # path_to_vm has to be a new path
+
            self.manager.delete_vm(self.path_to_vm, self.region)
            self.manager.add_vm(path_to_vm, self.region)
            self.manager.occupy_vm(path_to_vm, os.getpid(), self.region)
@@ -245,7 +245,7 @@ class DesktopEnv(gym.Env):
        self.provider.stop_emulator(self.path_to_vm)

    def reset(self, task_config: Optional[Dict[str, Any]] = None, seed=None, options=None) -> Dict[str, Any]:
-        
+
        # Reset to certain task in OSWorld
        logger.info("Resetting environment...")
        logger.info("Switching task...")
@@ -258,17 +258,17 @@ class DesktopEnv(gym.Env):
            # Only revert to snapshot if environment has been used (step/setup)
            # This optimization is especially important for cloud providers like AWS
            # where unnecessary snapshot operations are costly and time-consuming
-            
+
            if task_config is not None:
                # Only consider task proxy requirement if proxy is enabled at system level
                task_use_proxy = task_config.get("proxy", False) and self.enable_proxy
                if not self.enable_proxy and task_config.get("proxy", False):
                    logger.info("Task requires proxy but proxy is disabled at system level, ignoring proxy requirement.")
-                
+
                if task_use_proxy != self.current_use_proxy:
                    # keep because get_info_from_website depend on this
                    self.current_use_proxy = task_use_proxy
-            
+
            if self.is_environment_used:
                logger.info("Environment has been used, reverting to snapshot: {}...".format(self.snapshot_name))
                self._revert_to_snapshot()
@@ -302,7 +302,7 @@ class DesktopEnv(gym.Env):
                    time.sleep(5)
            else:
                break
-            
+
        logger.info("Environment setup complete.")

        observation = self._get_obs()
@@ -333,7 +333,8 @@ class DesktopEnv(gym.Env):
        os.makedirs(self.cache_dir, exist_ok=True)
        self.instruction = task_config["instruction"]
        self.config = task_config["config"] if "config" in task_config else []
-        
+        self.metadata = task_config.get("metadata", {})
+
        self._set_evaluator_info(task_config)

    def _set_evaluator_info(self, task_config: Dict[str, Any]):
@@ -386,7 +387,7 @@ class DesktopEnv(gym.Env):
    def step(self, action, pause=2):
        self._step_no += 1
        self.action_history.append(action)
-        
+
        # Mark environment as used when step is called
        self.is_environment_used = True

@@ -461,12 +462,16 @@ class DesktopEnv(gym.Env):
            self.metric_options["instruction"] = self.instruction
            self.metric_options["eval_model"] = self.eval_model

+            # Pass pre-configured environment info and expected steps
+            self.metric_options["config"] = self.config
+            self.metric_options["metadata"] = self.metadata
+
            if result_dir:
                self.metric_options["result_dir"] = result_dir
                logger.info(f"Using result_dir for vllm_eval: {result_dir}")

            logger.info(f"Evaluation options prepared: {self.metric_options.keys()}")
-            
+
        if type(self.metric) == list:
            # Multiple metrics to evaluate whether the task is successfully completed
            results = []
--- a/desktop_env/evaluators/metrics/vllm_eval.py
+++ b/desktop_env/evaluators/metrics/vllm_eval.py
@@ -287,7 +287,7 @@ class UnifiedLLM:
            raise ValueError(f"Unsupported provider: {self.provider}")


-def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> List[str]:
+def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size: int = 800, quality: int = 85) -> tuple:
    """
    Load all step screenshots from result directory and convert to base64

@@ -298,9 +298,10 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
        quality: JPEG quality for compression (default: 85)

    Returns:
-        List of base64 encoded screenshot strings
+        Tuple of (list of base64 encoded screenshot strings, list of short filenames like 'step_1', 'step_2', ...)
    """
    screenshots = []
+    filenames = []

    # Find all step screenshot files (e.g., step_1_20240101@120000.png)
    pattern = os.path.join(result_dir, "step_*.png")
@@ -308,8 +309,9 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:

    if not screenshot_files:
        logger.warning(f"No screenshot files found in {result_dir}")
-        return screenshots
+        return screenshots, filenames

+    import re as _re
    for filepath in screenshot_files:
        try:
            with open(filepath, "rb") as f:
@@ -321,11 +323,16 @@ def _load_screenshots_from_dir(result_dir: str, compress: bool = True, max_size:
                    img_b64 = _compress_image(img_b64, max_size=max_size, quality=quality)

                screenshots.append(img_b64)
+                # Extract short name like 'step_1' from 'step_1_20240101@120000.png'
+                basename = os.path.basename(filepath)
+                match = _re.match(r'(step_\d+)', basename)
+                short_name = match.group(1) if match else basename
+                filenames.append(short_name)
        except Exception as e:
            logger.error(f"Error loading screenshot {filepath}: {e}")

-    logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}")
-    return screenshots
+    logger.info(f"Loaded {len(screenshots)} screenshots from {result_dir}: {filenames}")
+    return screenshots, filenames


 def vllm_eval(result_state, **options) -> float:
@@ -358,8 +365,10 @@ def vllm_eval(result_state, **options) -> float:
    max_image_size = options.get("max_image_size", 800)
    image_quality = options.get("image_quality", 85)

+    screenshot_filenames = []  # Short names like 'step_1', 'step_2', ...
+
    if result_dir and not screenshots:
-        screenshots = _load_screenshots_from_dir(
+        screenshots, screenshot_filenames = _load_screenshots_from_dir(
            result_dir,
            compress=compress_images,
            max_size=max_image_size,
@@ -368,6 +377,7 @@ def vllm_eval(result_state, **options) -> float:
        logger.info(f"Loaded {len(screenshots)} screenshots from result_dir: {result_dir}")
    elif screenshots:
        logger.info(f"Using {len(screenshots)} screenshots from options")
+        screenshot_filenames = [f"step_{i+1}" for i in range(len(screenshots))]
        # Compress screenshots if needed
        if compress_images:
            logger.info("Compressing provided screenshots...")
@@ -375,6 +385,8 @@ def vllm_eval(result_state, **options) -> float:

    instruction = options.get("instruction", "")
    eval_model = options.get("eval_model", "gpt-4-vision-preview")
+    config = options.get("config", [])
+    metadata = options.get("metadata", {})

    params = {
        "temperature": options.get("temperature", 0.7),
@@ -384,32 +396,91 @@ def vllm_eval(result_state, **options) -> float:

    llm = UnifiedLLM(eval_model)

-    prompt = f"""You are an expert evaluator for desktop environment tasks.
+    # Build pre-configured environment description from config
+    preconfig_items = []
+    for cfg in config:
+        if cfg.get("type") == "launch":
+            cmds = cfg.get("parameters", {}).get("command", [])
+            if cmds:
+                app_name = os.path.basename(cmds[0]) if cmds else "unknown"
+                preconfig_items.append(f"Application '{app_name}' was automatically launched before the agent started.")
+        elif cfg.get("type") == "sleep":
+            pass  # not relevant to scoring
+        elif cfg.get("type") == "open":
+            path = cfg.get("parameters", {}).get("path", "")
+            preconfig_items.append(f"File/URL '{path}' was automatically opened before the agent started.")
+
+    preconfig_section = ""
+    if preconfig_items:
+        preconfig_desc = "\n".join(f"  - {item}" for item in preconfig_items)
+        preconfig_section = f"""
+PRE-CONFIGURED ENVIRONMENT (done BEFORE the agent started, NOT the agent's work):
+{preconfig_desc}
+IMPORTANT: The above actions were performed automatically as part of environment setup. The agent did NOT perform these actions. Do NOT give ANY credit for them. For example, if the application was pre-launched, the agent merely having the application open is worth 0 points - that was the starting state."""
+
+    # Build expected steps section from metadata
+    expected_steps_section = ""
+    if metadata.get("steps"):
+        expected_steps_section = f"""
+EXPECTED STEPS for this task (use as reference for what the agent should have done):
+{metadata['steps']}
+NOTE: Evaluate the screenshots against these expected steps. Only give credit for steps that show VISIBLE evidence of completion BEYOND the pre-configured starting state."""
+
+    # Build image list description for the prompt
+    if screenshot_filenames:
+        img_list_str = ", ".join(screenshot_filenames)
+        img_info = f"""\nYou are provided with exactly {len(screenshot_filenames)} screenshots in chronological order: {img_list_str}
+The FIRST screenshot is: {screenshot_filenames[0]}
+The LAST screenshot (final state): {screenshot_filenames[-1]}
+IMPORTANT: Only reference screenshots from the list above. Do NOT reference any screenshot that is not listed."""
+    else:
+        img_info = "\nNo screenshots were provided."
+
+    prompt = f"""You are a STRICT and RIGOROUS evaluator for desktop environment tasks. Your job is to score ONLY based on concrete, visible evidence of task completion in the screenshots.

 Task Instruction: {instruction}
+{preconfig_section}
+{expected_steps_section}
+{img_info}

-I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
+Analyze ONLY the FINAL screenshot ({screenshot_filenames[-1] if screenshot_filenames else 'N/A'}) to determine the end state, while using earlier screenshots for context.
+
+CRITICAL SCORING RULES:
+1. Score ONLY based on what the AGENT actually accomplished. The pre-configured environment (application already launched, files already opened, etc.) is the STARTING STATE and worth 0 points.
+2. Score ONLY based on what is ACTUALLY VISIBLE in the screenshots. Do NOT give credit for assumed or potential progress.
+3. If the screenshots show NO meaningful action beyond the initial pre-configured state, the score MUST be 0.
+4. Do NOT give partial credit for "having the system on", "desktop being visible", "the application being open" (if it was pre-launched), or "the application being installed". These are prerequisites or pre-configured state, NOT progress.
+5. Each point must correspond to a SPECIFIC, VERIFIABLE action that was successfully completed BY THE AGENT toward the task goal.
+
+SCORING GUIDE (0-10):
+- 0: No progress beyond the pre-configured starting state. If the app was pre-launched, merely having it open is 0. If the screenshots only show the desktop or the initial app state without any agent action, score is 0.
+- 1-2: The agent performed one minor action (e.g., clicked on a menu) but did not make meaningful progress toward the task goal.
+- 3-4: Some initial steps toward the task have been taken but the task is far from complete.
+- 5-6: Significant progress - about half the required steps are completed with visible evidence.
+- 7-8: Most steps are completed but the final result is not fully achieved or has minor issues.
+- 9: The task is essentially complete with very minor cosmetic differences.
+- 10: The task is perfectly and completely finished with clear evidence in the final screenshot.

 IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:

 {{
  "steps_analysis": [
-    {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
-    {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
+    {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation of VISIBLE evidence"}},
+    {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation of VISIBLE evidence"}}
  ],
  "final_completion": "True/False",
  "score": 0-10
 }}

 Where:
- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
+- "steps_analysis": Array of steps you identified from the screenshots. Each step must cite VISIBLE evidence from a specific screenshot. Do NOT include pre-configured actions as agent steps.
 - "status": Either "Success" or "Fail" for each step
 - "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
- "reason": Brief explanation of why this step succeeded or failed
- "final_completion": "True" if the overall task is completed, "False" otherwise
- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
+- "reason": Explanation of what is VISUALLY observed in the screenshot as evidence
+- "final_completion": "True" ONLY if the overall task is fully completed with clear visual proof, "False" otherwise
+- "score": Integer from 0 to 10, following the strict scoring guide above

-Remember: Return ONLY the JSON object, no additional text."""
+Remember: Return ONLY the JSON object, no additional text. Be STRICT - when in doubt, score LOWER."""

    try:
        result = llm.generate_with_images(