feat(vllm_eval): add structured JSON response format with step analysis
@@ -436,7 +436,7 @@ def vllm_eval(result_state, **options) -> float:
         result_state: Current state description
         **options: Additional options including:
             - result_dir: Path to result directory containing step screenshots (recommended)
-            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
+            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
             - instruction: Task instruction
             - eval_model: Model name to use
             - batch_size: Number of images per batch (default: 3)
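
For context, a call that exercises these options might look like the sketch below; the path, instruction text, and model name are illustrative assumptions, not values taken from this commit.

# Hypothetical usage sketch; the path, instruction, and model name below
# are illustrative assumptions, not values from this commit.
score = vllm_eval(
    result_state="final desktop state after the agent finished",
    result_dir="results/task_0001",        # holds step_1.png, step_2.png, ...
    instruction="Open the Settings app and enable dark mode",
    eval_model="qwen2.5-vl-72b-instruct",  # assumed model identifier
    batch_size=3,
)
print(f"Normalized score: {score:.2f}")    # vllm_eval returns a float in [0.0, 1.0]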
@@ -469,16 +469,32 @@ def vllm_eval(result_state, **options) -> float:
 
     llm = UnifiedLLM(eval_model)
 
-    prompt = f"""You are an expert evaluator for desktop environment tasks.
+    prompt = f"""You are an expert evaluator for desktop environment tasks.
 
 Task Instruction: {instruction}
 
-I will provide you with screenshot(s) showing the current state of the desktop environment. Based on the instruction and screenshots, provide a concise evaluation score from 0.0 to 1.0, where:
-- 1.0 means the task is perfectly completed
-- 0.0 means the task is not completed at all
-- Values in between represent partial completion
+I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
 
-Please return your response in the format: "Score: X.X" followed by a brief explanation."""
+IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
+
+{{
+    "steps_analysis": [
+        {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
+        {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
+    ],
+    "final_completion": "True/False",
+    "score": 0-10
+}}
+
+Where:
+- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
+- "status": Either "Success" or "Fail" for each step
+- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
+- "reason": Brief explanation of why this step succeeded or failed
+- "final_completion": "True" if the overall task is completed, "False" otherwise
+- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
+
+Remember: Return ONLY the JSON object, no additional text."""
 
     try:
         result = llm.generate_with_images(
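
For reference, a model reply that satisfies this prompt contract might look like the sketch below; the step descriptions, filenames, and score are invented for illustration.

# Illustrative example of a reply in the requested format.
# Step descriptions, filenames, and the score are invented for this sketch.
example_response = """{
    "steps_analysis": [
        {"step": "Opened the Settings app", "status": "Success", "evidence_img": "step_1.png", "reason": "Settings window is visible"},
        {"step": "Enabled dark mode", "status": "Fail", "evidence_img": "step_3.png", "reason": "Theme toggle still shows Light"}
    ],
    "final_completion": "False",
    "score": 4
}"""

Parsed by the helpers added in the next hunk, a reply like this reports final_completion "False" and normalizes to a score of 0.4 (4 / 10).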
@@ -493,25 +509,107 @@ Please return your response in the format: "Score: X.X" followed by a brief expl
         logger.info(f"Evaluation result: {result}")
         logger.info(f"Parsed score: {score}")
 
+        # Save raw result to file for reference
+        if result_dir:
+            eval_output_path = os.path.join(result_dir, "vllm_evaluation_result.json")
+            with open(eval_output_path, "w", encoding="utf-8") as f:
+                f.write(result)
+            logger.info(f"Saved evaluation result to {eval_output_path}")
+
         return score
     except Exception as e:
         logger.error(f"Error during evaluation: {e}")
         return 0.0
 
 
-def _parse_score(text: str) -> float:
-    """Parse score from model response"""
+def _parse_evaluation_response(text: str) -> Dict[str, Any]:
+    """
+    Parse the JSON evaluation response from the model
+
+    Returns:
+        Dictionary containing steps_analysis, final_completion, and score
+    """
     import re
-
-    # Look for "Score: X.X" pattern
-    match = re.search(r'[Ss]core:\s*([0-9]*\.?[0-9]+)', text)
-    if match:
-        try:
-            score = float(match.group(1))
-            # Clamp to [0.0, 1.0]
-            return max(0.0, min(1.0, score))
-        except ValueError:
-            logger.warning(f"Could not parse score from: {match.group(1)}")
-
-    logger.warning(f"No valid score found in response: {text[:200]}")
-    return 0.0
+    import json
+
+    # Try to extract JSON from the response
+    # Sometimes models wrap JSON in markdown code blocks
+    text = text.strip()
+
+    # Remove markdown code blocks if present
+    if text.startswith("```"):
+        # Extract content between ``` markers
+        match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
+        if match:
+            text = match.group(1)
+        else:
+            # Try to remove opening and closing ```
+            text = re.sub(r'^```(?:json)?\s*', '', text)
+            text = re.sub(r'\s*```$', '', text)
+
+    try:
+        result = json.loads(text)
+
+        # Validate required fields
+        if "steps_analysis" not in result:
+            logger.warning("Missing 'steps_analysis' field in response")
+            result["steps_analysis"] = []
+
+        if "final_completion" not in result:
+            logger.warning("Missing 'final_completion' field in response")
+            result["final_completion"] = "False"
+
+        if "score" not in result:
+            logger.warning("Missing 'score' field in response")
+            result["score"] = 0
+
+        return result
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse JSON response: {e}")
+        logger.error(f"Response text: {text[:500]}")
+
+        # Return a default structure
+        return {
+            "steps_analysis": [],
+            "final_completion": "False",
+            "score": 0
+        }
+
+
+def _parse_score(text: str) -> float:
+    """
+    Parse score from model response and convert to 0.0-1.0 range
+
+    Args:
+        text: Raw model response (expected to be JSON format)
+
+    Returns:
+        Score between 0.0 and 1.0
+    """
+    result = _parse_evaluation_response(text)
+
+    # Extract score (0-10) and convert to 0.0-1.0
+    score = result.get("score", 0)
+
+    try:
+        score = float(score)
+        # Clamp to [0, 10] then normalize to [0.0, 1.0]
+        score = max(0.0, min(10.0, score))
+        normalized_score = score / 10.0
+
+        logger.info(f"Final completion: {result.get('final_completion')}")
+        logger.info(f"Raw score (0-10): {score}, Normalized score (0-1): {normalized_score}")
+
+        # Log steps analysis if available
+        steps = result.get("steps_analysis", [])
+        if steps:
+            logger.info(f"Steps analysis ({len(steps)} steps):")
+            for i, step in enumerate(steps):
+                logger.info(f"  Step {i+1}: {step.get('step', 'N/A')} - {step.get('status', 'N/A')}")
+
+        return normalized_score
+
+    except (ValueError, TypeError) as e:
+        logger.warning(f"Could not parse score: {e}")
+        return 0.0
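
As a quick sanity check of the new parsing path, the sketch below feeds a markdown-fenced reply through the two helpers; it assumes _parse_evaluation_response and _parse_score are importable from the evaluator module (the module path is not shown in this diff), and the reply content is invented for illustration.

# Editorial sketch: exercises the markdown-fence stripping and the 0-10 -> 0.0-1.0
# normalization. Assumes both helpers are importable from the evaluator module
# (module path not shown in this diff); the reply below is invented.
fenced_reply = """```json
{
    "steps_analysis": [
        {"step": "Opened the Settings app", "status": "Success", "evidence_img": "step_1.png", "reason": "Settings window is visible"}
    ],
    "final_completion": "True",
    "score": 9
}
```"""

parsed = _parse_evaluation_response(fenced_reply)
print(parsed["final_completion"])  # -> True
print(_parse_score(fenced_reply))  # -> 0.9

The fence-stripping regex captures everything between the outermost braces before json.loads runs, so nested step objects survive intact.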