feat(vllm_eval): add structured JSON response format with step analysis

Author: cui0711
Date: 2026-02-09 13:58:14 +08:00
parent 1e9281a1ab
commit 9bc54c0a66


@@ -436,7 +436,7 @@ def vllm_eval(result_state, **options) -> float:
result_state: Current state description
**options: Additional options including:
- result_dir: Path to result directory containing step screenshots (recommended)
- screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
- instruction: Task instruction
- eval_model: Model name to use
- batch_size: Number of images per batch (default: 3)
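For context, a call that exercises the options documented above might look like the sketch below; the paths, instruction text, and model name are illustrative placeholders, not values from this repository.

# Hypothetical usage of vllm_eval with the options listed in the docstring above.
# All literal values here are placeholders.
score = vllm_eval(
    result_state="final desktop state",
    result_dir="results/task_001",   # folder containing step_1.png, step_2.png, ...
    instruction="Open the calculator and compute 2 + 2",
    eval_model="qwen2.5-vl",         # placeholder; any vision model accepted by UnifiedLLM
    batch_size=3,
)
print(f"Normalized score: {score:.2f}")  # vllm_eval returns a float in [0.0, 1.0]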
@@ -469,16 +469,32 @@ def vllm_eval(result_state, **options) -> float:
    llm = UnifiedLLM(eval_model)

    prompt = f"""You are an expert evaluator for desktop environment tasks.
prompt = f"""You are an expert evaluator for desktop environment tasks.
Task Instruction: {instruction}
I will provide you with screenshot(s) showing the current state of the desktop environment. Based on the instruction and screenshots, provide a concise evaluation score from 0.0 to 1.0, where:
- 1.0 means the task is perfectly completed
- 0.0 means the task is not completed at all
- Values in between represent partial completion
I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
Please return your response in the format: "Score: X.X" followed by a brief explanation."""
IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
{{
"steps_analysis": [
{{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
{{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
],
"final_completion": "True/False",
"score": 0-10
}}
Where:
- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
- "status": Either "Success" or "Fail" for each step
- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
- "reason": Brief explanation of why this step succeeded or failed
- "final_completion": "True" if the overall task is completed, "False" otherwise
- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
Remember: Return ONLY the JSON object, no additional text."""
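For illustration, a model reply that satisfies the format requested by this prompt could look like the Python literal below; the step descriptions, filenames, and score are invented for the example.

# Invented example of a reply that matches the JSON schema requested above.
example_reply = {
    "steps_analysis": [
        {"step": "Open the calculator application", "status": "Success",
         "evidence_img": "step_1.png", "reason": "Calculator window is visible"},
        {"step": "Enter 2 + 2 and press equals", "status": "Fail",
         "evidence_img": "step_2.png", "reason": "Display still shows 0"},
    ],
    "final_completion": "False",
    "score": 4,
}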
    try:
        result = llm.generate_with_images(
@@ -493,25 +509,107 @@ Please return your response in the format: "Score: X.X" followed by a brief expl
logger.info(f"Evaluation result: {result}")
logger.info(f"Parsed score: {score}")
# Save raw result to file for reference
if result_dir:
eval_output_path = os.path.join(result_dir, "vllm_evaluation_result.json")
with open(eval_output_path, "w", encoding="utf-8") as f:
f.write(result)
logger.info(f"Saved evaluation result to {eval_output_path}")
return score
except Exception as e:
logger.error(f"Error during evaluation: {e}")
return 0.0
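As a hypothetical follow-up, the raw text saved to vllm_evaluation_result.json can be re-parsed offline with the helper defined below; the directory name is an assumed placeholder.

# Assumed offline re-use of the saved raw output; "results/task_001" is a placeholder path.
import os

with open(os.path.join("results/task_001", "vllm_evaluation_result.json"), encoding="utf-8") as f:
    saved = _parse_evaluation_response(f.read())  # tolerant of fenced or plain JSON
print(saved["score"], saved["final_completion"])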
def _parse_score(text: str) -> float:
    """Parse score from model response"""
def _parse_evaluation_response(text: str) -> Dict[str, Any]:
    """
    Parse the JSON evaluation response from the model

    Returns:
        Dictionary containing steps_analysis, final_completion, and score
    """
    import re
    # Look for "Score: X.X" pattern
    match = re.search(r'[Ss]core:\s*([0-9]*\.?[0-9]+)', text)
    if match:
        try:
            score = float(match.group(1))
            # Clamp to [0.0, 1.0]
            return max(0.0, min(1.0, score))
        except ValueError:
            logger.warning(f"Could not parse score from: {match.group(1)}")
    logger.warning(f"No valid score found in response: {text[:200]}")
    return 0.0
    import json
    # Try to extract JSON from the response
    # Sometimes models wrap JSON in markdown code blocks
    text = text.strip()

    # Remove markdown code blocks if present
    if text.startswith("```"):
        # Extract content between ``` markers
        match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
        if match:
            text = match.group(1)
        else:
            # Try to remove opening and closing ```
            text = re.sub(r'^```(?:json)?\s*', '', text)
            text = re.sub(r'\s*```$', '', text)

    try:
        result = json.loads(text)
        # Validate required fields
        if "steps_analysis" not in result:
            logger.warning("Missing 'steps_analysis' field in response")
            result["steps_analysis"] = []
        if "final_completion" not in result:
            logger.warning("Missing 'final_completion' field in response")
            result["final_completion"] = "False"
        if "score" not in result:
            logger.warning("Missing 'score' field in response")
            result["score"] = 0
        return result
    except json.JSONDecodeError as e:
        logger.error(f"Failed to parse JSON response: {e}")
        logger.error(f"Response text: {text[:500]}")
        # Return a default structure
        return {
            "steps_analysis": [],
            "final_completion": "False",
            "score": 0
        }
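A minimal sanity check of this parser, assuming the module's logger is configured; the fenced reply string is an invented example of a model wrapping its JSON in a markdown code block.

# Invented input: a reply wrapped in a ```json fence, which the parser strips before json.loads.
wrapped = '```json\n{"steps_analysis": [], "final_completion": "True", "score": 9}\n```'
parsed = _parse_evaluation_response(wrapped)
assert parsed["final_completion"] == "True" and parsed["score"] == 9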
def _parse_score(text: str) -> float:
    """
    Parse score from model response and convert to 0.0-1.0 range

    Args:
        text: Raw model response (expected to be JSON format)

    Returns:
        Score between 0.0 and 1.0
    """
    result = _parse_evaluation_response(text)

    # Extract score (0-10) and convert to 0.0-1.0
    score = result.get("score", 0)
    try:
        score = float(score)
        # Clamp to [0, 10] then normalize to [0.0, 1.0]
        score = max(0.0, min(10.0, score))
        normalized_score = score / 10.0

        logger.info(f"Final completion: {result.get('final_completion')}")
        logger.info(f"Raw score (0-10): {score}, Normalized score (0-1): {normalized_score}")

        # Log steps analysis if available
        steps = result.get("steps_analysis", [])
        if steps:
            logger.info(f"Steps analysis ({len(steps)} steps):")
            for i, step in enumerate(steps):
                logger.info(f"  Step {i+1}: {step.get('step', 'N/A')} - {step.get('status', 'N/A')}")

        return normalized_score
    except (ValueError, TypeError) as e:
        logger.warning(f"Could not parse score: {e}")
        return 0.0
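Finally, a worked example of the normalization above, using an invented raw response: a model score of 8 on the 0-10 scale maps to 0.8 on the 0.0-1.0 scale returned to the caller.

# Invented response text; _parse_score clamps the 0-10 score and divides by 10.
raw = ('{"steps_analysis": [{"step": "Open app", "status": "Success", '
       '"evidence_img": "step_1.png", "reason": "ok"}], '
       '"final_completion": "True", "score": 8}')
assert _parse_score(raw) == 0.8  # 8 / 10.0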