feat(vllm_eval): add structured JSON response format with step analysis
@@ -436,7 +436,7 @@ def vllm_eval(result_state, **options) -> float:
         result_state: Current state description
         **options: Additional options including:
             - result_dir: Path to result directory containing step screenshots (recommended)
-            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
+            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
             - instruction: Task instruction
             - eval_model: Model name to use
             - batch_size: Number of images per batch (default: 3)
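
For context, a call that exercises these options might look like the sketch below; the path, instruction text, and model name are illustrative assumptions, not values taken from this commit.

# Hypothetical usage sketch; the path, instruction, and model name below
# are illustrative assumptions, not values from this commit.
score = vllm_eval(
    result_state="final desktop state after the agent finished",
    result_dir="results/task_0001",        # holds step_1.png, step_2.png, ...
    instruction="Open the Settings app and enable dark mode",
    eval_model="qwen2.5-vl-72b-instruct",  # assumed model identifier
    batch_size=3,
)
print(f"Normalized score: {score:.2f}")    # vllm_eval returns a float in [0.0, 1.0]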
@@ -469,16 +469,32 @@ def vllm_eval(result_state, **options) -> float:
 
     llm = UnifiedLLM(eval_model)
 
-    prompt = f"""You are an expert evaluator for desktop environment tasks.
+    prompt = f"""You are an expert evaluator for desktop environment tasks.
 
 Task Instruction: {instruction}
 
-I will provide you with screenshot(s) showing the current state of the desktop environment. Based on the instruction and screenshots, provide a concise evaluation score from 0.0 to 1.0, where:
-- 1.0 means the task is perfectly completed
-- 0.0 means the task is not completed at all
-- Values in between represent partial completion
+I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
 
-Please return your response in the format: "Score: X.X" followed by a brief explanation."""
+IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
+
+{{
+    "steps_analysis": [
+        {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
+        {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
+    ],
+    "final_completion": "True/False",
+    "score": 0-10
+}}
+
+Where:
+- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
+- "status": Either "Success" or "Fail" for each step
+- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
+- "reason": Brief explanation of why this step succeeded or failed
+- "final_completion": "True" if the overall task is completed, "False" otherwise
+- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
+
+Remember: Return ONLY the JSON object, no additional text."""
 
     try:
         result = llm.generate_with_images(
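
For reference, a model reply that satisfies this prompt contract might look like the sketch below; the step descriptions, filenames, and score are invented for illustration.

# Illustrative example of a reply in the requested format.
# Step descriptions, filenames, and the score are invented for this sketch.
example_response = """{
    "steps_analysis": [
        {"step": "Opened the Settings app", "status": "Success", "evidence_img": "step_1.png", "reason": "Settings window is visible"},
        {"step": "Enabled dark mode", "status": "Fail", "evidence_img": "step_3.png", "reason": "Theme toggle still shows Light"}
    ],
    "final_completion": "False",
    "score": 4
}"""

Parsed by the helpers added in the next hunk, a reply like this reports final_completion "False" and normalizes to a score of 0.4 (4 / 10).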
@@ -493,25 +509,107 @@ Please return your response in the format: "Score: X.X" followed by a brief expl
         logger.info(f"Evaluation result: {result}")
         logger.info(f"Parsed score: {score}")
 
+        # Save raw result to file for reference
+        if result_dir:
+            eval_output_path = os.path.join(result_dir, "vllm_evaluation_result.json")
+            with open(eval_output_path, "w", encoding="utf-8") as f:
+                f.write(result)
+            logger.info(f"Saved evaluation result to {eval_output_path}")
+
         return score
     except Exception as e:
         logger.error(f"Error during evaluation: {e}")
         return 0.0
 
 
-def _parse_score(text: str) -> float:
-    """Parse score from model response"""
+def _parse_evaluation_response(text: str) -> Dict[str, Any]:
+    """
+    Parse the JSON evaluation response from the model
+
+    Returns:
+        Dictionary containing steps_analysis, final_completion, and score
+    """
     import re
-
-    # Look for "Score: X.X" pattern
-    match = re.search(r'[Ss]core:\s*([0-9]*\.?[0-9]+)', text)
-    if match:
-        try:
-            score = float(match.group(1))
-            # Clamp to [0.0, 1.0]
-            return max(0.0, min(1.0, score))
-        except ValueError:
-            logger.warning(f"Could not parse score from: {match.group(1)}")
-
-    logger.warning(f"No valid score found in response: {text[:200]}")
-    return 0.0
+    import json
+
+    # Try to extract JSON from the response
+    # Sometimes models wrap JSON in markdown code blocks
+    text = text.strip()
+
+    # Remove markdown code blocks if present
+    if text.startswith("```"):
+        # Extract content between ``` markers
+        match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
+        if match:
+            text = match.group(1)
+        else:
+            # Try to remove opening and closing ```
+            text = re.sub(r'^```(?:json)?\s*', '', text)
+            text = re.sub(r'\s*```$', '', text)
+
+    try:
+        result = json.loads(text)
+
+        # Validate required fields
+        if "steps_analysis" not in result:
+            logger.warning("Missing 'steps_analysis' field in response")
+            result["steps_analysis"] = []
+
+        if "final_completion" not in result:
+            logger.warning("Missing 'final_completion' field in response")
+            result["final_completion"] = "False"
+
+        if "score" not in result:
+            logger.warning("Missing 'score' field in response")
+            result["score"] = 0
+
+        return result
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse JSON response: {e}")
+        logger.error(f"Response text: {text[:500]}")
+
+        # Return a default structure
+        return {
+            "steps_analysis": [],
+            "final_completion": "False",
+            "score": 0
+        }
+
+
+def _parse_score(text: str) -> float:
+    """
+    Parse score from model response and convert to 0.0-1.0 range
+
+    Args:
+        text: Raw model response (expected to be JSON format)
+
+    Returns:
+        Score between 0.0 and 1.0
+    """
+    result = _parse_evaluation_response(text)
+
+    # Extract score (0-10) and convert to 0.0-1.0
+    score = result.get("score", 0)
+
+    try:
+        score = float(score)
+        # Clamp to [0, 10] then normalize to [0.0, 1.0]
+        score = max(0.0, min(10.0, score))
+        normalized_score = score / 10.0
+
+        logger.info(f"Final completion: {result.get('final_completion')}")
+        logger.info(f"Raw score (0-10): {score}, Normalized score (0-1): {normalized_score}")
+
+        # Log steps analysis if available
+        steps = result.get("steps_analysis", [])
+        if steps:
+            logger.info(f"Steps analysis ({len(steps)} steps):")
+            for i, step in enumerate(steps):
+                logger.info(f"  Step {i+1}: {step.get('step', 'N/A')} - {step.get('status', 'N/A')}")
+
+        return normalized_score
+
+    except (ValueError, TypeError) as e:
+        logger.warning(f"Could not parse score: {e}")
+        return 0.0
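
As a quick sanity check of the new parsing path, the sketch below feeds a markdown-fenced reply through the two helpers; it assumes _parse_evaluation_response and _parse_score are importable from the evaluator module (the module path is not shown in this diff), and the reply content is invented for illustration.

# Editorial sketch: exercises the markdown-fence stripping and the 0-10 -> 0.0-1.0
# normalization. Assumes both helpers are importable from the evaluator module
# (module path not shown in this diff); the reply below is invented.
fenced_reply = """```json
{
    "steps_analysis": [
        {"step": "Opened the Settings app", "status": "Success", "evidence_img": "step_1.png", "reason": "Settings window is visible"}
    ],
    "final_completion": "True",
    "score": 9
}
```"""

parsed = _parse_evaluation_response(fenced_reply)
print(parsed["final_completion"])  # -> True
print(_parse_score(fenced_reply))  # -> 0.9

The fence-stripping regex captures everything between the outermost braces before json.loads runs, so nested step objects survive intact.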