diff --git a/desktop_env/evaluators/metrics/vllm_eval.py b/desktop_env/evaluators/metrics/vllm_eval.py
index d7b971f..d994fce 100644
--- a/desktop_env/evaluators/metrics/vllm_eval.py
+++ b/desktop_env/evaluators/metrics/vllm_eval.py
@@ -436,7 +436,7 @@ def vllm_eval(result_state, **options) -> float:
         result_state: Current state description
         **options: Additional options including:
             - result_dir: Path to result directory containing step screenshots (recommended)
-            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
+            - screenshots: List of base64 encoded screenshots (deprecated, use result_dir instead)
             - instruction: Task instruction
             - eval_model: Model name to use
             - batch_size: Number of images per batch (default: 3)
@@ -469,16 +469,32 @@ def vllm_eval(result_state, **options) -> float:
 
     llm = UnifiedLLM(eval_model)
 
-    prompt = f"""You are an expert evaluator for desktop environment tasks.
+    prompt = f"""You are an expert evaluator for desktop environment tasks.
 
 Task Instruction: {instruction}
 
-I will provide you with screenshot(s) showing the current state of the desktop environment. Based on the instruction and screenshots, provide a concise evaluation score from 0.0 to 1.0, where:
-- 1.0 means the task is perfectly completed
-- 0.0 means the task is not completed at all
-- Values in between represent partial completion
+I will provide you with screenshot(s) showing the current state of the desktop environment. Please analyze the task execution step by step and provide a detailed evaluation.
 
-Please return your response in the format: "Score: X.X" followed by a brief explanation."""
+IMPORTANT: You must respond with ONLY a valid JSON object (no additional text before or after). Use the following exact format:
+
+{{
+  "steps_analysis": [
+    {{"step": "Step description", "status": "Success/Fail", "evidence_img": "step_X.png", "reason": "Brief explanation"}},
+    {{"step": "Another step", "status": "Success/Fail", "evidence_img": "step_Y.png", "reason": "Brief explanation"}}
+  ],
+  "final_completion": "True/False",
+  "score": 0-10
+}}
+
+Where:
+- "steps_analysis": Array of steps you identified from the screenshots (reference screenshot filenames like step_1.png, step_2.png, etc.)
+- "status": Either "Success" or "Fail" for each step
+- "evidence_img": The screenshot filename that shows evidence for this step (e.g., "step_2.png")
+- "reason": Brief explanation of why this step succeeded or failed
+- "final_completion": "True" if the overall task is completed, "False" otherwise
+- "score": Integer from 0 to 10, where 10 means perfectly completed and 0 means not completed at all
+
+Remember: Return ONLY the JSON object, no additional text."""
 
     try:
         result = llm.generate_with_images(
@@ -493,25 +509,107 @@ Please return your response in the format: "Score: X.X" followed by a brief expl
         logger.info(f"Evaluation result: {result}")
         logger.info(f"Parsed score: {score}")
 
+        # Save raw result to file for reference
+        if result_dir:
+            eval_output_path = os.path.join(result_dir, "vllm_evaluation_result.json")
+            with open(eval_output_path, "w", encoding="utf-8") as f:
+                f.write(result)
+            logger.info(f"Saved evaluation result to {eval_output_path}")
+
         return score
     except Exception as e:
         logger.error(f"Error during evaluation: {e}")
         return 0.0
 
 
-def _parse_score(text: str) -> float:
-    """Parse score from model response"""
+def _parse_evaluation_response(text: str) -> Dict[str, Any]:
+    """
+    Parse the JSON evaluation response from the model
+
+    Returns:
+        Dictionary containing steps_analysis, final_completion, and score
+    """
     import re
-
-    # Look for "Score: X.X" pattern
-    match = re.search(r'[Ss]core:\s*([0-9]*\.?[0-9]+)', text)
-    if match:
-        try:
-            score = float(match.group(1))
-            # Clamp to [0.0, 1.0]
-            return max(0.0, min(1.0, score))
-        except ValueError:
-            logger.warning(f"Could not parse score from: {match.group(1)}")
-
-    logger.warning(f"No valid score found in response: {text[:200]}")
-    return 0.0
+    import json
+
+    # Try to extract JSON from the response
+    # Sometimes models wrap JSON in markdown code blocks
+    text = text.strip()
+
+    # Remove markdown code blocks if present
+    if text.startswith("```"):
+        # Extract content between ``` markers
+        match = re.search(r'```(?:json)?\s*(\{.*?\})\s*```', text, re.DOTALL)
+        if match:
+            text = match.group(1)
+        else:
+            # Try to remove opening and closing ```
+            text = re.sub(r'^```(?:json)?\s*', '', text)
+            text = re.sub(r'\s*```$', '', text)
+
+    try:
+        result = json.loads(text)
+
+        # Validate required fields
+        if "steps_analysis" not in result:
+            logger.warning("Missing 'steps_analysis' field in response")
+            result["steps_analysis"] = []
+
+        if "final_completion" not in result:
+            logger.warning("Missing 'final_completion' field in response")
+            result["final_completion"] = "False"
+
+        if "score" not in result:
+            logger.warning("Missing 'score' field in response")
+            result["score"] = 0
+
+        return result
+
+    except json.JSONDecodeError as e:
+        logger.error(f"Failed to parse JSON response: {e}")
+        logger.error(f"Response text: {text[:500]}")
+
+        # Return a default structure
+        return {
+            "steps_analysis": [],
+            "final_completion": "False",
+            "score": 0
+        }
+
+
+def _parse_score(text: str) -> float:
+    """
+    Parse score from model response and convert to 0.0-1.0 range
+
+    Args:
+        text: Raw model response (expected to be JSON format)
+
+    Returns:
+        Score between 0.0 and 1.0
+    """
+    result = _parse_evaluation_response(text)
+
+    # Extract score (0-10) and convert to 0.0-1.0
+    score = result.get("score", 0)
+
+    try:
+        score = float(score)
+        # Clamp to [0, 10] then normalize to [0.0, 1.0]
+        score = max(0.0, min(10.0, score))
+        normalized_score = score / 10.0
+
+        logger.info(f"Final completion: {result.get('final_completion')}")
+        logger.info(f"Raw score (0-10): {score}, Normalized score (0-1): {normalized_score}")
+
+        # Log steps analysis if available
+        steps = result.get("steps_analysis", [])
+        if steps:
+            logger.info(f"Steps analysis ({len(steps)} steps):")
+            for i, step in enumerate(steps):
+                logger.info(f"  Step {i+1}: {step.get('step', 'N/A')} - {step.get('status', 'N/A')}")
+
+        return normalized_score
+
+    except (ValueError, TypeError) as e:
+        logger.warning(f"Could not parse score: {e}")
+        return 0.0
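
For reviewers, a minimal standalone sketch of the new response contract (not part of the patch; the sample reply, step descriptions, and filenames below are invented for illustration). It shows the JSON shape the revised prompt asks the model to return, and reproduces the clamp-and-normalize step that _parse_score applies before vllm_eval returns the score.

# Illustrative only: a well-formed model reply in the requested JSON format,
# followed by the 0-10 -> 0.0-1.0 normalization that _parse_score performs.
import json

sample_reply = """
{
  "steps_analysis": [
    {"step": "Open the target application", "status": "Success",
     "evidence_img": "step_1.png", "reason": "Window visible in screenshot"},
    {"step": "Apply the requested setting", "status": "Fail",
     "evidence_img": "step_3.png", "reason": "Dialog still shows the old value"}
  ],
  "final_completion": "False",
  "score": 4
}
"""

parsed = json.loads(sample_reply)
raw = max(0.0, min(10.0, float(parsed.get("score", 0))))  # clamp to [0, 10]
print(raw / 10.0)  # 0.4 -- the value vllm_eval would return for this reply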