import json
import random
from collections import Counter, defaultdict
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd

# Default locations used by ``main_filter_questions`` when no paths are given.
DEFAULT_INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_enhanced_classified_questions.json"
DEFAULT_OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_filtered_high_quality_questions.json"
DEFAULT_ANALYSIS_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_selection_analysis.xlsx"


class QuestionFilterAndSelector:
    """Filter and sample classified questions to build an evaluation set.

    Each question is a dict carrying at least the keys ``question``,
    ``answer``, ``question_type``, ``knowledge_level`` and ``difficulty``.
    """

    def __init__(self):
        # Minimum difficulty per question type (drops the easiest questions).
        self.min_difficulty_thresholds = {
            "calculation": 2,
            "short_answer": 2,
            "true_false": 2,
            "multiple_choice": 2,
        }

        # Minimum difficulty per knowledge level.
        self.knowledge_level_min_difficulty = {
            "basic_concepts": 2,
            "simple_application": 2,
            "medium_application": 2,
            "complex_analysis": 3,
            "advanced_synthesis": 4,
        }

        # Target composition of the final set, expressed as fractions.
        # Only ``question_types`` drives the sampling below; the other two
        # tables are converted to counts by ``_calculate_target_counts`` but
        # are not consumed by the sampling methods in this class.
        self.target_distribution = {
            "question_types": {
                "calculation": 0.25,
                "short_answer": 0.45,
                "true_false": 0.15,
                "multiple_choice": 0.15,
            },
            "knowledge_levels": {
                "basic_concepts": 0.20,
                "simple_application": 0.25,
                "medium_application": 0.30,
                "complex_analysis": 0.20,
                "advanced_synthesis": 0.05,
            },
            "difficulty_levels": {
                1: 0.05,
                2: 0.25,
                3: 0.35,
                4: 0.25,
                5: 0.10,
            },
        }

    def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
        """Step 1: keep only the questions that pass every quality rule.

        A question is kept when it is structurally valid, meets the minimum
        difficulty for both its question type and its knowledge level
        (unknown types/levels default to a minimum of 1), and is not hit by
        a special exclusion rule.
        """
        kept = []
        for q in questions:
            if not self._is_valid_question(q):
                continue

            difficulty = q.get('difficulty', 0)

            type_floor = self.min_difficulty_thresholds.get(
                q.get('question_type', ''), 1
            )
            if difficulty < type_floor:
                continue

            level_floor = self.knowledge_level_min_difficulty.get(
                q.get('knowledge_level', ''), 1
            )
            if difficulty < level_floor:
                continue

            if self._should_exclude_question(q):
                continue

            kept.append(q)
        return kept

    def _is_valid_question(self, q: Dict) -> bool:
        """Return True when all required fields are truthy and long enough.

        The stripped question text must have at least 20 characters and the
        stripped answer at least 5.
        """
        required_fields = [
            'question', 'answer', 'question_type', 'knowledge_level', 'difficulty'
        ]
        if not all(q.get(field) for field in required_fields):
            return False

        if len(q['question'].strip()) < 20:
            return False
        if len(q['answer'].strip()) < 5:
            return False
        return True

    def _should_exclude_question(self, q: Dict) -> bool:
        """Return True for low-value type/level/difficulty combinations."""
        question_type = q.get('question_type', '')
        is_basic = q.get('knowledge_level', '') == 'basic_concepts'
        difficulty = q.get('difficulty', 0)

        # Basic concepts at difficulty 1 (pure recall). This also covers the
        # true/false + basic concepts + difficulty 1 combination.
        if is_basic and difficulty == 1:
            return True

        # Multiple choice on basic concepts at low difficulty.
        if question_type == 'multiple_choice' and is_basic and difficulty <= 2:
            return True

        return False

    def smart_sample_questions(self, filtered_questions: List[Dict],
                               target_count: int = 2000) -> List[Dict]:
        """Step 2: sample about ``target_count`` questions, stratified by type.

        Fewer questions are returned when a question type does not have
        enough candidates to fill its quota.
        """
        grouped_questions = self._group_questions_by_categories(filtered_questions)
        target_counts = self._calculate_target_counts(target_count)

        # 1. Stratify by question type.
        selected_by_type = self._stratified_sample_by_type(
            grouped_questions, target_counts, target_count
        )

        # 2. Merge the per-type picks and trim to the target if necessary.
        return self._balance_within_types(selected_by_type, target_count)

    def _group_questions_by_categories(self, questions: List[Dict]) -> Dict:
        """Index the questions by type, level, difficulty and two pairings."""
        grouped = {
            'by_type': defaultdict(list),
            'by_level': defaultdict(list),
            'by_difficulty': defaultdict(list),
            'by_type_level': defaultdict(lambda: defaultdict(list)),
            'by_type_difficulty': defaultdict(lambda: defaultdict(list)),
        }

        for q in questions:
            qtype = q['question_type']
            level = q['knowledge_level']
            difficulty = q['difficulty']

            grouped['by_type'][qtype].append(q)
            grouped['by_level'][level].append(q)
            grouped['by_difficulty'][difficulty].append(q)
            grouped['by_type_level'][qtype][level].append(q)
            grouped['by_type_difficulty'][qtype][difficulty].append(q)

        return grouped

    @staticmethod
    def _apportion(total: int, ratios: Dict) -> Dict:
        """Split ``total`` across ``ratios`` using the largest-remainder rule.

        Every key first receives the integer part of ``total * ratio``; the
        units lost to truncation are then handed out one by one to the keys
        with the largest fractional parts (ties keep the dict order). This
        keeps the counts summing to ``round(total * sum(ratios))`` instead of
        silently undershooting.
        """
        if total <= 0:
            return {key: 0 for key in ratios}

        exact = {key: total * ratio for key, ratio in ratios.items()}
        counts = {key: int(value) for key, value in exact.items()}

        budget = int(round(total * sum(ratios.values())))
        leftover = max(budget - sum(counts.values()), 0)

        by_fraction = sorted(
            exact, key=lambda key: exact[key] - counts[key], reverse=True
        )
        for key in by_fraction[:leftover]:
            counts[key] += 1
        return counts

    def _calculate_target_counts(self, total_target: int) -> Dict:
        """Turn the target fractions into absolute counts for ``total_target``."""
        distribution = self.target_distribution
        return {
            'by_type': self._apportion(total_target, distribution['question_types']),
            'by_level': self._apportion(total_target, distribution['knowledge_levels']),
            'by_difficulty': self._apportion(
                total_target, distribution['difficulty_levels']
            ),
        }

    def _stratified_sample_by_type(self, grouped_questions: Dict,
                                   target_counts: Dict,
                                   total_target: int) -> Dict:
        """Pick questions for every type listed in ``target_counts['by_type']``.

        ``total_target`` is accepted for interface compatibility and is not
        used by the computation.
        """
        selected_by_type = {}

        for qtype, quota in target_counts['by_type'].items():
            candidates = grouped_questions['by_type'].get(qtype, [])

            if len(candidates) <= quota:
                # Not enough candidates: take them all.
                selected_by_type[qtype] = list(candidates)
            else:
                selected_by_type[qtype] = self._smart_sample_within_type(
                    candidates, quota
                )

        return selected_by_type

    def _smart_sample_within_type(self, questions: List[Dict],
                                  target_count: int) -> List[Dict]:
        """Sample ``target_count`` questions of one type, hardest first.

        Difficulties are visited from 5 down to 1. Each level may take up to
        40% (difficulty >= 4) or 30% (otherwise) of the still-open quota.
        Whatever is missing afterwards is filled with a random draw from the
        questions that were not picked yet.
        """
        by_difficulty = defaultdict(list)
        for q in questions:
            by_difficulty[q['difficulty']].append(q)

        selected = []
        remaining_target = target_count

        for difficulty in (5, 4, 3, 2, 1):
            if remaining_target <= 0:
                break

            diff_questions = by_difficulty.get(difficulty)
            if not diff_questions:
                continue

            level_groups = defaultdict(list)
            for q in diff_questions:
                level_groups[q['knowledge_level']].append(q)

            share = 0.4 if difficulty >= 4 else 0.3
            quota = min(remaining_target, int(remaining_target * share))

            picked = self._distribute_across_levels(level_groups, quota)
            selected.extend(picked)
            remaining_target -= len(picked)

        if len(selected) < target_count:
            # Track picks by identity: question dicts are unhashable, and an
            # equality scan over the list would be quadratic.
            taken = {id(q) for q in selected}
            leftovers = [q for q in questions if id(q) not in taken]
            if leftovers:
                still_needed = target_count - len(selected)
                selected.extend(
                    random.sample(leftovers, min(still_needed, len(leftovers)))
                )

        return selected[:target_count]

    def _distribute_across_levels(self, level_groups: Dict,
                                  target_count: int) -> List[Dict]:
        """Draw up to ``target_count`` questions, favouring higher levels.

        Levels are visited from the most to the least advanced. Each level
        present in ``level_groups`` gets a share of the remaining quota (40%
        for the two top levels, 30% for medium application, 20% otherwise),
        at least one question, capped by what is available. Levels that are
        not in the priority list are ignored.
        """
        if not level_groups or target_count <= 0:
            return []

        level_priorities = [
            'advanced_synthesis',
            'complex_analysis',
            'medium_application',
            'simple_application',
            'basic_concepts',
        ]
        level_share = {
            'advanced_synthesis': 0.4,
            'complex_analysis': 0.4,
            'medium_application': 0.3,
        }

        selected = []
        remaining_target = target_count

        for level in level_priorities:
            if level not in level_groups or remaining_target <= 0:
                continue

            pool = level_groups[level]
            wanted = max(1, int(remaining_target * level_share.get(level, 0.2)))
            quota = min(len(pool), wanted)
            remaining_target -= quota

            if quota > 0:
                selected.extend(random.sample(pool, quota))

        return selected

    def _balance_within_types(self, selected_by_type: Dict,
                              target_count: int) -> List[Dict]:
        """Merge the per-type picks; trim to ``target_count`` by quality score."""
        all_selected = []
        for questions in selected_by_type.values():
            all_selected.extend(questions)

        if len(all_selected) > target_count:
            # Keep the highest scoring questions only.
            ranked = sorted(
                all_selected, key=self._calculate_quality_score, reverse=True
            )
            all_selected = ranked[:target_count]

        return all_selected

    def _calculate_quality_score(self, question: Dict) -> float:
        """Score a question from its difficulty, level, type and text length.

        The score is ``difficulty * 2`` plus a knowledge-level weight (1-5),
        a question-type weight (1-2) and a length bonus: +2 for question
        texts longer than 200 characters, +1 for texts longer than 100.
        """
        score = question.get('difficulty', 1) * 2.0

        level_weights = {
            'basic_concepts': 1.0,
            'simple_application': 2.0,
            'medium_application': 3.0,
            'complex_analysis': 4.0,
            'advanced_synthesis': 5.0,
        }
        score += level_weights.get(question.get('knowledge_level', ''), 1.0)

        # Short-answer and calculation questions are weighted highest.
        type_weights = {
            'short_answer': 2.0,
            'calculation': 2.0,
            'multiple_choice': 1.5,
            'true_false': 1.0,
        }
        score += type_weights.get(question.get('question_type', ''), 1.0)

        # Check the larger threshold first; otherwise the ">100" test would
        # shadow it and the +2 bonus could never be granted.
        question_length = len(question.get('question', ''))
        if question_length > 200:
            score += 2.0
        elif question_length > 100:
            score += 1.0

        return score

    @staticmethod
    def _summarize_distribution(questions: List[Dict]) -> Dict:
        """Count questions per type/level/difficulty and derive percentages.

        An empty input yields the same keys with zero or empty values, so
        callers can index the result without special-casing.
        """
        total = len(questions)
        summary = {
            'total': total,
            'by_type': Counter(q.get('question_type', '') for q in questions),
            'by_level': Counter(q.get('knowledge_level', '') for q in questions),
            'by_difficulty': Counter(q.get('difficulty', 0) for q in questions),
            'avg_difficulty': (
                float(np.mean([q.get('difficulty', 0) for q in questions]))
                if total else 0.0
            ),
        }

        for key in ['by_type', 'by_level', 'by_difficulty']:
            summary[key + '_pct'] = {
                k: v / total * 100 for k, v in summary[key].items()
            }
        return summary

    def analyze_selection_results(self, original_questions: List[Dict],
                                  selected_questions: List[Dict]) -> Dict:
        """Compare the selected set against the original pool.

        Returns a dict with the ``original`` and ``selected`` summaries, the
        ``selection_ratio`` (0 when the original pool is empty) and the
        ``difficulty_improvement`` (selected mean minus original mean; 0.0
        when either set is empty, since no comparison is possible).
        """
        original_dist = self._summarize_distribution(original_questions)
        selected_dist = self._summarize_distribution(selected_questions)

        if original_questions and selected_questions:
            improvement = (
                selected_dist['avg_difficulty'] - original_dist['avg_difficulty']
            )
        else:
            improvement = 0.0

        return {
            'original': original_dist,
            'selected': selected_dist,
            'selection_ratio': (
                len(selected_questions) / len(original_questions)
                if original_questions else 0
            ),
            'difficulty_improvement': improvement,
        }

    def print_selection_report(self, analysis_results: Dict):
        """Print the report produced by ``analyze_selection_results``."""
        print("\n" + "="*60)
        print("题目筛选结果报告")
        print("="*60)

        original = analysis_results['original']
        selected = analysis_results['selected']

        print("\n📊 基本统计:")
        print(f"原始题目数: {original['total']}")
        print(f"筛选后题目数: {selected['total']}")
        print(f"筛选比例: {analysis_results['selection_ratio']:.1%}")
        print(f"平均难度提升: {analysis_results['difficulty_improvement']:.2f}")

        print("\n📈 题型分布对比:")
        print(f"{'题型':<15} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 50)

        for qtype in ['calculation', 'short_answer', 'true_false', 'multiple_choice']:
            orig_pct = original['by_type_pct'].get(qtype, 0)
            sel_pct = selected['by_type_pct'].get(qtype, 0)
            change = sel_pct - orig_pct
            print(f"{qtype:<15} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")

        print("\n🎯 难度分布对比:")
        print(f"{'难度':<8} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 40)

        for diff in range(1, 6):
            orig_pct = original['by_difficulty_pct'].get(diff, 0)
            sel_pct = selected['by_difficulty_pct'].get(diff, 0)
            change = sel_pct - orig_pct
            print(f"难度{diff:<3} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")


def main_filter_questions(input_file: str = DEFAULT_INPUT_FILE,
                          output_file: str = DEFAULT_OUTPUT_FILE,
                          analysis_file: str = DEFAULT_ANALYSIS_FILE,
                          target_count: int = 2000):
    """Run the pipeline: load, filter, sample, save and report.

    Args:
        input_file: JSON file holding the classified questions.
        output_file: JSON file the selected questions are written to.
        analysis_file: Excel workbook receiving the selection and the
            before/after distribution comparison.
        target_count: Number of questions the sampler aims for.
    """
    print("正在加载已分类的题目...")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_questions = json.load(f)

    print(f"加载了 {len(all_questions)} 道题目")

    selector = QuestionFilterAndSelector()

    # Step 1: quality filtering.
    print("\n第一步:按质量标准过滤题目...")
    filtered_questions = selector.filter_questions_by_quality(all_questions)
    print(f"质量过滤后剩余: {len(filtered_questions)} 道题目")

    # Step 2: stratified sampling.
    print("\n第二步:智能抽样构建评测集...")
    selected_questions = selector.smart_sample_questions(
        filtered_questions, target_count
    )
    print(f"最终选择: {len(selected_questions)} 道题目")

    print(f"\n保存筛选结果到: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_questions, f, ensure_ascii=False, indent=2)

    print("\n分析筛选结果...")
    analysis_results = selector.analyze_selection_results(
        all_questions, selected_questions
    )
    selector.print_selection_report(analysis_results)

    def percentage_share(frame, column):
        # An empty selection produces a frame without columns; report it as
        # an empty distribution instead of failing on the lookup.
        if column not in frame.columns:
            return pd.Series(dtype=float)
        return frame[column].value_counts(normalize=True) * 100

    # Detailed export; skipped with a hint when the Excel engine is missing.
    try:
        df_original = pd.DataFrame(all_questions)
        df_selected = pd.DataFrame(selected_questions)

        with pd.ExcelWriter(analysis_file, engine='openpyxl') as writer:
            df_selected.to_excel(writer, sheet_name='筛选结果', index=False)

            comparison_data = []
            for metric in ['question_type', 'knowledge_level', 'difficulty']:
                orig_dist = percentage_share(df_original, metric)
                sel_dist = percentage_share(df_selected, metric)

                for category in set(orig_dist.index) | set(sel_dist.index):
                    before = orig_dist.get(category, 0)
                    after = sel_dist.get(category, 0)
                    comparison_data.append({
                        '维度': metric,
                        '类别': category,
                        '原始占比': before,
                        '筛选后占比': after,
                        '变化': after - before,
                    })

            pd.DataFrame(comparison_data).to_excel(
                writer, sheet_name='分布对比', index=False
            )

        print(f"详细分析已保存到: {analysis_file}")

    except ImportError:
        print("提示: 安装pandas和openpyxl可生成详细分析报告")

    print("\n✅ 筛选完成!")
    print(f"🎯 最终评测集: {len(selected_questions)} 道高质量题目")
    print(f"📈 平均难度提升: {analysis_results['difficulty_improvement']:.2f}")
    print(f"💾 结果文件: {output_file}")


if __name__ == "__main__":
    main_filter_questions()