# File-viewer metadata (extraction residue): 507 lines, 20 KiB, Python
import json
import random
from collections import Counter, defaultdict
from typing import Any, Dict, List, Tuple

import numpy as np
import pandas as pd


class QuestionFilterAndSelector:
    """Question filter and selector used to build a high-quality evaluation set.

    Intended use is two steps: ``filter_questions_by_quality`` drops malformed
    and too-easy questions, then ``smart_sample_questions`` draws a sample
    whose question-type mix follows ``target_distribution``.

    Questions are plain dicts carrying the keys ``question``, ``answer``,
    ``question_type``, ``knowledge_level`` and ``difficulty``.
    """

    def __init__(self):
        # Minimum difficulty per question type (drops questions that are too easy).
        self.min_difficulty_thresholds = {
            "calculation": 2,       # excludes difficulty-1 basic calculations
            "short_answer": 2,      # excludes difficulty-1 simple recall
            "true_false": 2,        # excludes difficulty-1 basic concept checks
            "multiple_choice": 2,   # excludes difficulty-1 trivial choices
        }

        # Minimum difficulty required for each knowledge level.
        self.knowledge_level_min_difficulty = {
            "basic_concepts": 2,      # needs understanding, not just memorisation
            "simple_application": 2,
            "medium_application": 2,
            "complex_analysis": 3,
            "advanced_synthesis": 4,
        }

        # Target composition of the final set, as fractions of the total.
        # NOTE: only the "question_types" fractions drive the sampler; the
        # other two are converted to counts by _calculate_target_counts but
        # are not consumed by the sampling methods of this class.
        self.target_distribution = {
            "question_types": {
                "calculation": 0.25,
                "short_answer": 0.45,
                "true_false": 0.15,
                "multiple_choice": 0.15,
            },
            "knowledge_levels": {
                "basic_concepts": 0.20,       # still subject to the difficulty >= 2 rule
                "simple_application": 0.25,
                "medium_application": 0.30,
                "complex_analysis": 0.20,
                "advanced_synthesis": 0.05,
            },
            "difficulty_levels": {
                1: 0.05,   # keep only the most valuable difficulty-1 items
                2: 0.25,
                3: 0.35,
                4: 0.25,
                5: 0.10,
            },
        }

    def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
        """Step 1: keep only the questions that meet the quality rules.

        A question is kept when it passes ``_is_valid_question``, its
        difficulty reaches both the per-type and the per-knowledge-level
        minimum (unknown types/levels default to a minimum of 1), and it is
        not matched by ``_should_exclude_question``.

        Args:
            questions: Candidate question dicts.

        Returns:
            A new list with the surviving questions, in input order.
        """
        kept = []
        for q in questions:
            if not self._is_valid_question(q):
                continue

            difficulty = q.get('difficulty', 0)
            min_for_type = self.min_difficulty_thresholds.get(
                q.get('question_type', ''), 1)
            min_for_level = self.knowledge_level_min_difficulty.get(
                q.get('knowledge_level', ''), 1)

            # Failing either threshold is the same as failing the larger one.
            if difficulty < max(min_for_type, min_for_level):
                continue

            if self._should_exclude_question(q):
                continue

            kept.append(q)

        return kept

    def _is_valid_question(self, q: Dict) -> bool:
        """Check the basic validity of one question.

        Valid means: every required field is present and truthy, the question
        and answer are strings of at least 20 and 5 characters respectively
        (after stripping whitespace), and the difficulty is a number.
        Wrongly-typed fields are rejected instead of raising, so one malformed
        record cannot abort the whole filtering pass.
        """
        required_fields = ('question', 'answer', 'question_type',
                           'knowledge_level', 'difficulty')
        if not all(q.get(field) for field in required_fields):
            return False

        question_text = q['question']
        answer_text = q['answer']
        if not isinstance(question_text, str) or not isinstance(answer_text, str):
            return False

        # The difficulty is compared with integer thresholds later on.
        if not isinstance(q['difficulty'], (int, float)):
            return False

        if len(question_text.strip()) < 20:   # question too short
            return False
        if len(answer_text.strip()) < 5:      # answer too short
            return False
        return True

    def _should_exclude_question(self, q: Dict) -> bool:
        """Return True for specific low-quality combinations.

        Excluded combinations, all restricted to ``basic_concepts``:
          * any question at difficulty 1 (pure memorisation); this also
            covers the true/false + difficulty 1 case;
          * multiple-choice questions at difficulty 2 or lower.
        """
        if q.get('knowledge_level', '') != 'basic_concepts':
            return False

        difficulty = q.get('difficulty', 0)
        if difficulty == 1:
            return True
        return q.get('question_type', '') == 'multiple_choice' and difficulty <= 2

    def smart_sample_questions(self, filtered_questions: List[Dict],
                               target_count: int = 2000) -> List[Dict]:
        """Step 2: sample a set whose question-type mix follows the targets.

        Args:
            filtered_questions: Questions that already passed quality filtering.
            target_count: Desired size of the final set.

        Returns:
            At most ``target_count`` questions. The result is smaller when a
            question type has fewer questions than its quota, because the
            shortfall is not back-filled from other types.
        """
        grouped_questions = self._group_questions_by_categories(filtered_questions)
        target_counts = self._calculate_target_counts(target_count)

        # 1. Stratified sampling per question type.
        selected_by_type = self._stratified_sample_by_type(
            grouped_questions, target_counts, target_count
        )

        # 2. Merge the per-type picks and trim to the overall target.
        return self._balance_within_types(selected_by_type, target_count)

    def _group_questions_by_categories(self, questions: List[Dict]) -> Dict:
        """Index the questions along several dimensions.

        Returns:
            A dict with ``by_type``, ``by_level`` and ``by_difficulty``
            (value -> list of questions) plus the two-level indexes
            ``by_type_level`` and ``by_type_difficulty``.

        Raises:
            KeyError: If a question lacks ``question_type``,
                ``knowledge_level`` or ``difficulty``.
        """
        grouped = {
            'by_type': defaultdict(list),
            'by_level': defaultdict(list),
            'by_difficulty': defaultdict(list),
            'by_type_level': defaultdict(lambda: defaultdict(list)),
            'by_type_difficulty': defaultdict(lambda: defaultdict(list)),
        }

        for q in questions:
            qtype = q['question_type']
            level = q['knowledge_level']
            difficulty = q['difficulty']

            grouped['by_type'][qtype].append(q)
            grouped['by_level'][level].append(q)
            grouped['by_difficulty'][difficulty].append(q)
            grouped['by_type_level'][qtype][level].append(q)
            grouped['by_type_difficulty'][qtype][difficulty].append(q)

        return grouped

    def _calculate_target_counts(self, total_target: int) -> Dict:
        """Turn the target fractions into absolute counts (truncated to int).

        Returns:
            A dict with ``by_type``, ``by_level`` and ``by_difficulty``, each
            mapping a category to ``int(total_target * fraction)``.
        """
        section_names = {
            'by_type': 'question_types',
            'by_level': 'knowledge_levels',
            'by_difficulty': 'difficulty_levels',
        }
        return {
            out_key: {
                category: int(total_target * ratio)
                for category, ratio in self.target_distribution[section].items()
            }
            for out_key, section in section_names.items()
        }

    def _stratified_sample_by_type(self, grouped_questions: Dict,
                                   target_counts: Dict, total_target: int) -> Dict:
        """Pick questions for every question type that has a target quota.

        Types with no more questions than their quota are taken whole; larger
        pools are reduced by ``_smart_sample_within_type``. Question types
        without a target quota are not sampled at all.

        Args:
            grouped_questions: Output of ``_group_questions_by_categories``.
            target_counts: Output of ``_calculate_target_counts``.
            total_target: Overall target size (unused here, kept so the
                signature stays stable).

        Returns:
            Mapping of question type to the list of selected questions.
        """
        selected_by_type = {}

        for qtype, quota in target_counts['by_type'].items():
            available = grouped_questions['by_type'].get(qtype, [])

            if len(available) <= quota:
                # Not enough questions: take them all (copied so the grouped
                # index is not shared with the result).
                selected_by_type[qtype] = list(available)
            else:
                selected_by_type[qtype] = self._smart_sample_within_type(
                    available, quota
                )

        return selected_by_type

    def _smart_sample_within_type(self, questions: List[Dict],
                                  target_count: int) -> List[Dict]:
        """Sample ``target_count`` questions of one type, hardest tiers first.

        Difficulty tiers are visited from 5 down to 1. Each tier is offered
        40% (difficulty >= 4) or 30% (otherwise) of the slots still open, and
        ``_distribute_across_levels`` spreads that share over the knowledge
        levels. Slots still open afterwards are filled at random from the
        questions not yet chosen.

        Returns:
            At most ``target_count`` questions.
        """
        by_difficulty = defaultdict(list)
        for q in questions:
            by_difficulty[q['difficulty']].append(q)

        selected = []
        remaining_target = target_count

        for difficulty in (5, 4, 3, 2, 1):
            if remaining_target <= 0:
                break

            tier = by_difficulty.get(difficulty)
            if not tier:
                continue

            level_groups = defaultdict(list)
            for q in tier:
                level_groups[q['knowledge_level']].append(q)

            # The share is always <= remaining_target, so no extra clamp is needed.
            share = 0.4 if difficulty >= 4 else 0.3
            picked = self._distribute_across_levels(
                level_groups, int(remaining_target * share)
            )

            selected.extend(picked)
            remaining_target -= len(picked)

        # Top up at random when the tiered pass did not reach the target.
        shortfall = target_count - len(selected)
        if shortfall > 0:
            # Track picks by identity: dicts are unhashable, and an equality
            # scan over the selected list would be quadratic.
            chosen_ids = {id(q) for q in selected}
            leftovers = [q for q in questions if id(q) not in chosen_ids]
            if leftovers:
                selected.extend(
                    random.sample(leftovers, min(shortfall, len(leftovers)))
                )

        return selected[:target_count]

    def _distribute_across_levels(self, level_groups: Dict,
                                  target_count: int) -> List[Dict]:
        """Spread ``target_count`` picks over knowledge levels, highest first.

        Each level, in priority order, receives a fraction of the slots still
        open (at least 1, at most the number of questions it has). Levels not
        listed below are ignored.

        Args:
            level_groups: Mapping of knowledge level to candidate questions.
            target_count: Maximum number of questions to return.

        Returns:
            Randomly chosen questions, at most ``target_count`` of them.
        """
        if not level_groups or target_count <= 0:
            return []

        # Priority order (dicts keep insertion order) with each level's share
        # of the slots that are still open when it is reached.
        level_shares = {
            'advanced_synthesis': 0.4,
            'complex_analysis': 0.4,
            'medium_application': 0.3,
            'simple_application': 0.2,
            'basic_concepts': 0.2,
        }

        selected = []
        remaining_target = target_count

        for level, share in level_shares.items():
            if level not in level_groups or remaining_target <= 0:
                continue

            candidates = level_groups[level]
            quota = min(len(candidates), max(1, int(remaining_target * share)))
            remaining_target -= quota

            if quota > 0:
                selected.extend(random.sample(candidates, quota))

        return selected

    def _balance_within_types(self, selected_by_type: Dict,
                              target_count: int) -> List[Dict]:
        """Merge the per-type selections and trim to ``target_count``.

        When the merged list is too long, the questions with the highest
        ``_calculate_quality_score`` are kept (ties keep their merge order).
        """
        all_selected = []
        for questions in selected_by_type.values():
            all_selected.extend(questions)

        if len(all_selected) > target_count:
            ranked = sorted(all_selected, key=self._calculate_quality_score,
                            reverse=True)
            all_selected = ranked[:target_count]

        return all_selected

    def _calculate_quality_score(self, question: Dict) -> float:
        """Compute a heuristic quality score; higher is better.

        The score is the sum of: difficulty * 2, a knowledge-level weight
        (1-5), a question-type weight (1-2) and a length bonus of +1 for
        questions longer than 100 characters or +2 for those longer than 200.
        """
        level_weights = {
            'basic_concepts': 1.0,
            'simple_application': 2.0,
            'medium_application': 3.0,
            'complex_analysis': 4.0,
            'advanced_synthesis': 5.0,
        }
        # Short-answer and calculation questions are considered more valuable.
        type_weights = {
            'short_answer': 2.0,
            'calculation': 2.0,
            'multiple_choice': 1.5,
            'true_false': 1.0,
        }

        score = question.get('difficulty', 1) * 2.0
        score += level_weights.get(question.get('knowledge_level', ''), 1.0)
        score += type_weights.get(question.get('question_type', ''), 1.0)

        # Test the larger threshold first; checking "> 100" first would make
        # the "> 200" branch unreachable.
        question_length = len(question.get('question', ''))
        if question_length > 200:
            score += 2.0
        elif question_length > 100:
            score += 1.0

        return score

    def analyze_selection_results(self, original_questions: List[Dict],
                                  selected_questions: List[Dict]) -> Dict:
        """Compare the selected set with the original pool.

        Args:
            original_questions: The full, unfiltered pool.
            selected_questions: The final selection.

        Returns:
            A dict with ``original`` and ``selected`` distributions (``total``,
            ``by_type``, ``by_level``, ``by_difficulty``, their ``*_pct``
            percentage variants and ``avg_difficulty``), ``selection_ratio``
            and ``difficulty_improvement``. Empty inputs produce zeroed
            distributions instead of raising.
        """
        def get_distribution(questions):
            total = len(questions)
            dist = {
                'total': total,
                'by_type': Counter(q.get('question_type', '') for q in questions),
                'by_level': Counter(q.get('knowledge_level', '') for q in questions),
                'by_difficulty': Counter(q.get('difficulty', 0) for q in questions),
            }

            # The original pool is unvalidated, so skip non-numeric values
            # (e.g. null) rather than letting np.mean raise.
            numeric = [
                d for d in (q.get('difficulty', 0) for q in questions)
                if isinstance(d, (int, float))
            ]
            dist['avg_difficulty'] = float(np.mean(numeric)) if numeric else 0.0

            # Percentages; the counters are empty when total == 0, so there
            # is no division by zero.
            for key in ('by_type', 'by_level', 'by_difficulty'):
                dist[key + '_pct'] = {
                    k: v / total * 100 for k, v in dist[key].items()
                }

            return dist

        original_dist = get_distribution(original_questions)
        selected_dist = get_distribution(selected_questions)

        if original_questions:
            selection_ratio = len(selected_questions) / len(original_questions)
        else:
            selection_ratio = 0

        return {
            'original': original_dist,
            'selected': selected_dist,
            'selection_ratio': selection_ratio,
            'difficulty_improvement': (selected_dist['avg_difficulty']
                                       - original_dist['avg_difficulty']),
        }

    def print_selection_report(self, analysis_results: Dict):
        """Print a report from the output of ``analyze_selection_results``."""
        print("\n" + "="*60)
        print("题目筛选结果报告")
        print("="*60)

        original = analysis_results['original']
        selected = analysis_results['selected']

        print(f"\n📊 基本统计:")
        print(f"原始题目数: {original['total']}")
        print(f"筛选后题目数: {selected['total']}")
        print(f"筛选比例: {analysis_results['selection_ratio']:.1%}")
        print(f"平均难度提升: {analysis_results['difficulty_improvement']:.2f}")

        print(f"\n📈 题型分布对比:")
        print(f"{'题型':<15} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 50)

        for qtype in ['calculation', 'short_answer', 'true_false', 'multiple_choice']:
            orig_pct = original['by_type_pct'].get(qtype, 0)
            sel_pct = selected['by_type_pct'].get(qtype, 0)
            change = sel_pct - orig_pct

            print(f"{qtype:<15} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")

        print(f"\n🎯 难度分布对比:")
        print(f"{'难度':<8} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 40)

        for diff in range(1, 6):
            orig_pct = original['by_difficulty_pct'].get(diff, 0)
            sel_pct = selected['by_difficulty_pct'].get(diff, 0)
            change = sel_pct - orig_pct

            print(f"难度{diff:<3} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")


def main_filter_questions(input_file=None, output_file=None,
                          analysis_file=None, target_count=2000):
    """Main entry point: filter the classified questions and save the result.

    Loads the classified questions, applies quality filtering and sampling,
    writes the selection as JSON, prints a report and, when pandas/openpyxl
    can write Excel files, exports a detailed comparison workbook.

    Args:
        input_file: Path of the classified-question JSON file. Defaults to
            the project's step-4 classified file.
        output_file: Path for the selected-question JSON output.
        analysis_file: Path for the Excel analysis report.
        target_count: Desired number of questions in the evaluation set.
    """
    # The original hard-coded locations remain the defaults.
    if input_file is None:
        input_file = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_enhanced_classified_questions.json"
    if output_file is None:
        output_file = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_filtered_high_quality_questions.json"
    if analysis_file is None:
        analysis_file = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_selection_analysis.xlsx"

    # Load the data.
    print("正在加载已分类的题目...")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_questions = json.load(f)

    print(f"加载了 {len(all_questions)} 道题目")

    selector = QuestionFilterAndSelector()

    # Step 1: quality filtering.
    print("\n第一步:按质量标准过滤题目...")
    filtered_questions = selector.filter_questions_by_quality(all_questions)
    print(f"质量过滤后剩余: {len(filtered_questions)} 道题目")

    # Step 2: sampling.
    print("\n第二步:智能抽样构建评测集...")
    selected_questions = selector.smart_sample_questions(filtered_questions, target_count)
    print(f"最终选择: {len(selected_questions)} 道题目")

    # Save the selection.
    print(f"\n保存筛选结果到: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_questions, f, ensure_ascii=False, indent=2)

    # Analyse and report.
    print("\n分析筛选结果...")
    analysis_results = selector.analyze_selection_results(all_questions, selected_questions)
    selector.print_selection_report(analysis_results)

    def _percent_share(frame, column):
        """Percentage share of each value in *column*; empty if it is absent."""
        # An empty selection yields a DataFrame without columns, and the raw
        # pool may lack a field entirely, so guard the column lookup.
        if column not in frame.columns:
            return pd.Series(dtype=float)
        return frame[column].value_counts(normalize=True) * 100

    # Export the detailed analysis workbook.
    try:
        df_original = pd.DataFrame(all_questions)
        df_selected = pd.DataFrame(selected_questions)

        with pd.ExcelWriter(analysis_file, engine='openpyxl') as writer:
            df_selected.to_excel(writer, sheet_name='筛选结果', index=False)

            comparison_data = []
            for metric in ['question_type', 'knowledge_level', 'difficulty']:
                orig_dist = _percent_share(df_original, metric)
                sel_dist = _percent_share(df_selected, metric)

                # Sorted by text form so the row order is deterministic.
                categories = sorted(set(orig_dist.index) | set(sel_dist.index), key=str)
                for category in categories:
                    orig_pct = orig_dist.get(category, 0)
                    sel_pct = sel_dist.get(category, 0)
                    comparison_data.append({
                        '维度': metric,
                        '类别': category,
                        '原始占比': orig_pct,
                        '筛选后占比': sel_pct,
                        '变化': sel_pct - orig_pct
                    })

            pd.DataFrame(comparison_data).to_excel(writer, sheet_name='分布对比', index=False)

        print(f"详细分析已保存到: {analysis_file}")

    except ImportError:
        # Raised when the optional Excel engine (openpyxl) is not installed.
        print("提示: 安装pandas和openpyxl可生成详细分析报告")

    print(f"\n✅ 筛选完成!")
    print(f"🎯 最终评测集: {len(selected_questions)} 道高质量题目")
    print(f"📈 平均难度提升: {analysis_results['difficulty_improvement']:.2f}")
    print(f"💾 结果文件: {output_file}")


if __name__ == "__main__":
    main_filter_questions()