# MatBench/layer2/PGEE/code/step4_high_quality.py

import json
import pandas as pd
from typing import List, Dict, Any, Tuple
from collections import defaultdict, Counter
import numpy as np

class QuestionFilterAndSelector:
    """Filter and sample classified questions to build a high-quality evaluation set.

    Typical flow: ``filter_questions_by_quality`` drops invalid or too-easy
    questions, ``smart_sample_questions`` draws a sample stratified by question
    type that favours harder / higher-level questions, and
    ``analyze_selection_results`` / ``print_selection_report`` summarise the
    outcome.

    Every question is a dict that carries at least the keys ``question``,
    ``answer``, ``question_type``, ``knowledge_level`` and ``difficulty``.
    """

    def __init__(self, seed=None):
        """Set up thresholds and the target distribution.

        Args:
            seed: Optional seed for the sampling steps. When given, a private
                ``random.Random(seed)`` is used so repeated runs select the
                same questions. When ``None`` (default) the module-level
                ``random`` functions are used, exactly as before.
        """
        import random  # local import: ``random`` is not imported at file level

        self._rng = random.Random(seed) if seed is not None else None

        # Minimum difficulty per question type (drops overly easy questions).
        self.min_difficulty_thresholds = {
            "calculation": 2,      # calculation: drop difficulty-1 basic arithmetic
            "short_answer": 2,     # short answer: drop difficulty-1 pure recall
            "true_false": 2,       # true/false: drop difficulty-1 basic concept checks
            "multiple_choice": 2   # multiple choice: drop difficulty-1 trivial choices
        }

        # Minimum difficulty per knowledge level.
        self.knowledge_level_min_difficulty = {
            "basic_concepts": 2,      # needs understanding, not only memorisation
            "simple_application": 2,
            "medium_application": 2,
            "complex_analysis": 3,
            "advanced_synthesis": 4
        }

        # Target composition of the final set, expressed as fractions.
        # NOTE(review): only "question_types" drives the sampling below; the
        # "knowledge_levels" and "difficulty_levels" targets are computed by
        # _calculate_target_counts but never consumed inside this class.
        self.target_distribution = {
            "question_types": {
                "calculation": 0.25,
                "short_answer": 0.45,
                "true_false": 0.15,
                "multiple_choice": 0.15
            },
            "knowledge_levels": {
                "basic_concepts": 0.20,      # still subject to difficulty >= 2
                "simple_application": 0.25,
                "medium_application": 0.30,
                "complex_analysis": 0.20,
                "advanced_synthesis": 0.05
            },
            "difficulty_levels": {
                1: 0.05,
                2: 0.25,
                3: 0.35,
                4: 0.25,
                5: 0.10
            }
        }

    def _sample(self, population: List[Dict], k: int) -> List[Dict]:
        """Draw ``k`` distinct items, using the seeded RNG when one was configured."""
        import random

        # getattr keeps this working for instances built without __init__.
        rng = getattr(self, '_rng', None)
        if rng is None:
            rng = random
        return rng.sample(population, k)

    def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
        """Step 1: keep only questions that pass every quality rule.

        A question is kept when it is structurally valid, meets the minimum
        difficulty for its question type and for its knowledge level, and does
        not match any special exclusion rule. Input order is preserved.
        """
        filtered_questions = []

        for q in questions:
            # Basic structural checks (required fields, minimum lengths).
            if not self._is_valid_question(q):
                continue

            question_type = q.get('question_type', '')
            knowledge_level = q.get('knowledge_level', '')
            difficulty = q.get('difficulty', 0)

            # Unknown types / levels fall back to a minimum of 1.
            min_type_difficulty = self.min_difficulty_thresholds.get(question_type, 1)
            if difficulty < min_type_difficulty:
                continue

            min_level_difficulty = self.knowledge_level_min_difficulty.get(knowledge_level, 1)
            if difficulty < min_level_difficulty:
                continue

            if self._should_exclude_question(q):
                continue

            filtered_questions.append(q)

        return filtered_questions

    def _is_valid_question(self, q: Dict) -> bool:
        """Return True when the question has all required fields and sane lengths."""
        required_fields = ['question', 'answer', 'question_type', 'knowledge_level', 'difficulty']

        # Any falsy value (missing key, None, "", 0, False) counts as missing.
        if any(not q.get(field) for field in required_fields):
            return False

        # A non-string question text cannot be length-checked; treat as invalid
        # instead of crashing on ``.strip()``.
        question_text = q['question']
        if not isinstance(question_text, str) or len(question_text.strip()) < 20:
            return False

        # Answers may arrive as numbers/booleans from JSON; measure their text form.
        # NOTE(review): the 5-character minimum also rejects bare answers such as
        # "A" or "True" — confirm that answers are expected to carry explanations.
        answer = q['answer']
        answer_text = answer if isinstance(answer, str) else str(answer)
        if len(answer_text.strip()) < 5:
            return False

        return True

    def _should_exclude_question(self, q: Dict) -> bool:
        """Return True for specific low-value (type, level, difficulty) combinations."""
        question_type = q.get('question_type', '')
        knowledge_level = q.get('knowledge_level', '')
        difficulty = q.get('difficulty', 0)

        is_basic = knowledge_level == 'basic_concepts'

        exclude_combinations = [
            # basic concepts at difficulty 1: pure memorisation
            (is_basic and difficulty == 1),

            # multiple choice on basic concepts at low difficulty
            (question_type == 'multiple_choice' and is_basic and difficulty <= 2),

            # true/false on basic concepts at difficulty 1
            # (already covered by the first rule; kept to document intent)
            (question_type == 'true_false' and is_basic and difficulty == 1),
        ]

        return any(exclude_combinations)

    def smart_sample_questions(self, filtered_questions: List[Dict],
                             target_count: int = 2000) -> List[Dict]:
        """Step 2: draw up to ``target_count`` questions, stratified by question type.

        The result can be smaller than ``target_count`` when some question type
        has fewer available questions than its quota; the shortfall is not
        redistributed to other types.
        """
        grouped_questions = self._group_questions_by_categories(filtered_questions)
        target_counts = self._calculate_target_counts(target_count)

        # 1. Per-type stratified sampling.
        selected_by_type = self._stratified_sample_by_type(
            grouped_questions, target_counts, target_count
        )

        # 2. Merge the per-type picks and trim to the overall target if needed.
        return self._balance_within_types(selected_by_type, target_count)

    def _group_questions_by_categories(self, questions: List[Dict]) -> Dict:
        """Index questions by type, level, difficulty, and (type, level/difficulty)."""
        grouped = {
            'by_type': defaultdict(list),
            'by_level': defaultdict(list),
            'by_difficulty': defaultdict(list),
            'by_type_level': defaultdict(lambda: defaultdict(list)),
            'by_type_difficulty': defaultdict(lambda: defaultdict(list))
        }

        for q in questions:
            qtype = q['question_type']
            level = q['knowledge_level']
            difficulty = q['difficulty']

            grouped['by_type'][qtype].append(q)
            grouped['by_level'][level].append(q)
            grouped['by_difficulty'][difficulty].append(q)
            grouped['by_type_level'][qtype][level].append(q)
            grouped['by_type_difficulty'][qtype][difficulty].append(q)

        return grouped

    @staticmethod
    def _apportion(total: int, ratios: Dict) -> Dict:
        """Split ``total`` into integer quotas proportional to ``ratios``.

        Uses the largest-remainder method so the quotas add up to
        ``round(total * sum(ratios))`` instead of losing items to truncation.
        Ties are broken in the iteration order of ``ratios``.
        """
        if total <= 0:
            return {key: 0 for key in ratios}

        exact = {key: total * ratio for key, ratio in ratios.items()}
        # The small epsilon guards against products such as 299.99999999999994.
        quotas = {key: int(value + 1e-9) for key, value in exact.items()}

        expected_total = int(round(total * sum(ratios.values())))
        leftover = expected_total - sum(quotas.values())
        if leftover > 0:
            # sorted() is stable, so equal remainders keep the dict order.
            by_remainder = sorted(exact, key=lambda key: exact[key] - quotas[key], reverse=True)
            for key in by_remainder[:leftover]:
                quotas[key] += 1

        return quotas

    def _calculate_target_counts(self, total_target: int) -> Dict:
        """Turn the fractional target distribution into absolute counts.

        Returns a dict with the keys ``by_type``, ``by_level`` and
        ``by_difficulty``; each maps a category to its integer quota.
        """
        return {
            'by_type': self._apportion(total_target, self.target_distribution['question_types']),
            'by_level': self._apportion(total_target, self.target_distribution['knowledge_levels']),
            'by_difficulty': self._apportion(total_target, self.target_distribution['difficulty_levels'])
        }

    def _stratified_sample_by_type(self, grouped_questions: Dict,
                                 target_counts: Dict, total_target: int) -> Dict:
        """Pick questions for each question type according to its quota.

        ``total_target`` is accepted for interface compatibility and is not used.
        """
        selected_by_type = {}

        for qtype, target_count in target_counts['by_type'].items():
            available_questions = grouped_questions['by_type'].get(qtype, [])

            if len(available_questions) <= target_count:
                # Not enough questions to be selective: take them all.
                selected_by_type[qtype] = available_questions
            else:
                selected_by_type[qtype] = self._smart_sample_within_type(
                    available_questions, target_count
                )

        return selected_by_type

    def _smart_sample_within_type(self, questions: List[Dict], target_count: int) -> List[Dict]:
        """Sample ``target_count`` questions of one type, favouring high difficulty.

        Difficulties are visited from 5 down to 1. Each takes a share of the
        *remaining* quota (40% for difficulty >= 4, otherwise 30%), spread over
        knowledge levels. Any shortfall is topped up by a random draw from the
        questions not yet chosen.
        """
        by_difficulty = defaultdict(list)
        for q in questions:
            by_difficulty[q['difficulty']].append(q)

        selected = []
        remaining_target = target_count

        for difficulty in (5, 4, 3, 2, 1):
            if remaining_target <= 0:
                break

            diff_questions = by_difficulty[difficulty]
            if not diff_questions:
                continue

            # Sub-group this difficulty band by knowledge level.
            level_groups = defaultdict(list)
            for q in diff_questions:
                level_groups[q['knowledge_level']].append(q)

            share = 0.4 if difficulty >= 4 else 0.3
            target_for_this_diff = min(remaining_target, int(remaining_target * share))

            selected_from_diff = self._distribute_across_levels(
                level_groups, target_for_this_diff
            )

            selected.extend(selected_from_diff)
            remaining_target -= len(selected_from_diff)

        # Top up at random when the priority passes did not reach the quota.
        if len(selected) < target_count:
            # Identity-based membership: dicts are unhashable, and comparing
            # every dict by value against the selected list is quadratic.
            chosen_ids = {id(q) for q in selected}
            remaining_questions = [q for q in questions if id(q) not in chosen_ids]
            additional_needed = target_count - len(selected)

            if remaining_questions:
                selected.extend(self._sample(
                    remaining_questions,
                    min(additional_needed, len(remaining_questions))
                ))

        return selected[:target_count]

    def _distribute_across_levels(self, level_groups: Dict, target_count: int) -> List[Dict]:
        """Pick up to ``target_count`` questions, favouring higher knowledge levels.

        Levels are visited from most to least advanced. Each receives a share of
        the remaining quota (at least 1, at most what is available). Questions
        whose level is not one of the five known levels are never picked here.
        """
        if not level_groups or target_count <= 0:
            return []

        # Highest level first.
        level_priorities = [
            'advanced_synthesis',
            'complex_analysis',
            'medium_application',
            'simple_application',
            'basic_concepts'
        ]
        # Share of the remaining quota per level; unlisted levels get 0.2.
        level_shares = {
            'advanced_synthesis': 0.4,
            'complex_analysis': 0.4,
            'medium_application': 0.3
        }

        selected = []
        remaining_target = target_count

        for level in level_priorities:
            if level not in level_groups or remaining_target <= 0:
                continue

            candidates = level_groups[level]
            share = level_shares.get(level, 0.2)
            quota = min(len(candidates), max(1, int(remaining_target * share)))
            remaining_target -= quota

            if quota > 0:
                selected.extend(self._sample(candidates, quota))

        return selected

    def _balance_within_types(self, selected_by_type: Dict, target_count: int) -> List[Dict]:
        """Merge per-type selections and trim to ``target_count`` by quality score."""
        all_selected = []
        for questions in selected_by_type.values():
            all_selected.extend(questions)

        # Only trim when over the target; the stable sort keeps the original
        # order among equally scored questions.
        if len(all_selected) > target_count:
            ranked = sorted(all_selected, key=self._calculate_quality_score, reverse=True)
            all_selected = ranked[:target_count]

        return all_selected

    def _calculate_quality_score(self, question: Dict) -> float:
        """Score a question; higher means more valuable for the evaluation set.

        Score = 2 * difficulty + knowledge-level weight + question-type weight
        + a length bonus (+1.0 above 100 characters, +2.0 above 200).
        """
        score = 0.0

        # Difficulty weight.
        difficulty = question.get('difficulty', 1)
        score += difficulty * 2.0

        # Knowledge-level weight.
        level_weights = {
            'basic_concepts': 1.0,
            'simple_application': 2.0,
            'medium_application': 3.0,
            'complex_analysis': 4.0,
            'advanced_synthesis': 5.0
        }
        score += level_weights.get(question.get('knowledge_level', ''), 1.0)

        # Question-type weight (short answer and calculation are valued most).
        type_weights = {
            'short_answer': 2.0,
            'calculation': 2.0,
            'multiple_choice': 1.5,
            'true_false': 1.0
        }
        score += type_weights.get(question.get('question_type', ''), 1.0)

        # Length bonus: test the larger threshold first, otherwise the
        # "> 200" branch can never be reached.
        question_length = len(question.get('question', ''))
        if question_length > 200:
            score += 2.0
        elif question_length > 100:
            score += 1.0

        return score

    def analyze_selection_results(self, original_questions: List[Dict],
                                selected_questions: List[Dict]) -> Dict:
        """Compare the distributions of the original and the selected questions.

        Returns a dict with ``original`` and ``selected`` distributions (counts
        and percentages by type, level and difficulty, plus ``total`` and
        ``avg_difficulty``), the ``selection_ratio`` and the
        ``difficulty_improvement`` (selected average minus original average).
        Empty inputs yield zeroed distributions rather than raising.
        """
        def get_distribution(questions):
            total = len(questions)
            difficulties = [q.get('difficulty', 0) for q in questions]

            dist = {
                'total': total,
                'by_type': Counter(q.get('question_type', '') for q in questions),
                'by_level': Counter(q.get('knowledge_level', '') for q in questions),
                'by_difficulty': Counter(difficulties),
                # Guard the division so an empty list reports 0.0.
                'avg_difficulty': sum(difficulties) / total if total else 0.0
            }

            # Percentage view of each counter (empty when there are no questions).
            for key in ['by_type', 'by_level', 'by_difficulty']:
                dist[key + '_pct'] = {
                    k: v / total * 100 for k, v in dist[key].items()
                }

            return dist

        original_dist = get_distribution(original_questions)
        selected_dist = get_distribution(selected_questions)

        return {
            'original': original_dist,
            'selected': selected_dist,
            'selection_ratio': len(selected_questions) / len(original_questions) if original_questions else 0,
            'difficulty_improvement': selected_dist['avg_difficulty'] - original_dist['avg_difficulty']
        }

    def print_selection_report(self, analysis_results: Dict):
        """Print a human-readable report from ``analyze_selection_results`` output."""
        print("\n" + "="*60)
        print("题目筛选结果报告")
        print("="*60)

        original = analysis_results['original']
        selected = analysis_results['selected']

        print("\n📊 基本统计:")
        print(f"原始题目数: {original['total']}")
        print(f"筛选后题目数: {selected['total']}")
        print(f"筛选比例: {analysis_results['selection_ratio']:.1%}")
        print(f"平均难度提升: {analysis_results['difficulty_improvement']:.2f}")

        print("\n📈 题型分布对比:")
        print(f"{'题型':<15} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 50)

        for qtype in ['calculation', 'short_answer', 'true_false', 'multiple_choice']:
            orig_pct = original['by_type_pct'].get(qtype, 0)
            sel_pct = selected['by_type_pct'].get(qtype, 0)
            change = sel_pct - orig_pct

            print(f"{qtype:<15} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")

        print("\n🎯 难度分布对比:")
        print(f"{'难度':<8} {'原始':<10} {'筛选后':<10} {'变化':<10}")
        print("-" * 40)

        for diff in range(1, 6):
            orig_pct = original['by_difficulty_pct'].get(diff, 0)
            sel_pct = selected['by_difficulty_pct'].get(diff, 0)
            change = sel_pct - orig_pct

            print(f"难度{diff:<3} {orig_pct:>7.1f}% {sel_pct:>8.1f}% {change:>+7.1f}%")

def main_filter_questions(
    input_file: str = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_enhanced_classified_questions.json",
    output_file: str = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_filtered_high_quality_questions.json",
    analysis_file: str = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step4_selection_analysis.xlsx",
    target_count: int = 2000,
):
    """Entry point: filter and sample high-quality questions, then report.

    Args:
        input_file: JSON file holding the list of classified questions.
        output_file: Destination JSON file for the selected questions.
        analysis_file: Destination Excel workbook for the detailed comparison.
        target_count: Desired number of questions in the final set.

    The defaults reproduce the previously hard-coded paths and target, so a
    call without arguments behaves as before.
    """
    # Load the classified questions.
    print("正在加载已分类的题目...")
    with open(input_file, 'r', encoding='utf-8') as f:
        all_questions = json.load(f)

    print(f"加载了 {len(all_questions)} 道题目")

    selector = QuestionFilterAndSelector()

    # Step 1: quality filtering.
    print("\n第一步：按质量标准过滤题目...")
    filtered_questions = selector.filter_questions_by_quality(all_questions)
    print(f"质量过滤后剩余: {len(filtered_questions)} 道题目")

    # Step 2: stratified sampling.
    print("\n第二步：智能抽样构建评测集...")
    selected_questions = selector.smart_sample_questions(filtered_questions, target_count)
    print(f"最终选择: {len(selected_questions)} 道题目")

    # Persist the selection.
    print(f"\n保存筛选结果到: {output_file}")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_questions, f, ensure_ascii=False, indent=2)

    # Console report.
    print("\n分析筛选结果...")
    analysis_results = selector.analyze_selection_results(all_questions, selected_questions)
    selector.print_selection_report(analysis_results)

    def percentage_distribution(df, column):
        """Percentage share of each value in ``column``; empty if the column is absent."""
        # An empty selection produces a DataFrame without columns; indexing it
        # directly would raise KeyError.
        if column not in df.columns:
            return pd.Series(dtype=float)
        return df[column].value_counts(normalize=True) * 100

    # Detailed Excel export (skipped with a hint when the Excel engine is missing).
    try:
        df_original = pd.DataFrame(all_questions)
        df_selected = pd.DataFrame(selected_questions)

        with pd.ExcelWriter(analysis_file, engine='openpyxl') as writer:
            df_selected.to_excel(writer, sheet_name='筛选结果', index=False)

            # Side-by-side distribution comparison.
            comparison_data = []
            for metric in ['question_type', 'knowledge_level', 'difficulty']:
                orig_dist = percentage_distribution(df_original, metric)
                sel_dist = percentage_distribution(df_selected, metric)

                # Sort by text form so the row order is stable between runs.
                categories = sorted(set(orig_dist.index) | set(sel_dist.index), key=str)
                for category in categories:
                    orig_pct = orig_dist.get(category, 0)
                    sel_pct = sel_dist.get(category, 0)
                    comparison_data.append({
                        '维度': metric,
                        '类别': category,
                        '原始占比': orig_pct,
                        '筛选后占比': sel_pct,
                        '变化': sel_pct - orig_pct
                    })

            pd.DataFrame(comparison_data).to_excel(writer, sheet_name='分布对比', index=False)

        print(f"详细分析已保存到: {analysis_file}")

    except ImportError:
        print("提示: 安装pandas和openpyxl可生成详细分析报告")

    print("\n✅ 筛选完成！")
    print(f"🎯 最终评测集: {len(selected_questions)} 道高质量题目")
    print(f"📈 平均难度提升: {analysis_results['difficulty_improvement']:.2f}")
    print(f"💾 结果文件: {output_file}")

if __name__ == "__main__":
    main_filter_questions()