调整分类代码和质量筛选代码

2025-05-29 11:52:51 +08:00
parent 2774a4450f
commit a28774f6f0
7 changed files with 133103 additions and 404 deletions
--- a/layer2/PGEE/code/step4.1_enhanced_classified_questions.json
+++ b/layer2/PGEE/code/step4.1_enhanced_classified_questions.json
--- a/layer2/PGEE/code/step4_enhanced_classified_questions.json
+++ b/layer2/PGEE/code/step4_enhanced_classified_questions.json
--- a/layer2/PGEE/code/step4_filter.py
+++ b/layer2/PGEE/code/step4_filter.py
@@ -788,26 +788,23 @@ class EnhancedQuestionClassifier:
            f"等级{level}: {desc}" for level, desc in criteria.items()
        ])
        
-        prompt = f"""请为以下题目在同题型内评估难度等级。
+        prompt = f"""请为以下题目在选择题型内评估难度等级。

 题目：{question}
-答案：{answer}
-题型：{type_info['name']} - {type_info['description']}
+正确选项：{answer}
 知识层次：{level_info['name']} - {level_info['description']}

-在该题型和知识层次下的难度等级标准：
+在选择题型和不同知识层次下的难度等级标准：
 {criteria_desc}

 重要说明：
- 难度评估必须在相同题型内进行比较
- 不同题型有不同的固有难度，需要排除题型本身的影响
- 重点关注在该题型框架内的相对难度
+- 难度评估必须在选择题型内进行比较（仅给定正确选项）

 评估考虑因素：
- 同类题型中的知识点掌握深度要求
- 同类题型中的解题步骤复杂程度
- 在该题型内的相对难度水平
- 对该题型能力的具体要求
+- 选择题型中的知识点掌握深度要求
+- 选择题型中的解题步骤复杂程度
+- 选择题型内的相对难度水平
+- 选择题型能力的具体要求

 请严格按照以下格式返回：
 难度：[等级数字]
@@ -1506,12 +1503,12 @@ def advanced_main():
        questions = classifier.load_questions_from_json(INPUT_FILE)
        import random
        random.shuffle(questions)  # 打乱题目顺序
-        questions = questions[:100]  # 测试时可以先处理一小部分
+        # questions = questions[:100]  # 测试时可以先处理一小部分

        print("开始三阶段分类处理...")
        classified_questions = classifier.classify_questions_batch(
            questions=questions,
-            max_workers=10,
+            max_workers=20,
            save_interval=10,
            output_file=OUTPUT_FILE
        )
--- a/layer2/PGEE/code/step4_filtered_high_quality_questions.json
+++ b/layer2/PGEE/code/step4_filtered_high_quality_questions.json
--- a/layer2/PGEE/code/step4_high_quality.py
+++ b/layer2/PGEE/code/step4_high_quality.py
@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
    def __init__(self):
        # 各题型的最低难度阈值（排除过于简单的题目）
        self.min_difficulty_thresholds = {
-            "calculation": 2,      # 计算题：排除难度1的基础计算
-            "short_answer": 2,     # 简答题：排除难度1的简单记忆
-            "true_false": 2,       # 判断题：排除难度1的基础概念判断
-            "multiple_choice": 2   # 选择题：排除难度1的简单选择
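+            "calculation": 1,      # calculation: minimum difficulty 1 (difficulty-1 items are no longer excluded)
+            "short_answer": 1,     # short answer: minimum difficulty 1 (difficulty-1 items are no longer excluded)
+            "true_false": 1,       # true/false: minimum difficulty 1 (difficulty-1 items are no longer excluded)
+            "multiple_choice": 1   # multiple choice: minimum difficulty 1 (difficulty-1 items are no longer excluded)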
        }
        
        # 各知识层次的最低难度要求
        self.knowledge_level_min_difficulty = {
-            "basic_concepts": 2,      # 基础概念至少难度2（需要理解，不只是记忆）
-            "simple_application": 2,   # 简单应用至少难度2
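+            "basic_concepts": 1,      # basic concepts: at least difficulty 1 (lowered from 2, so recall-only questions now pass)
+            "simple_application": 1,   # simple application: at least difficulty 1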
            "medium_application": 2,   # 中等应用至少难度2
            "complex_analysis": 3,     # 复杂分析至少难度3
            "advanced_synthesis": 4    # 高级综合至少难度4
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
                "advanced_synthesis": 0.05   # 5% 高级综合
            },
            "difficulty_levels": {
-                1: 0.05,  # 5% 难度1（仅保留最有价值的）
-                2: 0.25,  # 25% 难度2
+                1: 0.15,  # 15% difficulty 1 (keep only the most valuable)
+                2: 0.15,  # 15% difficulty 2
                3: 0.35,  # 35% 难度3
                4: 0.25,  # 25% 难度4
                5: 0.10   # 10% 难度5
            }
        }
    
+    def clean_text_for_excel(self, text: str) -> str:
+        """Return text with control characters replaced by spaces and its length capped for Excel."""
+        if not isinstance(text, str):
+            return str(text)  # non-str input is converted with str() and returned without cleaning
+        
+        # Replace every character whose code point is below 32 with a space,
+        # except tab, newline and carriage return, which are kept unchanged
+        cleaned = ''.join(char if ord(char) >= 32 or char in '\t\n\r' else ' ' for char in text)
+        
+        # Cap the length so the value does not exceed what an Excel cell can hold
+        if len(cleaned) > 32767:  # Excel per-cell character limit
+            cleaned = cleaned[:32760] + "..."  # 32760 kept characters plus the ellipsis = 32763
+        
+        return cleaned
+
    def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
        """第一步：按质量标准过滤题目"""
        filtered_questions = []
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
            if difficulty < min_level_difficulty:
                continue
            
-            # 特殊过滤规则
-            if self._should_exclude_question(q):
-                continue
+            # # 特殊过滤规则
+            # if self._should_exclude_question(q):
+            #     continue
            
            filtered_questions.append(q)
        
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
        if len(q['question'].strip()) < 20:  # 题目太短
            return False
        
-        if len(q['answer'].strip()) < 5:  # 答案太短
-            return False
+        # if len(q['answer'].strip()) < 5:  # 答案太短
+        #     return False
        
        return True
    
@@ -469,14 +484,25 @@ def main_filter_questions():
    
    # 导出详细分析
    try:
-        df_original = pd.DataFrame(all_questions)
-        df_selected = pd.DataFrame(selected_questions)
+        # 清理数据中的特殊字符
+        cleaned_selected = []
+        for q in selected_questions:
+            cleaned_q = {}
+            for key, value in q.items():
+                cleaned_q[key] = selector.clean_text_for_excel(value)
+            cleaned_selected.append(cleaned_q)
+        
+        df_selected = pd.DataFrame(cleaned_selected)
        
        with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
-            df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
+            # df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
+            # 只保存关键字段到Excel，避免长文本问题
+            summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
+            summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
            
            # 统计对比
            comparison_data = []
+            df_original = pd.DataFrame(all_questions)
            for metric in ['question_type', 'knowledge_level', 'difficulty']:
                orig_dist = df_original[metric].value_counts(normalize=True) * 100
                sel_dist = df_selected[metric].value_counts(normalize=True) * 100
--- a/layer2/PGEE/code/step4_question_analysis.xlsx
+++ b/layer2/PGEE/code/step4_question_analysis.xlsx
--- a/layer2/PGEE/code/step4_selection_analysis.xlsx
+++ b/layer2/PGEE/code/step4_selection_analysis.xlsx