调整分类代码和质量筛选代码

2025-05-29 11:52:51 +08:00
parent 2774a4450f
commit a28774f6f0
7 changed files with 133103 additions and 404 deletions
--- a/layer2/PGEE/code/step4_high_quality.py
+++ b/layer2/PGEE/code/step4_high_quality.py
@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
    def __init__(self):
        # 各题型的最低难度阈值（排除过于简单的题目）
        self.min_difficulty_thresholds = {
-            "calculation": 2,      # 计算题：排除难度1的基础计算
-            "short_answer": 2,     # 简答题：排除难度1的简单记忆
-            "true_false": 2,       # 判断题：排除难度1的基础概念判断
-            "multiple_choice": 2   # 选择题：排除难度1的简单选择
+            "calculation": 1,      # 计算题：排除难度1的基础计算
+            "short_answer": 1,     # 简答题：排除难度1的简单记忆
+            "true_false": 1,       # 判断题：排除难度1的基础概念判断
+            "multiple_choice": 1   # 选择题：排除难度1的简单选择
        }
        
        # 各知识层次的最低难度要求
        self.knowledge_level_min_difficulty = {
-            "basic_concepts": 2,      # 基础概念至少难度2（需要理解，不只是记忆）
-            "simple_application": 2,   # 简单应用至少难度2
+            "basic_concepts": 1,      # 基础概念至少难度2（需要理解，不只是记忆）
+            "simple_application": 1,   # 简单应用至少难度2
            "medium_application": 2,   # 中等应用至少难度2
            "complex_analysis": 3,     # 复杂分析至少难度3
            "advanced_synthesis": 4    # 高级综合至少难度4
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
                "advanced_synthesis": 0.05   # 5% 高级综合
            },
            "difficulty_levels": {
-                1: 0.05,  # 5% 难度1（仅保留最有价值的）
-                2: 0.25,  # 25% 难度2
+                1: 0.15,  # 5% 难度1（仅保留最有价值的）
+                2: 0.15,  # 25% 难度2
                3: 0.35,  # 35% 难度3
                4: 0.25,  # 25% 难度4
                5: 0.10   # 10% 难度5
            }
        }
    
+    def clean_text_for_excel(self, text: str) -> str:
+        """清理文本，移除Excel不支持的字符"""
+        if not isinstance(text, str):
+            return str(text)
+        
+        # 移除或替换Excel不支持的控制字符
+        # 保留常见的可打印字符
+        cleaned = ''.join(char if ord(char) >= 32 or char in '\t\n\r' else ' ' for char in text)
+        
+        # 限制长度，避免Excel单元格过长
+        if len(cleaned) > 32767:  # Excel单元格字符限制
+            cleaned = cleaned[:32760] + "..."
+        
+        return cleaned
+
    def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
        """第一步：按质量标准过滤题目"""
        filtered_questions = []
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
            if difficulty < min_level_difficulty:
                continue
            
-            # 特殊过滤规则
-            if self._should_exclude_question(q):
-                continue
+            # # 特殊过滤规则
+            # if self._should_exclude_question(q):
+            #     continue
            
            filtered_questions.append(q)
        
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
        if len(q['question'].strip()) < 20:  # 题目太短
            return False
        
-        if len(q['answer'].strip()) < 5:  # 答案太短
-            return False
+        # if len(q['answer'].strip()) < 5:  # 答案太短
+        #     return False
        
        return True
    
@@ -469,14 +484,25 @@ def main_filter_questions():
    
    # 导出详细分析
    try:
-        df_original = pd.DataFrame(all_questions)
-        df_selected = pd.DataFrame(selected_questions)
+        # 清理数据中的特殊字符
+        cleaned_selected = []
+        for q in selected_questions:
+            cleaned_q = {}
+            for key, value in q.items():
+                cleaned_q[key] = selector.clean_text_for_excel(value)
+            cleaned_selected.append(cleaned_q)
+        
+        df_selected = pd.DataFrame(cleaned_selected)
        
        with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
-            df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
+            # df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
+            # 只保存关键字段到Excel，避免长文本问题
+            summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
+            summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
            
            # 统计对比
            comparison_data = []
+            df_original = pd.DataFrame(all_questions)
            for metric in ['question_type', 'knowledge_level', 'difficulty']:
                orig_dist = df_original[metric].value_counts(normalize=True) * 100
                sel_dist = df_selected[metric].value_counts(normalize=True) * 100