调整分类代码和质量筛选代码

This commit is contained in:
lzy
2025-05-29 11:52:51 +08:00
parent 2774a4450f
commit a28774f6f0
7 changed files with 133103 additions and 404 deletions

View File

@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
def __init__(self):
# 各题型的最低难度阈值(排除过于简单的题目)
self.min_difficulty_thresholds = {
"calculation": 2, # 计算题排除难度1的基础计算
"short_answer": 2, # 简答题排除难度1的简单记忆
"true_false": 2, # 判断题排除难度1的基础概念判断
"multiple_choice": 2 # 选择题排除难度1的简单选择
"calculation": 1, # 计算题排除难度1的基础计算
"short_answer": 1, # 简答题排除难度1的简单记忆
"true_false": 1, # 判断题排除难度1的基础概念判断
"multiple_choice": 1 # 选择题排除难度1的简单选择
}
# 各知识层次的最低难度要求
self.knowledge_level_min_difficulty = {
"basic_concepts": 2, # 基础概念至少难度2需要理解不只是记忆
"simple_application": 2, # 简单应用至少难度2
"basic_concepts": 1, # 基础概念至少难度2需要理解不只是记忆
"simple_application": 1, # 简单应用至少难度2
"medium_application": 2, # 中等应用至少难度2
"complex_analysis": 3, # 复杂分析至少难度3
"advanced_synthesis": 4 # 高级综合至少难度4
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
"advanced_synthesis": 0.05 # 5% 高级综合
},
"difficulty_levels": {
1: 0.05, # 5% 难度1仅保留最有价值的
2: 0.25, # 25% 难度2
1: 0.15, # 15% 难度1仅保留最有价值的
2: 0.15, # 15% 难度2
3: 0.35, # 35% 难度3
4: 0.25, # 25% 难度4
5: 0.10 # 10% 难度5
}
}
def clean_text_for_excel(self, text: str) -> str:
    """Return *text* as a string that is safe to write into an Excel cell.

    Control characters below code point 32 (other than tab, newline and
    carriage return) are replaced by a single space, and the result is
    capped so it never exceeds Excel's per-cell character limit.

    Args:
        text: The value to clean. Non-string values are converted with
            ``str()`` first and then cleaned exactly like a string, so
            they are subject to the same character and length rules.

    Returns:
        The cleaned string. When the cleaned text is longer than 32767
        characters it is cut to 32760 characters followed by ``"..."``.
    """
    max_cell_chars = 32767  # Excel's limit on characters in a single cell
    kept_control_chars = '\t\n\r'

    # Stringify instead of returning early: str() of an arbitrary object can
    # itself be over-long or contain control characters.
    if not isinstance(text, str):
        text = str(text)

    cleaned = ''.join(
        ch if ord(ch) >= 32 or ch in kept_control_chars else ' '
        for ch in text
    )

    # Leave a little headroom below the limit and mark the cut with "...".
    if len(cleaned) > max_cell_chars:
        cleaned = cleaned[:32760] + "..."
    return cleaned
def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
"""第一步:按质量标准过滤题目"""
filtered_questions = []
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
if difficulty < min_level_difficulty:
continue
# 特殊过滤规则
if self._should_exclude_question(q):
continue
# # 特殊过滤规则
# if self._should_exclude_question(q):
# continue
filtered_questions.append(q)
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
if len(q['question'].strip()) < 20: # 题目太短
return False
if len(q['answer'].strip()) < 5: # 答案太短
return False
# if len(q['answer'].strip()) < 5: # 答案太短
# return False
return True
@@ -469,14 +484,25 @@ def main_filter_questions():
# 导出详细分析
try:
df_original = pd.DataFrame(all_questions)
df_selected = pd.DataFrame(selected_questions)
# 清理数据中的特殊字符
cleaned_selected = []
for q in selected_questions:
cleaned_q = {}
for key, value in q.items():
cleaned_q[key] = selector.clean_text_for_excel(value)
cleaned_selected.append(cleaned_q)
df_selected = pd.DataFrame(cleaned_selected)
with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
# df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
# 只保存关键字段到Excel避免长文本问题
summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
# 统计对比
comparison_data = []
df_original = pd.DataFrame(all_questions)
for metric in ['question_type', 'knowledge_level', 'difficulty']:
orig_dist = df_original[metric].value_counts(normalize=True) * 100
sel_dist = df_selected[metric].value_counts(normalize=True) * 100