调整分类代码和质量筛选代码

This commit is contained in:
lzy
2025-05-29 11:52:51 +08:00
parent 2774a4450f
commit a28774f6f0
7 changed files with 133103 additions and 404 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -788,26 +788,23 @@ class EnhancedQuestionClassifier:
f"等级{level}: {desc}" for level, desc in criteria.items()
])
prompt = f"""请为以下题目在题型内评估难度等级。
prompt = f"""请为以下题目在选择题型内评估难度等级。
题目:{question}
答案{answer}
题型:{type_info['name']} - {type_info['description']}
正确选项{answer}
知识层次:{level_info['name']} - {level_info['description']}
题型和知识层次下的难度等级标准:
选择题型和不同知识层次下的难度等级标准:
{criteria_desc}
重要说明:
- 难度评估必须在相同题型内进行比较
- 不同题型有不同的固有难度,需要排除题型本身的影响
- 重点关注在该题型框架内的相对难度
- 难度评估必须在选择题型内进行比较(仅给定正确选项)
评估考虑因素:
- 同类题型中的知识点掌握深度要求
- 同类题型中的解题步骤复杂程度
- 在该题型内的相对难度水平
- 对该题型能力的具体要求
- 选择题型中的知识点掌握深度要求
- 选择题型中的解题步骤复杂程度
- 选择题型内的相对难度水平
- 选择题型能力的具体要求
请严格按照以下格式返回:
难度:[等级数字]
@@ -1506,12 +1503,12 @@ def advanced_main():
questions = classifier.load_questions_from_json(INPUT_FILE)
import random
random.shuffle(questions) # 打乱题目顺序
questions = questions[:100] # 测试时可以先处理一小部分
# questions = questions[:100] # 测试时可以先处理一小部分
print("开始三阶段分类处理...")
classified_questions = classifier.classify_questions_batch(
questions=questions,
max_workers=10,
max_workers=20,
save_interval=10,
output_file=OUTPUT_FILE
)

File diff suppressed because one or more lines are too long

View File

@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
def __init__(self):
# 各题型的最低难度阈值(排除过于简单的题目)
self.min_difficulty_thresholds = {
"calculation": 2, # 计算题排除难度1的基础计算
"short_answer": 2, # 简答题排除难度1的简单记忆
"true_false": 2, # 判断题排除难度1的基础概念判断
"multiple_choice": 2 # 选择题排除难度1的简单选择
"calculation": 1, # 计算题排除难度1的基础计算
"short_answer": 1, # 简答题排除难度1的简单记忆
"true_false": 1, # 判断题排除难度1的基础概念判断
"multiple_choice": 1 # 选择题排除难度1的简单选择
}
# 各知识层次的最低难度要求
self.knowledge_level_min_difficulty = {
"basic_concepts": 2, # 基础概念至少难度2需要理解不只是记忆
"simple_application": 2, # 简单应用至少难度2
"basic_concepts": 1, # 基础概念至少难度2需要理解不只是记忆
"simple_application": 1, # 简单应用至少难度2
"medium_application": 2, # 中等应用至少难度2
"complex_analysis": 3, # 复杂分析至少难度3
"advanced_synthesis": 4 # 高级综合至少难度4
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
"advanced_synthesis": 0.05 # 5% 高级综合
},
"difficulty_levels": {
1: 0.05, # 5% 难度1仅保留最有价值的
2: 0.25, # 25% 难度2
1: 0.15, # 5% 难度1仅保留最有价值的
2: 0.15, # 25% 难度2
3: 0.35, # 35% 难度3
4: 0.25, # 25% 难度4
5: 0.10 # 10% 难度5
}
}
def clean_text_for_excel(self, text: str) -> str:
    """Return *text* as a string that is safe to write into an Excel cell.

    Non-string values are converted with ``str()`` first and then sanitized
    exactly like strings, so the result is sanitized for every input (the
    previous version returned the stringified value untouched, letting
    control characters and over-long text through).

    Args:
        text: Value to sanitize. Normally a string; any other object is
            stringified before cleaning.

    Returns:
        The text with every control character below U+0020 (except tab,
        newline and carriage return) replaced by a space. When the result
        is longer than the 32767-character Excel cell limit it is cut to
        32760 characters followed by "...".
    """
    if not isinstance(text, str):
        text = str(text)

    # Translation table: each disallowed control character becomes a space.
    # Tab (9), newline (10) and carriage return (13) are kept as-is.
    control_to_space = {
        code: " " for code in range(32) if code not in (9, 10, 13)
    }
    cleaned = text.translate(control_to_space)

    # Keep the cell below the Excel per-cell character limit.
    if len(cleaned) > 32767:
        cleaned = cleaned[:32760] + "..."
    return cleaned
def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
"""第一步:按质量标准过滤题目"""
filtered_questions = []
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
if difficulty < min_level_difficulty:
continue
# 特殊过滤规则
if self._should_exclude_question(q):
continue
# # 特殊过滤规则
# if self._should_exclude_question(q):
# continue
filtered_questions.append(q)
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
if len(q['question'].strip()) < 20: # 题目太短
return False
if len(q['answer'].strip()) < 5: # 答案太短
return False
# if len(q['answer'].strip()) < 5: # 答案太短
# return False
return True
@@ -469,14 +484,25 @@ def main_filter_questions():
# 导出详细分析
try:
df_original = pd.DataFrame(all_questions)
df_selected = pd.DataFrame(selected_questions)
# 清理数据中的特殊字符
cleaned_selected = []
for q in selected_questions:
cleaned_q = {}
for key, value in q.items():
cleaned_q[key] = selector.clean_text_for_excel(value)
cleaned_selected.append(cleaned_q)
df_selected = pd.DataFrame(cleaned_selected)
with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
# df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
# 只保存关键字段到Excel避免长文本问题
summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
# 统计对比
comparison_data = []
df_original = pd.DataFrame(all_questions)
for metric in ['question_type', 'knowledge_level', 'difficulty']:
orig_dist = df_original[metric].value_counts(normalize=True) * 100
sel_dist = df_selected[metric].value_counts(normalize=True) * 100