调整分类代码和质量筛选代码
This commit is contained in:
54974
layer2/PGEE/code/step4.1_enhanced_classified_questions.json
Normal file
54974
layer2/PGEE/code/step4.1_enhanced_classified_questions.json
Normal file
File diff suppressed because one or more lines are too long
54974
layer2/PGEE/code/step4_enhanced_classified_questions.json
Normal file
54974
layer2/PGEE/code/step4_enhanced_classified_questions.json
Normal file
File diff suppressed because one or more lines are too long
@@ -788,26 +788,23 @@ class EnhancedQuestionClassifier:
|
||||
f"等级{level}: {desc}" for level, desc in criteria.items()
|
||||
])
|
||||
|
||||
prompt = f"""请为以下题目在同题型内评估难度等级。
|
||||
prompt = f"""请为以下题目在选择题型内评估难度等级。
|
||||
|
||||
题目:{question}
|
||||
答案:{answer}
|
||||
题型:{type_info['name']} - {type_info['description']}
|
||||
正确选项:{answer}
|
||||
知识层次:{level_info['name']} - {level_info['description']}
|
||||
|
||||
在该题型和知识层次下的难度等级标准:
|
||||
在选择题型和不同知识层次下的难度等级标准:
|
||||
{criteria_desc}
|
||||
|
||||
重要说明:
|
||||
- 难度评估必须在相同题型内进行比较
|
||||
- 不同题型有不同的固有难度,需要排除题型本身的影响
|
||||
- 重点关注在该题型框架内的相对难度
|
||||
- 难度评估必须在选择题型内进行比较(仅给定正确选项)
|
||||
|
||||
评估考虑因素:
|
||||
- 同类题型中的知识点掌握深度要求
|
||||
- 同类题型中的解题步骤复杂程度
|
||||
- 在该题型内的相对难度水平
|
||||
- 对该题型能力的具体要求
|
||||
- 选择题型中的知识点掌握深度要求
|
||||
- 选择题型中的解题步骤复杂程度
|
||||
- 选择题型内的相对难度水平
|
||||
- 选择题型能力的具体要求
|
||||
|
||||
请严格按照以下格式返回:
|
||||
难度:[等级数字]
|
||||
@@ -1506,12 +1503,12 @@ def advanced_main():
|
||||
questions = classifier.load_questions_from_json(INPUT_FILE)
|
||||
import random
|
||||
random.shuffle(questions) # 打乱题目顺序
|
||||
questions = questions[:100] # 测试时可以先处理一小部分
|
||||
# questions = questions[:100] # 测试时可以先处理一小部分
|
||||
|
||||
print("开始三阶段分类处理...")
|
||||
classified_questions = classifier.classify_questions_batch(
|
||||
questions=questions,
|
||||
max_workers=10,
|
||||
max_workers=20,
|
||||
save_interval=10,
|
||||
output_file=OUTPUT_FILE
|
||||
)
|
||||
|
||||
File diff suppressed because one or more lines are too long
@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
|
||||
def __init__(self):
|
||||
# 各题型的最低难度阈值(排除过于简单的题目)
|
||||
self.min_difficulty_thresholds = {
|
||||
"calculation": 2, # 计算题:排除难度1的基础计算
|
||||
"short_answer": 2, # 简答题:排除难度1的简单记忆
|
||||
"true_false": 2, # 判断题:排除难度1的基础概念判断
|
||||
"multiple_choice": 2 # 选择题:排除难度1的简单选择
|
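||||
"calculation": 1, # Calculation: minimum difficulty 1 (difficulty-1 questions are not excluded by this threshold)
|
||||
"short_answer": 1, # Short answer: minimum difficulty 1 (difficulty-1 questions are not excluded by this threshold)
|
||||
"true_false": 1, # True/false: minimum difficulty 1 (difficulty-1 questions are not excluded by this threshold)
|
||||
"multiple_choice": 1 # Multiple choice: minimum difficulty 1 (difficulty-1 questions are not excluded by this threshold)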
|
||||
}
|
||||
|
||||
# 各知识层次的最低难度要求
|
||||
self.knowledge_level_min_difficulty = {
|
||||
"basic_concepts": 2, # 基础概念至少难度2(需要理解,不只是记忆)
|
||||
"simple_application": 2, # 简单应用至少难度2
|
||||
"basic_concepts": 1, # Basic concepts: minimum difficulty 1
|
||||
"simple_application": 1, # Simple application: minimum difficulty 1
|
||||
"medium_application": 2, # 中等应用至少难度2
|
||||
"complex_analysis": 3, # 复杂分析至少难度3
|
||||
"advanced_synthesis": 4 # 高级综合至少难度4
|
||||
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
|
||||
"advanced_synthesis": 0.05 # 5% 高级综合
|
||||
},
|
||||
"difficulty_levels": {
|
||||
1: 0.05, # 5% 难度1(仅保留最有价值的)
|
||||
2: 0.25, # 25% 难度2
|
||||
1: 0.15, # 15% difficulty 1 (keep only the most valuable ones)
|
||||
2: 0.15, # 15% difficulty 2
|
||||
3: 0.35, # 35% 难度3
|
||||
4: 0.25, # 25% 难度4
|
||||
5: 0.10 # 10% 难度5
|
||||
}
|
||||
}
|
||||
|
||||
def clean_text_for_excel(self, text: object) -> object:
    """Make a single cell value safe to write into an Excel worksheet.

    Args:
        text: The value destined for one cell. Usually a string, but the
            caller passes every field of a question record, so numbers,
            ``None`` and containers can arrive here too.

    Returns:
        ``None`` and numeric scalars (``int``, ``float``, ``bool``) are
        returned unchanged so numeric columns stay numeric and remain
        comparable with data that was never cleaned. Any other value is
        converted to ``str`` if necessary, has its control characters
        (code points below 32 other than tab, newline and carriage
        return) replaced by spaces, and is truncated to 32760 characters
        plus ``"..."`` when it exceeds the 32767-character cell limit.
    """
    # bool is a subclass of int, so it is covered by this check as well.
    if text is None or isinstance(text, (int, float)):
        return text

    # Lists, dicts and other objects cannot be stored in a cell directly;
    # stringify them and then apply the same cleaning as real strings.
    if not isinstance(text, str):
        text = str(text)

    # Map every control character except \t, \n and \r to a space.
    control_to_space = {
        code: " " for code in range(32) if chr(code) not in "\t\n\r"
    }
    cleaned = text.translate(control_to_space)

    # Keep the result within the Excel per-cell character limit.
    if len(cleaned) > 32767:
        cleaned = cleaned[:32760] + "..."

    return cleaned
|
||||
|
||||
def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
|
||||
"""第一步:按质量标准过滤题目"""
|
||||
filtered_questions = []
|
||||
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
|
||||
if difficulty < min_level_difficulty:
|
||||
continue
|
||||
|
||||
# 特殊过滤规则
|
||||
if self._should_exclude_question(q):
|
||||
continue
|
||||
# # 特殊过滤规则
|
||||
# if self._should_exclude_question(q):
|
||||
# continue
|
||||
|
||||
filtered_questions.append(q)
|
||||
|
||||
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
|
||||
if len(q['question'].strip()) < 20: # 题目太短
|
||||
return False
|
||||
|
||||
if len(q['answer'].strip()) < 5: # 答案太短
|
||||
return False
|
||||
# if len(q['answer'].strip()) < 5: # 答案太短
|
||||
# return False
|
||||
|
||||
return True
|
||||
|
||||
@@ -469,14 +484,25 @@ def main_filter_questions():
|
||||
|
||||
# 导出详细分析
|
||||
try:
|
||||
df_original = pd.DataFrame(all_questions)
|
||||
df_selected = pd.DataFrame(selected_questions)
|
||||
# 清理数据中的特殊字符
|
||||
cleaned_selected = []
|
||||
for q in selected_questions:
|
||||
cleaned_q = {}
|
||||
for key, value in q.items():
|
||||
cleaned_q[key] = selector.clean_text_for_excel(value)
|
||||
cleaned_selected.append(cleaned_q)
|
||||
|
||||
df_selected = pd.DataFrame(cleaned_selected)
|
||||
|
||||
with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
|
||||
df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
|
||||
# df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
|
||||
# 只保存关键字段到Excel,避免长文本问题
|
||||
summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
|
||||
summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
|
||||
|
||||
# 统计对比
|
||||
comparison_data = []
|
||||
df_original = pd.DataFrame(all_questions)
|
||||
for metric in ['question_type', 'knowledge_level', 'difficulty']:
|
||||
orig_dist = df_original[metric].value_counts(normalize=True) * 100
|
||||
sel_dist = df_selected[metric].value_counts(normalize=True) * 100
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user