调整分类代码和质量筛选代码
This commit is contained in:
54974
layer2/PGEE/code/step4.1_enhanced_classified_questions.json
Normal file
54974
layer2/PGEE/code/step4.1_enhanced_classified_questions.json
Normal file
File diff suppressed because one or more lines are too long
54974
layer2/PGEE/code/step4_enhanced_classified_questions.json
Normal file
54974
layer2/PGEE/code/step4_enhanced_classified_questions.json
Normal file
File diff suppressed because one or more lines are too long
@@ -788,26 +788,23 @@ class EnhancedQuestionClassifier:
|
|||||||
f"等级{level}: {desc}" for level, desc in criteria.items()
|
f"等级{level}: {desc}" for level, desc in criteria.items()
|
||||||
])
|
])
|
||||||
|
|
||||||
prompt = f"""请为以下题目在同题型内评估难度等级。
|
prompt = f"""请为以下题目在选择题型内评估难度等级。
|
||||||
|
|
||||||
题目:{question}
|
题目:{question}
|
||||||
答案:{answer}
|
正确选项:{answer}
|
||||||
题型:{type_info['name']} - {type_info['description']}
|
|
||||||
知识层次:{level_info['name']} - {level_info['description']}
|
知识层次:{level_info['name']} - {level_info['description']}
|
||||||
|
|
||||||
在该题型和知识层次下的难度等级标准:
|
在选择题型和不同知识层次下的难度等级标准:
|
||||||
{criteria_desc}
|
{criteria_desc}
|
||||||
|
|
||||||
重要说明:
|
重要说明:
|
||||||
- 难度评估必须在相同题型内进行比较
|
- 难度评估必须在选择题型内进行比较(仅给定正确选项)
|
||||||
- 不同题型有不同的固有难度,需要排除题型本身的影响
|
|
||||||
- 重点关注在该题型框架内的相对难度
|
|
||||||
|
|
||||||
评估考虑因素:
|
评估考虑因素:
|
||||||
- 同类题型中的知识点掌握深度要求
|
- 选择题型中的知识点掌握深度要求
|
||||||
- 同类题型中的解题步骤复杂程度
|
- 选择题型中的解题步骤复杂程度
|
||||||
- 在该题型内的相对难度水平
|
- 选择题型内的相对难度水平
|
||||||
- 对该题型能力的具体要求
|
- 选择题型能力的具体要求
|
||||||
|
|
||||||
请严格按照以下格式返回:
|
请严格按照以下格式返回:
|
||||||
难度:[等级数字]
|
难度:[等级数字]
|
||||||
@@ -1506,12 +1503,12 @@ def advanced_main():
|
|||||||
questions = classifier.load_questions_from_json(INPUT_FILE)
|
questions = classifier.load_questions_from_json(INPUT_FILE)
|
||||||
import random
|
import random
|
||||||
random.shuffle(questions) # 打乱题目顺序
|
random.shuffle(questions) # 打乱题目顺序
|
||||||
questions = questions[:100] # 测试时可以先处理一小部分
|
# questions = questions[:100] # 测试时可以先处理一小部分
|
||||||
|
|
||||||
print("开始三阶段分类处理...")
|
print("开始三阶段分类处理...")
|
||||||
classified_questions = classifier.classify_questions_batch(
|
classified_questions = classifier.classify_questions_batch(
|
||||||
questions=questions,
|
questions=questions,
|
||||||
max_workers=10,
|
max_workers=20,
|
||||||
save_interval=10,
|
save_interval=10,
|
||||||
output_file=OUTPUT_FILE
|
output_file=OUTPUT_FILE
|
||||||
)
|
)
|
||||||
|
|||||||
File diff suppressed because one or more lines are too long
@@ -10,16 +10,16 @@ class QuestionFilterAndSelector:
|
|||||||
def __init__(self):
|
def __init__(self):
|
||||||
# 各题型的最低难度阈值(排除过于简单的题目)
|
# 各题型的最低难度阈值(排除过于简单的题目)
|
||||||
self.min_difficulty_thresholds = {
|
self.min_difficulty_thresholds = {
|
||||||
"calculation": 2, # 计算题:排除难度1的基础计算
|
"calculation": 1, # 计算题:阈值1,保留难度1的基础计算
|
||||||
"short_answer": 2, # 简答题:排除难度1的简单记忆
|
"short_answer": 1, # 简答题:阈值1,保留难度1的简单记忆
|
||||||
"true_false": 2, # 判断题:排除难度1的基础概念判断
|
"true_false": 1, # 判断题:阈值1,保留难度1的基础概念判断
|
||||||
"multiple_choice": 2 # 选择题:排除难度1的简单选择
|
"multiple_choice": 1 # 选择题:阈值1,保留难度1的简单选择
|
||||||
}
|
}
|
||||||
|
|
||||||
# 各知识层次的最低难度要求
|
# 各知识层次的最低难度要求
|
||||||
self.knowledge_level_min_difficulty = {
|
self.knowledge_level_min_difficulty = {
|
||||||
"basic_concepts": 2, # 基础概念至少难度2(需要理解,不只是记忆)
|
"basic_concepts": 1, # 基础概念至少难度1(原要求为难度2:需要理解,不只是记忆)
|
||||||
"simple_application": 2, # 简单应用至少难度2
|
"simple_application": 1, # 简单应用至少难度1
|
||||||
"medium_application": 2, # 中等应用至少难度2
|
"medium_application": 2, # 中等应用至少难度2
|
||||||
"complex_analysis": 3, # 复杂分析至少难度3
|
"complex_analysis": 3, # 复杂分析至少难度3
|
||||||
"advanced_synthesis": 4 # 高级综合至少难度4
|
"advanced_synthesis": 4 # 高级综合至少难度4
|
||||||
@@ -41,14 +41,29 @@ class QuestionFilterAndSelector:
|
|||||||
"advanced_synthesis": 0.05 # 5% 高级综合
|
"advanced_synthesis": 0.05 # 5% 高级综合
|
||||||
},
|
},
|
||||||
"difficulty_levels": {
|
"difficulty_levels": {
|
||||||
1: 0.05, # 5% 难度1(仅保留最有价值的)
|
1: 0.15, # 15% 难度1(仅保留最有价值的)
|
||||||
2: 0.25, # 25% 难度2
|
2: 0.15, # 15% 难度2
|
||||||
3: 0.35, # 35% 难度3
|
3: 0.35, # 35% 难度3
|
||||||
4: 0.25, # 25% 难度4
|
4: 0.25, # 25% 难度4
|
||||||
5: 0.10 # 10% 难度5
|
5: 0.10 # 10% 难度5
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
def clean_text_for_excel(self, text: str) -> str:
|
||||||
|
"""清理文本,移除Excel不支持的字符"""
|
||||||
|
if not isinstance(text, str):
|
||||||
|
return str(text)
|
||||||
|
|
||||||
|
# 移除或替换Excel不支持的控制字符
|
||||||
|
# 保留常见的可打印字符
|
||||||
|
cleaned = ''.join(char if ord(char) >= 32 or char in '\t\n\r' else ' ' for char in text)
|
||||||
|
|
||||||
|
# 限制长度,避免Excel单元格过长
|
||||||
|
if len(cleaned) > 32767: # Excel单元格字符限制
|
||||||
|
cleaned = cleaned[:32760] + "..."
|
||||||
|
|
||||||
|
return cleaned
|
||||||
|
|
||||||
def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
|
def filter_questions_by_quality(self, questions: List[Dict]) -> List[Dict]:
|
||||||
"""第一步:按质量标准过滤题目"""
|
"""第一步:按质量标准过滤题目"""
|
||||||
filtered_questions = []
|
filtered_questions = []
|
||||||
@@ -72,9 +87,9 @@ class QuestionFilterAndSelector:
|
|||||||
if difficulty < min_level_difficulty:
|
if difficulty < min_level_difficulty:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# 特殊过滤规则
|
# # 特殊过滤规则
|
||||||
if self._should_exclude_question(q):
|
# if self._should_exclude_question(q):
|
||||||
continue
|
# continue
|
||||||
|
|
||||||
filtered_questions.append(q)
|
filtered_questions.append(q)
|
||||||
|
|
||||||
@@ -92,8 +107,8 @@ class QuestionFilterAndSelector:
|
|||||||
if len(q['question'].strip()) < 20: # 题目太短
|
if len(q['question'].strip()) < 20: # 题目太短
|
||||||
return False
|
return False
|
||||||
|
|
||||||
if len(q['answer'].strip()) < 5: # 答案太短
|
# if len(q['answer'].strip()) < 5: # 答案太短
|
||||||
return False
|
# return False
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
|
||||||
@@ -469,14 +484,25 @@ def main_filter_questions():
|
|||||||
|
|
||||||
# 导出详细分析
|
# 导出详细分析
|
||||||
try:
|
try:
|
||||||
df_original = pd.DataFrame(all_questions)
|
# 清理数据中的特殊字符
|
||||||
df_selected = pd.DataFrame(selected_questions)
|
cleaned_selected = []
|
||||||
|
for q in selected_questions:
|
||||||
|
cleaned_q = {}
|
||||||
|
for key, value in q.items():
|
||||||
|
cleaned_q[key] = selector.clean_text_for_excel(value)
|
||||||
|
cleaned_selected.append(cleaned_q)
|
||||||
|
|
||||||
|
df_selected = pd.DataFrame(cleaned_selected)
|
||||||
|
|
||||||
with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
|
with pd.ExcelWriter(ANALYSIS_FILE, engine='openpyxl') as writer:
|
||||||
df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
|
# df_selected.to_excel(writer, sheet_name='筛选结果', index=False)
|
||||||
|
# 只保存关键字段到Excel,避免长文本问题
|
||||||
|
summary_df = df_selected[['question_type', 'knowledge_level', 'difficulty', 'final_level']].copy()
|
||||||
|
summary_df.to_excel(writer, sheet_name='筛选结果概要', index=False)
|
||||||
|
|
||||||
# 统计对比
|
# 统计对比
|
||||||
comparison_data = []
|
comparison_data = []
|
||||||
|
df_original = pd.DataFrame(all_questions)
|
||||||
for metric in ['question_type', 'knowledge_level', 'difficulty']:
|
for metric in ['question_type', 'knowledge_level', 'difficulty']:
|
||||||
orig_dist = df_original[metric].value_counts(normalize=True) * 100
|
orig_dist = df_original[metric].value_counts(normalize=True) * 100
|
||||||
sel_dist = df_selected[metric].value_counts(normalize=True) * 100
|
sel_dist = df_selected[metric].value_counts(normalize=True) * 100
|
||||||
|
|||||||
Binary file not shown.
Binary file not shown.
Reference in New Issue
Block a user