过滤掉题目令人困惑的部分,且可以转换为简答题的题目

This commit is contained in:
lzy
2025-05-29 16:09:37 +08:00
parent 72a236d505
commit ae410dc6a7
4 changed files with 105645 additions and 0 deletions

View File

@@ -0,0 +1,276 @@
import json
import logging
from typing import Dict, Any, List
from tqdm import tqdm
# Configure root logging once at import time (INFO level, timestamped
# records) and create the module-level logger used throughout this script.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
class QuestionFilter:
    """Partition question records by their perplexity and convertibility flags.

    Each record is read as a dict with a ``convertible`` flag and a
    ``perplexity`` dict holding a ``has_perplexity`` flag. A record lands in
    one of four buckets (one per combination of the two flags), or in
    ``missing_fields`` when either flag is absent or cannot be read.

    Attributes:
        stats: Counters describing the most recent ``filter_questions`` call:
            ``total_questions`` plus one counter per bucket.
    """

    # Bucket names, in the order they appear in results and statistics.
    _CATEGORIES = (
        'no_perp_convertible',
        'no_perp_no_convertible',
        'has_perp_convertible',
        'has_perp_no_convertible',
        'missing_fields',
    )

    # (has_perplexity, convertible) -> (bucket name, progress-bar status text)
    _BUCKETS = {
        (False, True): ('no_perp_convertible', "无困惑+可转换"),
        (False, False): ('no_perp_no_convertible', "无困惑+不可转换"),
        (True, True): ('has_perp_convertible', "有困惑+可转换"),
        (True, False): ('has_perp_no_convertible', "有困惑+不可转换"),
    }

    # Bucket and status text used when a record lacks the required flags.
    _MISSING = ('missing_fields', "缺少字段")

    def __init__(self):
        """Create a filter with all counters set to zero."""
        self.stats = self._fresh_stats()

    @classmethod
    def _fresh_stats(cls) -> Dict[str, int]:
        """Return a zeroed statistics dict (total first, then each bucket)."""
        counters = {'total_questions': 0}
        counters.update((name, 0) for name in cls._CATEGORIES)
        return counters

    @classmethod
    def _classify(cls, question: Any):
        """Return ``(bucket name, status text)`` for a single record."""
        try:
            convertible = question.get("convertible")
            # ``or {}`` also covers an explicit ``"perplexity": null`` entry.
            perplexity_info = question.get("perplexity") or {}
            has_perplexity = perplexity_info.get("has_perplexity")
        except (AttributeError, TypeError) as e:
            # The record (or its ``perplexity`` value) is not dict-like.
            logger.error(f"处理题目时出错: {e}")
            return cls._MISSING

        if convertible is None or has_perplexity is None:
            return cls._MISSING
        # Flags are interpreted by truthiness, then looked up in the table.
        return cls._BUCKETS[(bool(has_perplexity), bool(convertible))]

    @staticmethod
    def _preview(value: Any, limit: int) -> Any:
        """Return at most ``limit`` leading items of ``value`` for log output.

        ``None`` becomes an empty string; values that cannot be sliced are
        converted to ``str`` first so a malformed record cannot crash the
        sample display.
        """
        if value is None:
            return ''
        try:
            return value[:limit]
        except TypeError:
            return str(value)[:limit]

    def filter_questions(self, questions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
        """Sort ``questions`` into buckets and refresh ``self.stats``.

        Args:
            questions: Question records to classify.

        Returns:
            A dict mapping every bucket name to the list of records that fell
            into it. All five buckets are always present, possibly empty.
        """
        filtered_questions = {name: [] for name in self._CATEGORIES}

        # Start from zero on every call so the counters always describe the
        # returned result, even when the same filter object is reused.
        self.stats = self._fresh_stats()
        self.stats['total_questions'] = len(questions)
        logger.info(f"开始过滤 {len(questions)} 道题目...")

        with tqdm(
            total=len(questions),
            desc="过滤题目",
            ncols=100,
            unit="",
            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}] {postfix}"
        ) as pbar:
            for question in questions:
                bucket, status = self._classify(question)
                filtered_questions[bucket].append(question)
                self.stats[bucket] += 1
                pbar.set_postfix(status=status)
                pbar.update(1)

        logger.info("题目过滤完成!")
        return filtered_questions

    def save_filtered_questions(self, filtered_questions: Dict[str, List[Dict[str, Any]]],
                                output_dir: str = "."):
        """Write each non-empty bucket to ``<output_dir>/<bucket name>.json``.

        A failure while writing one bucket is logged and does not stop the
        remaining buckets from being written.

        Args:
            filtered_questions: Mapping of bucket name to question records.
            output_dir: Directory receiving the JSON files; it is created if
                it does not exist yet.
        """
        # Function-scope import: ``os`` is not among the file-level imports.
        import os

        logger.info("开始保存过滤后的题目...")

        for category, questions in filtered_questions.items():
            if not questions:
                logger.info(f"类别 {category} 没有题目,跳过保存")
                continue

            output_file = os.path.join(output_dir, f"{category}.json")
            try:
                # Create the target directory on demand so that a fresh
                # output location does not make every write fail.
                os.makedirs(os.path.dirname(output_file) or ".", exist_ok=True)
                with tqdm(desc=f"保存 {category}", unit="", total=len(questions)) as pbar:
                    with open(output_file, 'w', encoding='utf-8') as f:
                        json.dump(questions, f, ensure_ascii=False, indent=2)
                    pbar.update(len(questions))
                logger.info(f"已保存 {len(questions)} 道题目到: {output_file}")
            except (OSError, TypeError, ValueError) as e:
                # OSError: filesystem problems; TypeError / ValueError:
                # records that json cannot serialize.
                logger.error(f"保存文件 {output_file} 失败: {e}")

    def print_statistics(self):
        """Log per-bucket counts and percentages from ``self.stats``.

        Percentages are reported as 0.0 when no questions were counted, so
        the method is safe to call before filtering or after an empty run.
        """
        total = self.stats['total_questions']

        rows = (
            (" ✅ 无困惑 + 可转换", 'no_perp_convertible'),
            (" ❌ 无困惑 + 不可转换", 'no_perp_no_convertible'),
            (" ⚠️ 有困惑 + 可转换", 'has_perp_convertible'),
            (" 🚫 有困惑 + 不可转换", 'has_perp_no_convertible'),
            (" ❓ 缺少必要字段", 'missing_fields'),
        )

        logger.info("="*60)
        logger.info("题目过滤统计结果:")
        logger.info("="*60)
        logger.info(f"总题目数量: {total}")
        logger.info("")
        logger.info("各类别题目数量:")
        for label, key in rows:
            count = self.stats[key]
            # Guard against division by zero when nothing has been counted.
            share = count / total * 100 if total else 0.0
            logger.info(f"{label}: {count:>6} ({share:.1f}%)")
        logger.info("")

        # Sanity check: the bucket counters should add up to the total.
        calculated_total = sum(self.stats[key] for _, key in rows)
        logger.info(f"验证: 分类总数 = {calculated_total} (应该等于 {total})")
        logger.info("="*60)

        # Highlight the two output files for questions without perplexity.
        logger.info("📋 重点输出文件:")
        logger.info(f" • no_perp_convertible.json: {self.stats['no_perp_convertible']} 道题目 (理想的选择题)")
        logger.info(f" • no_perp_no_convertible.json: {self.stats['no_perp_no_convertible']} 道题目 (无法转换的题目)")

    def analyze_sample_questions(self, filtered_questions: Dict[str, List[Dict[str, Any]]],
                                 sample_size: int = 3):
        """Log a few example records from the two no-perplexity buckets.

        Args:
            filtered_questions: Mapping of bucket name to question records.
            sample_size: Maximum number of records shown per bucket.
        """
        shown_categories = ('no_perp_convertible', 'no_perp_no_convertible')

        logger.info("\n📖 样本题目展示:")
        logger.info("="*60)

        for category, questions in filtered_questions.items():
            if not questions or category not in shown_categories:
                continue

            logger.info(f"\n【{category}】类别样本:")
            # A negative sample size shows nothing instead of slicing from
            # the end of the list.
            for i, question in enumerate(questions[:max(sample_size, 0)]):
                logger.info(f" 样本 {i+1}:")
                logger.info(f" 题目ID: {question.get('idx', 'N/A')}")
                logger.info(f" 原题目: {self._preview(question.get('question'), 60)}...")
                if question.get('choice_question'):
                    logger.info(f" 转换后: {self._preview(question.get('choice_question'), 60)}...")
                    logger.info(f" 正确选项: {self._preview(question.get('correct_option'), 40)}...")
                logger.info(f" 可转换: {question.get('convertible', False)}")
                perplexity_info = question.get('perplexity') or {}
                logger.info(f" 有困惑: {perplexity_info.get('has_perplexity', False)}")
                if perplexity_info.get('has_perplexity', False):
                    logger.info(f" 困惑原因: {self._preview(perplexity_info.get('perplexity_reason'), 50)}...")
                logger.info(" " + "-"*40)
def load_questions(input_file: str) -> List[Dict[str, Any]]:
    """Load the question list from a UTF-8 encoded JSON file.

    The file is read in chunks so the progress bar reflects the number of
    bytes actually read, and is parsed once reading has finished.

    Args:
        input_file: Path of the JSON file to read.

    Returns:
        The parsed list of question records. An empty list is returned (and
        the problem is logged) when the file cannot be read, is not valid
        UTF-8 / JSON, or does not contain a JSON array at the top level.
    """
    # Function-scope import: ``os`` is not among the file-level imports.
    import os

    chunk_size = 1 << 20  # read 1 MiB at a time

    try:
        total_bytes = os.path.getsize(input_file)
        raw = bytearray()
        with open(input_file, 'rb') as f, tqdm(
            desc="加载文件", unit="B", unit_scale=True, total=total_bytes
        ) as pbar:
            for chunk in iter(lambda: f.read(chunk_size), b""):
                raw.extend(chunk)
                pbar.update(len(chunk))
        questions = json.loads(raw.decode('utf-8'))
    except (OSError, ValueError) as e:
        # OSError: missing or unreadable file. ValueError covers both
        # UnicodeDecodeError and json.JSONDecodeError.
        logger.error(f"加载文件失败: {e}")
        return []

    if not isinstance(questions, list):
        # Anything other than a JSON array cannot be a list of questions;
        # returning it would make later steps iterate over the wrong thing.
        logger.error(f"加载文件失败: 顶层JSON应为列表, 实际为 {type(questions).__name__}")
        return []

    logger.info(f"成功加载 {len(questions)} 道题目")
    return questions
def main(input_file=None, output_dir=None) -> int:
    """Run the pipeline: load, filter, save, then report.

    Args:
        input_file: Path of the JSON file holding the analyzed questions.
            Falls back to the built-in default path when omitted.
        output_dir: Directory receiving the per-category JSON files.
            Falls back to the built-in default directory when omitted.

    Returns:
        0 when the run completed, 1 when no questions could be loaded or an
        unexpected error occurred.
    """
    # ========== Configuration ==========
    # Default locations, used when the caller does not supply its own.
    INPUT_FILE = input_file or "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step6_perplexity_analyzed_questions.json"
    OUTPUT_DIR = output_dir or "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code"
    # ===================================
    try:
        print("🔄 开始题目过滤...")

        # Load the question data; an empty result means nothing usable.
        questions = load_questions(INPUT_FILE)
        if not questions:
            logger.error("没有加载到有效的题目数据")
            return 1

        filter_obj = QuestionFilter()
        filtered_questions = filter_obj.filter_questions(questions)
        filter_obj.save_filtered_questions(filtered_questions, OUTPUT_DIR)
        filter_obj.print_statistics()
        filter_obj.analyze_sample_questions(filtered_questions)

        print("✅ 题目过滤完成!")
        return 0
    except Exception as e:
        # Top-level boundary: record the traceback instead of only the
        # message, and report the failure through the return value.
        logger.exception(f"程序执行失败: {e}")
        return 1


if __name__ == "__main__":
    # Propagate the result as the process exit status.
    raise SystemExit(main())

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long