过滤掉题目令人困惑的部分，且可以转换为简答题的题目

2025-05-29 16:09:37 +08:00
parent 72a236d505
commit ae410dc6a7
4 changed files with 105645 additions and 0 deletions
--- a/layer2/PGEE/code/step7_filter_perplexity_convert.py
+++ b/layer2/PGEE/code/step7_filter_perplexity_convert.py
@@ -0,0 +1,276 @@
+import json
+import logging
+from typing import Dict, Any, List
+from tqdm import tqdm
+
+# Configure logging
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+logger = logging.getLogger(__name__)
+
+class QuestionFilter:
+    def __init__(self):
+        """
+        初始化题目过滤器
+        """
+        self.stats = {
+            'total_questions': 0,
+            'no_perp_convertible': 0,
+            'no_perp_no_convertible': 0,
+            'has_perp_convertible': 0,
+            'has_perp_no_convertible': 0,
+            'missing_fields': 0
+        }
+    
+    def filter_questions(self, questions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
+        """
+        根据条件过滤题目
+        
+        Args:
+            questions: 题目列表
+            
+        Returns:
+            Dict: 包含不同类别题目的字典
+        """
+        # 初始化结果字典
+        filtered_questions = {
+            'no_perp_convertible': [],      # has_perplexity=False 且 convertible=True
+            'no_perp_no_convertible': [],   # has_perplexity=False 且 convertible=False
+            'has_perp_convertible': [],     # has_perplexity=True 且 convertible=True
+            'has_perp_no_convertible': [],  # has_perplexity=True 且 convertible=False
+            'missing_fields': []            # 缺少必要字段的题目
+        }
+        
+        self.stats['total_questions'] = len(questions)
+        
+        logger.info(f"开始过滤 {len(questions)} 道题目...")
+        
+        # 使用进度条处理题目
+        with tqdm(
+            total=len(questions),
+            desc="过滤题目",
+            ncols=100,
+            unit="题",
+            bar_format="{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt}] {postfix}"
+        ) as pbar:
+            
+            for question in questions:
+                try:
+                    # 获取必要的字段
+                    convertible = question.get("convertible")
+                    perplexity_info = question.get("perplexity", {})
+                    has_perplexity = perplexity_info.get("has_perplexity")
+                    
+                    # 检查必要字段是否存在
+                    if convertible is None or has_perplexity is None:
+                        filtered_questions['missing_fields'].append(question)
+                        self.stats['missing_fields'] += 1
+                        pbar.set_postfix(status="缺少字段")
+                    else:
+                        # 根据条件分类
+                        if not has_perplexity and convertible:
+                            # has_perplexity=False 且 convertible=True
+                            filtered_questions['no_perp_convertible'].append(question)
+                            self.stats['no_perp_convertible'] += 1
+                            pbar.set_postfix(status="无困惑+可转换")
+                            
+                        elif not has_perplexity and not convertible:
+                            # has_perplexity=False 且 convertible=False
+                            filtered_questions['no_perp_no_convertible'].append(question)
+                            self.stats['no_perp_no_convertible'] += 1
+                            pbar.set_postfix(status="无困惑+不可转换")
+                            
+                        elif has_perplexity and convertible:
+                            # has_perplexity=True 且 convertible=True
+                            filtered_questions['has_perp_convertible'].append(question)
+                            self.stats['has_perp_convertible'] += 1
+                            pbar.set_postfix(status="有困惑+可转换")
+                            
+                        elif has_perplexity and not convertible:
+                            # has_perplexity=True 且 convertible=False
+                            filtered_questions['has_perp_no_convertible'].append(question)
+                            self.stats['has_perp_no_convertible'] += 1
+                            pbar.set_postfix(status="有困惑+不可转换")
+                    
+                    # 更新进度条
+                    pbar.update(1)
+                    
+                except Exception as e:
+                    logger.error(f"处理题目时出错: {e}")
+                    filtered_questions['missing_fields'].append(question)
+                    self.stats['missing_fields'] += 1
+                    pbar.update(1)
+        
+        logger.info("题目过滤完成!")
+        return filtered_questions
+    
+    def save_filtered_questions(self, filtered_questions: Dict[str, List[Dict[str, Any]]], 
+                               output_dir: str = "."):
+        """
+        保存过滤后的题目到不同的JSON文件
+        
+        Args:
+            filtered_questions: 过滤后的题目字典
+            output_dir: 输出目录
+        """
+        # 定义输出文件映射
+        file_mappings = {
+            'no_perp_convertible': f"{output_dir}/no_perp_convertible.json",
+            'no_perp_no_convertible': f"{output_dir}/no_perp_no_convertible.json",
+            'has_perp_convertible': f"{output_dir}/has_perp_convertible.json",
+            'has_perp_no_convertible': f"{output_dir}/has_perp_no_convertible.json",
+            'missing_fields': f"{output_dir}/missing_fields.json"
+        }
+        
+        logger.info("开始保存过滤后的题目...")
+        
+        # 保存每个类别的题目
+        for category, questions in filtered_questions.items():
+            if questions:  # 只保存非空的类别
+                output_file = file_mappings[category]
+                try:
+                    with tqdm(desc=f"保存 {category}", unit="题", total=len(questions)) as pbar:
+                        with open(output_file, 'w', encoding='utf-8') as f:
+                            json.dump(questions, f, ensure_ascii=False, indent=2)
+                        pbar.update(len(questions))
+                    
+                    logger.info(f"已保存 {len(questions)} 道题目到: {output_file}")
+                    
+                except Exception as e:
+                    logger.error(f"保存文件 {output_file} 失败: {e}")
+            else:
+                logger.info(f"类别 {category} 没有题目，跳过保存")
+    
+    def print_statistics(self):
+        """
+        打印统计信息
+        """
+        total = self.stats['total_questions']
+        
+        logger.info("="*60)
+        logger.info("题目过滤统计结果:")
+        logger.info("="*60)
+        logger.info(f"总题目数量: {total}")
+        logger.info("")
+        
+        logger.info("各类别题目数量:")
+        logger.info(f"  ✅ 无困惑 + 可转换:     {self.stats['no_perp_convertible']:>6} ({self.stats['no_perp_convertible']/total*100:.1f}%)")
+        logger.info(f"  ❌ 无困惑 + 不可转换:   {self.stats['no_perp_no_convertible']:>6} ({self.stats['no_perp_no_convertible']/total*100:.1f}%)")
+        logger.info(f"  ⚠️  有困惑 + 可转换:     {self.stats['has_perp_convertible']:>6} ({self.stats['has_perp_convertible']/total*100:.1f}%)")
+        logger.info(f"  🚫 有困惑 + 不可转换:   {self.stats['has_perp_no_convertible']:>6} ({self.stats['has_perp_no_convertible']/total*100:.1f}%)")
+        logger.info(f"  ❓ 缺少必要字段:       {self.stats['missing_fields']:>6} ({self.stats['missing_fields']/total*100:.1f}%)")
+        logger.info("")
+        
+        # 验证总数
+        calculated_total = (self.stats['no_perp_convertible'] + 
+                          self.stats['no_perp_no_convertible'] + 
+                          self.stats['has_perp_convertible'] + 
+                          self.stats['has_perp_no_convertible'] + 
+                          self.stats['missing_fields'])
+        
+        logger.info(f"验证: 分类总数 = {calculated_total} (应该等于 {total})")
+        logger.info("="*60)
+        
+        # 重点关注的类别
+        logger.info("📋 重点输出文件:")
+        logger.info(f"  • no_perp_convertible.json:    {self.stats['no_perp_convertible']} 道题目 (理想的选择题)")
+        logger.info(f"  • no_perp_no_convertible.json: {self.stats['no_perp_no_convertible']} 道题目 (无法转换的题目)")
+    
+    def analyze_sample_questions(self, filtered_questions: Dict[str, List[Dict[str, Any]]], 
+                               sample_size: int = 3):
+        """
+        分析并展示样本题目
+        
+        Args:
+            filtered_questions: 过滤后的题目字典
+            sample_size: 每个类别展示的样本数量
+        """
+        logger.info("\n📖 样本题目展示:")
+        logger.info("="*60)
+        
+        for category, questions in filtered_questions.items():
+            if questions and category in ['no_perp_convertible', 'no_perp_no_convertible']:
+                logger.info(f"\n【{category}】类别样本:")
+                
+                sample_count = min(sample_size, len(questions))
+                for i, question in enumerate(questions[:sample_count]):
+                    logger.info(f"  样本 {i+1}:")
+                    logger.info(f"    题目ID: {question.get('idx', 'N/A')}")
+                    logger.info(f"    原题目: {question.get('question', '')[:60]}...")
+                    
+                    if question.get('choice_question'):
+                        logger.info(f"    转换后: {question.get('choice_question', '')[:60]}...")
+                        logger.info(f"    正确选项: {question.get('correct_option', '')[:40]}...")
+                    
+                    logger.info(f"    可转换: {question.get('convertible', False)}")
+                    
+                    perplexity_info = question.get('perplexity', {})
+                    logger.info(f"    有困惑: {perplexity_info.get('has_perplexity', False)}")
+                    
+                    if perplexity_info.get('has_perplexity', False):
+                        logger.info(f"    困惑原因: {perplexity_info.get('perplexity_reason', '')[:50]}...")
+                    
+                    logger.info("    " + "-"*40)
+
+def load_questions(input_file: str) -> List[Dict[str, Any]]:
+    """
+    从JSON文件加载题目数据
+    
+    Args:
+        input_file: 输入文件路径
+        
+    Returns:
+        List: 题目列表
+    """
+    try:
+        with tqdm(desc="加载文件", unit="B", unit_scale=True) as pbar:
+            with open(input_file, 'r', encoding='utf-8') as f:
+                questions = json.load(f)
+                pbar.update(1)
+        
+        logger.info(f"成功加载 {len(questions)} 道题目")
+        return questions
+    except Exception as e:
+        logger.error(f"加载文件失败: {e}")
+        return []
+
+def main():
+    """
+    主函数 - 执行题目过滤
+    """
+    # ========== 配置区域 ==========
+    # 文件路径配置
+    INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step6_perplexity_analyzed_questions.json"  # 输入文件路径
+    OUTPUT_DIR = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code"  # 输出目录
+    # ============================
+    
+    try:
+        print("🔄 开始题目过滤...")
+        
+        # 加载题目数据
+        questions = load_questions(INPUT_FILE)
+        if not questions:
+            logger.error("没有加载到有效的题目数据")
+            return
+        
+        # 初始化过滤器
+        filter_obj = QuestionFilter()
+        
+        # 过滤题目
+        filtered_questions = filter_obj.filter_questions(questions)
+        
+        # 保存过滤后的题目
+        filter_obj.save_filtered_questions(filtered_questions, OUTPUT_DIR)
+        
+        # 打印统计信息
+        filter_obj.print_statistics()
+        
+        # 分析样本题目
+        filter_obj.analyze_sample_questions(filtered_questions)
+        
+        print("✅ 题目过滤完成!")
+        
+    except Exception as e:
+        logger.error(f"程序执行失败: {e}")
+
+if __name__ == "__main__":  # run the pipeline only when executed as a script
+    main()
--- a/layer2/PGEE/code/step7_has_perp_convertible.json
+++ b/layer2/PGEE/code/step7_has_perp_convertible.json
--- a/layer2/PGEE/code/step7_no_perp_convertible.json
+++ b/layer2/PGEE/code/step7_no_perp_convertible.json
--- a/layer2/PGEE/code/step7_no_perp_no_convertible.json
+++ b/layer2/PGEE/code/step7_no_perp_no_convertible.json