调整格式转换的代码：清理无用逻辑，并加入正确答案在 A/B/C/D 选项间的分布平衡。

2025-06-02 17:17:42 +08:00
parent abeacaac3e
commit 7a725bc003
2 changed files with 38118 additions and 629 deletions
--- a/layer2/PGEE/code/stepz_final_choice_questions_filtered.json
+++ b/layer2/PGEE/code/stepz_final_choice_questions_filtered.json
--- a/layer2/PGEE/code/stepz_final_format_convert.py
+++ b/layer2/PGEE/code/stepz_final_format_convert.py
@@ -1,43 +1,33 @@
 import json
 from typing import Dict, Any, List, Optional, Tuple
 import random
+from collections import Counter

 def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    将源JSON格式转换为目标格式
-    
-    Args:
-        source_data: 源数据字典
-        
-    Returns:
-        转换后的数据字典，如果不是单选题则返回None
    """
-    # 检查是否有generated_options字段
    if "generated_options" not in source_data:
        return None
    
    generated_options = source_data["generated_options"]
    
-    # 只处理单选题，跳过判断题
+    # 只处理单选题
    if generated_options.get("question_type") != "multiple_choice":
        return None
    
-    # 获取题目内容
    question = source_data.get("choice_question", "")
    if not question:
        return None
    
-    # 获取选项
    options = generated_options.get("options", {})
    if len(options) != 4:
        return None
    
-    # 获取正确答案
    correct_answer = generated_options.get("correct_answer", "")
    if correct_answer not in ["A", "B", "C", "D"]:
        return None
    
-    # 构建目标格式
    target_data = {
        "question": question,
        "choices": {
@@ -55,6 +45,188 @@ def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str,
    
    return target_data

+def extract_answer_from_question(question: Dict[str, Any]) -> Optional[str]:
+    """从转换后的题目中提取答案选项"""
+    answer_text = question.get("answer", "")
+    if answer_text.startswith("[ANSWER]") and answer_text.endswith("[/ANSWER]"):
+        answer = answer_text[8:-9]
+        if answer in ["A", "B", "C", "D"]:
+            return answer
+    return None
+
+def shuffle_question_options(question: Dict[str, Any], new_correct_answer: str) -> Dict[str, Any]:
+    """
+    重新排列题目选项，使正确答案变为指定选项
+    
+    Args:
+        question: 题目字典
+        new_correct_answer: 新的正确答案选项 (A/B/C/D)
+        
+    Returns:
+        重新排列后的题目
+    """
+    # 获取当前正确答案
+    current_answer = extract_answer_from_question(question)
+    if not current_answer:
+        return question
+    
+    # 如果已经是目标答案，不需要改变
+    if current_answer == new_correct_answer:
+        return question
+    
+    # 获取当前选项
+    choices = question.get("choices", {})
+    current_texts = choices.get("text", [])
+    current_labels = choices.get("label", ["A", "B", "C", "D"])
+    
+    if len(current_texts) != 4 or len(current_labels) != 4:
+        return question
+    
+    # 找到当前正确答案的索引
+    current_index = current_labels.index(current_answer)
+    new_index = current_labels.index(new_correct_answer)
+    
+    # 交换选项
+    new_texts = current_texts[:]
+    new_texts[new_index], new_texts[current_index] = new_texts[current_index], new_texts[new_index]
+    
+    # 创建新的题目
+    new_question = question.copy()
+    new_question["choices"] = {
+        "text": new_texts,
+        "label": ["A", "B", "C", "D"]
+    }
+    new_question["answer"] = f"[ANSWER]{new_correct_answer}[/ANSWER]"
+    
+    return new_question
+
+def balance_answer_distribution_by_shuffling(questions: List[Dict[str, Any]], 
+                                           random_seed: Optional[int] = None) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
+    """
+    通过重新排列选项来平衡答案分布
+    
+    Args:
+        questions: 题目列表
+        random_seed: 随机种子
+        
+    Returns:
+        平衡后的题目列表和统计信息
+    """
+    if random_seed is not None:
+        random.seed(random_seed)
+    
+    total_questions = len(questions)
+    target_per_answer = total_questions // 4
+    remainder = total_questions % 4
+    
+    print(f"\n=== 答案分布平衡 (重排选项法) ===")
+    print(f"总题目数: {total_questions}")
+    print(f"标准分配: 每个选项 {target_per_answer} 道题")
+    if remainder > 0:
+        print(f"余数: {remainder} 道题 (将分配给前{remainder}个选项)")
+    
+    # 统计当前答案分布
+    answer_groups = {"A": [], "B": [], "C": [], "D": []}
+    for i, question in enumerate(questions):
+        answer = extract_answer_from_question(question)
+        if answer and answer in answer_groups:
+            answer_groups[answer].append((i, question))
+    
+    print(f"\n当前答案分布:")
+    for answer in ["A", "B", "C", "D"]:
+        count = len(answer_groups[answer])
+        ratio = count / total_questions if total_questions > 0 else 0
+        print(f"  {answer}: {count} ({ratio*100:.1f}%)")
+    
+    # 计算目标分配（前remainder个选项多分配1道题）
+    target_counts = {}
+    for i, answer in enumerate(["A", "B", "C", "D"]):
+        if i < remainder:
+            target_counts[answer] = target_per_answer + 1
+        else:
+            target_counts[answer] = target_per_answer
+    
+    print(f"\n目标分配:")
+    for answer in ["A", "B", "C", "D"]:
+        print(f"  {answer}: {target_counts[answer]} 道题")
+    
+    # 计算需要调整的数量
+    surplus_questions = []  # (question_index, question, from_answer)
+    deficit_needed = []     # (to_answer, count_needed)
+    
+    for answer in ["A", "B", "C", "D"]:
+        current_count = len(answer_groups[answer])
+        target_count = target_counts[answer]
+        difference = current_count - target_count
+        
+        if difference > 0:
+            # 有多余的题目，需要转移出去
+            print(f"  {answer}: 多 {difference} 道题")
+            # 随机选择要转移的题目
+            questions_to_move = random.sample(answer_groups[answer], difference)
+            for q_idx, q in questions_to_move:
+                surplus_questions.append((q_idx, q, answer))
+        elif difference < 0:
+            # 缺少题目，需要接收
+            needed = -difference
+            print(f"  {answer}: 少 {needed} 道题")
+            deficit_needed.extend([(answer, 1)] * needed)
+    
+    # 打乱顺序以避免偏向性
+    random.shuffle(surplus_questions)
+    random.shuffle(deficit_needed)
+    
+    # 执行调整
+    balanced_questions = questions[:]  # 复制原题目列表
+    
+    print(f"\n开始重新分配 {len(surplus_questions)} 道题:")
+    
+    for i, ((q_idx, question, from_answer), (to_answer, _)) in enumerate(zip(surplus_questions, deficit_needed)):
+        # 重新排列这道题的选项
+        new_question = shuffle_question_options(question, to_answer)
+        balanced_questions[q_idx] = new_question
+        
+        print(f"  第{i+1}次调整: 题目{q_idx+1} 答案从 {from_answer} 改为 {to_answer}")
+    
+    # 验证最终分布
+    final_counter = Counter()
+    for question in balanced_questions:
+        answer = extract_answer_from_question(question)
+        if answer:
+            final_counter[answer] += 1
+    
+    print(f"\n平衡后答案分布:")
+    max_deviation = 0
+    target_ratio = 0.25
+    
+    for answer in ["A", "B", "C", "D"]:
+        count = final_counter.get(answer, 0)
+        ratio = count / total_questions if total_questions > 0 else 0
+        deviation = abs(ratio - target_ratio)
+        max_deviation = max(max_deviation, deviation)
+        print(f"  {answer}: {count} ({ratio*100:.1f}%)")
+    
+    # 统计信息
+    balance_info = {
+        "original_total": total_questions,
+        "final_total": total_questions,  # 题目总数不变
+        "target_per_answer": target_per_answer,
+        "remainder": remainder,
+        "final_distribution": dict(final_counter),
+        "max_deviation": max_deviation,
+        "adjustments_made": len(surplus_questions),
+        "perfectly_balanced": max_deviation <= 0.05
+    }
+    
+    if balance_info["perfectly_balanced"]:
+        print(f"✅ 完美平衡！最大偏差: {max_deviation*100:.1f}%")
+    else:
+        print(f"📊 接近平衡，最大偏差: {max_deviation*100:.1f}%")
+    
+    print(f"总共调整了 {balance_info['adjustments_made']} 道题的答案")
+    
+    return balanced_questions, balance_info
+
 def classify_questions_by_difficulty(questions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    按难度分类题目
@@ -142,29 +314,34 @@ def select_questions_by_ratio(difficulty_groups: Dict[str, List[Dict[str, Any]]]
 def batch_convert_questions_with_difficulty_filter(input_file: str, 
                                                 output_file: str,
                                                 selection_ratios: Dict[str, float],
+                                                 balance_answers: bool = True,
                                                 random_seed: Optional[int] = None) -> None:
    """
-    批量转换题目格式，支持按难度筛选
+    批量转换题目格式，支持按难度筛选和答案平衡
    
    Args:
        input_file: 输入文件路径
        output_file: 输出文件路径
        selection_ratios: 各难度等级的选择比例
+        balance_answers: 是否平衡答案分布
        random_seed: 随机种子
    """
-    print("正在加载数据...")
+    print("=== 批量转换题目（难度筛选 + 答案平衡）===")
+    print(f"输入文件: {input_file}")
+    print(f"输出文件: {output_file}")
+    print(f"答案平衡: {'开启' if balance_answers else '关闭'}")
+    print(f"随机种子: {random_seed}")
    
-    # 判断输入文件格式
+    # 加载数据
+    print("\n正在加载数据...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    # 处理两种可能的输入格式
    if isinstance(data, dict) and "questions" in data:
-        # 格式：{"questions": [...], "其他字段": ...}
        source_questions = data["questions"]
        print(f"检测到完整格式数据，包含其他元数据")
    elif isinstance(data, list):
-        # 格式：[{题目1}, {题目2}, ...]
        source_questions = data
        print(f"检测到题目列表格式")
    else:
@@ -173,7 +350,7 @@ def batch_convert_questions_with_difficulty_filter(input_file: str,
    print(f"加载了 {len(source_questions)} 道题目")
    
    # 按难度分类题目
-    print("正在按难度分类题目...")
+    print("\n正在按难度分类题目...")
    difficulty_groups = classify_questions_by_difficulty(source_questions)
    
    print("题目难度分布:")
@@ -247,42 +424,48 @@ def batch_convert_questions_with_difficulty_filter(input_file: str,
            print(f"第{i+1}题转换失败: {e}")
            conversion_stats["failed"] += 1
    
+    print(f"转换完成: {conversion_stats['converted']} 道题目成功转换")
+    
+    # 对转换后的题目进行答案分布平衡
+    balance_info = None
+    if balance_answers and converted_questions:
+        print("\n正在对转换后的题目进行答案分布平衡...")
+        
+        balanced_questions, balance_info = balance_answer_distribution_by_shuffling(
+            converted_questions,
+            random_seed=random_seed
+        )
+        
+        converted_questions = balanced_questions
+        conversion_stats["final_count"] = len(converted_questions)
+    
    # 保存结果
    print("正在保存转换结果...")
-    output_data = {
-        "questions": converted_questions,
-        "metadata": {
-            "total_original_questions": len(source_questions),
-            "selection_ratios": selection_ratios,
-            "selection_stats": selection_stats,
-            "conversion_stats": conversion_stats,
-            "random_seed": random_seed
-        }
-    }
    
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_questions, f, ensure_ascii=False, indent=2)
    
    # 打印最终统计信息
-    print(f"\n转换完成！")
+    print(f"\n=== 转换完成！===")
    print(f"选中题目数: {conversion_stats['selected']}")
    print(f"单选题: {conversion_stats['multiple_choice']}")
    print(f"判断题: {conversion_stats['true_false']}")
    print(f"其他类型: {conversion_stats['other']}")
    print(f"成功转换: {conversion_stats['converted']}")
    print(f"转换失败: {conversion_stats['failed']}")
-    print(f"最终转换率: {conversion_stats['converted']/conversion_stats['selected']*100:.1f}%")
+    
+    if balance_answers and balance_info:
+        print(f"答案平衡后: {conversion_stats.get('final_count', conversion_stats['converted'])}")
+        print(f"调整题目数: {balance_info['adjustments_made']}")
+        print(f"最终转换率: {conversion_stats.get('final_count', conversion_stats['converted'])/conversion_stats['selected']*100:.1f}%")
+    else:
+        print(f"最终转换率: {conversion_stats['converted']/conversion_stats['selected']*100:.1f}%")
+    
    print(f"结果已保存到: {output_file}")

 def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, int]:
    """
    验证转换后的题目格式
-    
-    Args:
-        questions: 转换后的题目列表
-        
-    Returns:
-        验证统计信息
    """
    stats = {
        "total": len(questions),
@@ -323,29 +506,9 @@ def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, i
            stats["valid"] += 1
        else:
            stats["invalid"] += 1
-            print(f"第{i+1}题格式无效")
    
    return stats

-def create_difficulty_config_template():
-    """创建难度配置模板"""
-    template = {
-        "hard_early_stop": 1.0,     # 困难题选择100%
-        "easy_all_correct": 0.1,    # 简单题选择10%
-        "mixed": 0.5,               # 混合题选择50%
-        "unknown": 0.0              # 未知难度题目选择0%
-    }
-    
-    print("难度选择比例配置模板:")
-    print(json.dumps(template, indent=2))
-    print("\n说明:")
-    print("- 1.0 = 100% (全部选择)")
-    print("- 0.5 = 50% (选择一半)")
-    print("- 0.1 = 10% (选择10%)")
-    print("- 0.0 = 0% (不选择)")
-    
-    return template
-
 def main():
    """主函数"""
    # 文件路径配置
@@ -353,17 +516,19 @@ def main():
    OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
    
    # 难度选择比例配置
-    # 可以根据需要调整这些比例
    SELECTION_RATIOS = {
-        "hard_early_stop": 1.0,     # 困难题选择100% (全部)
-        "easy_all_correct": 0.0,    # 简单题选择10%
-        "mixed": 0.0,               # 混合题选择30%
+        "hard_early_stop": 1.0,     # 困难题选择10%
+        "easy_all_correct": 0.35,  # 简单题选择3.5%
+        "mixed": 0.0,               # 混合题选择0%
        "unknown": 0.0              # 未知难度不选择
    }
    
    # 随机种子，保证结果可复现
    RANDOM_SEED = 42
    
+    # 是否启用答案平衡
+    BALANCE_ANSWERS = True
+    
    try:
        # 显示配置信息
        print("=== 难度筛选配置 ===")
@@ -371,14 +536,16 @@ def main():
        for difficulty, ratio in SELECTION_RATIOS.items():
            print(f"  {difficulty}: {ratio*100:.1f}%")
        print(f"随机种子: {RANDOM_SEED}")
+        print(f"启用答案平衡: {BALANCE_ANSWERS}")
        print()
        
-        # 批量转换（包含难度筛选）
+        # 批量转换（包含难度筛选和答案平衡）
        batch_convert_questions_with_difficulty_filter(
            INPUT_FILE, 
            OUTPUT_FILE, 
            SELECTION_RATIOS,
-            RANDOM_SEED
+            balance_answers=BALANCE_ANSWERS,
+            random_seed=RANDOM_SEED
        )
        
        # 验证转换结果
@@ -386,19 +553,7 @@ def main():
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            result_data = json.load(f)
        
-        # 检查输出文件格式
-        if "questions" in result_data:
-            converted_questions = result_data["questions"]
-            metadata = result_data.get("metadata", {})
-            
-            print("\n=== 元数据信息 ===")
-            if metadata:
-                print(f"原始题目总数: {metadata.get('total_original_questions', 'N/A')}")
-                print(f"随机种子: {metadata.get('random_seed', 'N/A')}")
-        else:
-            converted_questions = result_data
-        
-        validation_stats = validate_converted_questions(converted_questions)
+        validation_stats = validate_converted_questions(result_data)
        
        print(f"\n=== 验证结果 ===")
        print(f"总题目数: {validation_stats['total']}")
@@ -412,85 +567,27 @@ def main():
        
        print(f"格式正确率: {validation_stats['valid']/validation_stats['total']*100:.1f}%")
        
+        # 验证最终答案分布
+        if BALANCE_ANSWERS:
+            print(f"\n=== 最终答案分布验证 ===")
+            final_answers = []
+            for q in result_data:
+                answer = extract_answer_from_question(q)
+                if answer:
+                    final_answers.append(answer)
+            
+            final_counter = Counter(final_answers)
+            total = len(final_answers)
+            
+            for answer in ["A", "B", "C", "D"]:
+                count = final_counter.get(answer, 0)
+                ratio = count / total if total > 0 else 0
+                print(f"  {answer}: {count} ({ratio*100:.1f}%)")
+        
    except Exception as e:
        print(f"程序执行失败: {e}")
        import traceback
        traceback.print_exc()

-def interactive_config():
-    """交互式配置选择比例"""
-    print("=== 交互式难度选择配置 ===")
-    
-    difficulties = ["hard_early_stop", "easy_all_correct", "mixed", "unknown"]
-    difficulty_names = {
-        "hard_early_stop": "困难题(答错早停)",
-        "easy_all_correct": "简单题(全部答对)",
-        "mixed": "混合题(部分对错)",
-        "unknown": "未知难度题"
-    }
-    
-    ratios = {}
-    
-    for diff in difficulties:
-        while True:
-            try:
-                ratio_input = input(f"请输入{difficulty_names.get(diff, diff)}的选择比例 (0-100%): ").strip()
-                if ratio_input.endswith('%'):
-                    ratio_input = ratio_input[:-1]
-                
-                ratio_percent = float(ratio_input)
-                if 0 <= ratio_percent <= 100:
-                    ratios[diff] = ratio_percent / 100.0
-                    break
-                else:
-                    print("请输入0-100之间的数值")
-            except ValueError:
-                print("请输入有效的数值")
-    
-    print("\n配置结果:")
-    for diff, ratio in ratios.items():
-        print(f"  {difficulty_names.get(diff, diff)}: {ratio*100:.1f}%")
-    
-    return ratios
-
-def test_difficulty_distribution(input_file: str):
-    """测试文件中的难度分布"""
-    print(f"正在分析文件难度分布: {input_file}")
-    
-    with open(input_file, 'r', encoding='utf-8') as f:
-        data = json.load(f)
-    
-    # 处理两种可能的输入格式
-    if isinstance(data, dict) and "questions" in data:
-        questions = data["questions"]
-    elif isinstance(data, list):
-        questions = data
-    else:
-        print("不支持的文件格式")
-        return
-    
-    difficulty_groups = classify_questions_by_difficulty(questions)
-    
-    print(f"\n难度分布分析:")
-    print(f"总题目数: {len(questions)}")
-    
-    for difficulty, question_list in difficulty_groups.items():
-        mc_count = sum(1 for q in question_list 
-                      if q.get("generated_options", {}).get("question_type") == "multiple_choice")
-        print(f"  {difficulty}:")
-        print(f"    总数: {len(question_list)}")
-        print(f"    单选题: {mc_count}")
-        print(f"    占比: {len(question_list)/len(questions)*100:.1f}%")
-
 if __name__ == "__main__":
-    # 可以先测试难度分布
-    # test_difficulty_distribution("/path/to/your/input/file.json")
-    
-    # 可以使用交互式配置
-    # ratios = interactive_config()
-    
-    # 运行主程序
    main()
-    
-    # 显示配置模板
-    # create_difficulty_config_template()