调整格式转换的代码:清理无用逻辑并加入选项平衡;

This commit is contained in:
lzy
2025-06-02 17:17:42 +08:00
parent abeacaac3e
commit 7a725bc003
2 changed files with 38118 additions and 629 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -1,43 +1,33 @@
import json
from typing import Dict, Any, List, Optional, Tuple
import random
from collections import Counter
def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
"""
将源JSON格式转换为目标格式
Args:
source_data: 源数据字典
Returns:
转换后的数据字典如果不是单选题则返回None
"""
# 检查是否有generated_options字段
if "generated_options" not in source_data:
return None
generated_options = source_data["generated_options"]
# 只处理单选题,跳过判断题
# 只处理单选题
if generated_options.get("question_type") != "multiple_choice":
return None
# 获取题目内容
question = source_data.get("choice_question", "")
if not question:
return None
# 获取选项
options = generated_options.get("options", {})
if len(options) != 4:
return None
# 获取正确答案
correct_answer = generated_options.get("correct_answer", "")
if correct_answer not in ["A", "B", "C", "D"]:
return None
# 构建目标格式
target_data = {
"question": question,
"choices": {
@@ -55,6 +45,188 @@ def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str,
return target_data
def extract_answer_from_question(question: Dict[str, Any]) -> Optional[str]:
    """Extract the answer letter from a converted question.

    The ``"answer"`` field is expected to have the exact form
    ``[ANSWER]X[/ANSWER]`` where ``X`` is one of A, B, C or D.

    Args:
        question: Converted question dict; only its ``"answer"`` key is read.

    Returns:
        The answer letter, or None when the field is missing, is not a
        string, lacks the surrounding tags, or wraps anything other than a
        single letter A-D.
    """
    prefix, suffix = "[ANSWER]", "[/ANSWER]"

    answer_text = question.get("answer", "")
    # A null or non-string value (e.g. JSON ``null``) cannot carry the tags;
    # treat it like a missing answer instead of failing on ``.startswith``.
    if not isinstance(answer_text, str):
        return None

    if answer_text.startswith(prefix) and answer_text.endswith(suffix):
        # Slice by the tag lengths rather than hard-coded offsets.
        answer = answer_text[len(prefix):-len(suffix)]
        if answer in ("A", "B", "C", "D"):
            return answer
    return None
def shuffle_question_options(question: Dict[str, Any], new_correct_answer: str) -> Dict[str, Any]:
    """Rearrange a question's options so the correct answer gets a new letter.

    The text of the currently correct option is swapped with the text that
    sits under ``new_correct_answer``; the other two options stay in place.

    Args:
        question: Converted question dict. ``question["choices"]`` is read
            for its ``"text"`` and ``"label"`` lists and ``question["answer"]``
            is parsed by ``extract_answer_from_question``.
        new_correct_answer: Letter the correct option should end up under
            (A/B/C/D).

    Returns:
        A shallow copy of ``question`` with a new ``"choices"`` dict (holding
        only ``"text"`` and ``"label"``) and an updated ``"answer"`` tag. The
        original ``question`` object is returned unchanged when no swap is
        needed or possible: the answer cannot be parsed, it already equals
        the target, there are not exactly four texts and labels, or either
        letter is absent from the label list.
    """
    current_answer = extract_answer_from_question(question)
    # Nothing to do when the answer is unknown or already in the right place.
    if not current_answer or current_answer == new_correct_answer:
        return question

    # ``or {}`` also covers an explicit ``"choices": null``.
    choices = question.get("choices") or {}
    current_texts = choices.get("text", [])
    current_labels = choices.get("label", ["A", "B", "C", "D"])
    if len(current_texts) != 4 or len(current_labels) != 4:
        return question

    # ``list.index`` would raise ValueError for a letter that is not among
    # the labels; leave such a question untouched instead of aborting.
    if current_answer not in current_labels or new_correct_answer not in current_labels:
        return question
    current_index = current_labels.index(current_answer)
    new_index = current_labels.index(new_correct_answer)

    # Swap on a copy so the caller's list is not mutated.
    new_texts = list(current_texts)
    new_texts[new_index], new_texts[current_index] = (
        new_texts[current_index],
        new_texts[new_index],
    )

    new_question = question.copy()
    new_question["choices"] = {
        "text": new_texts,
        # Keep the existing label order: the indices above were looked up in
        # it, so resetting to a fixed A-D order could mislabel the answer.
        "label": list(current_labels),
    }
    new_question["answer"] = f"[ANSWER]{new_correct_answer}[/ANSWER]"
    return new_question
def balance_answer_distribution_by_shuffling(questions: List[Dict[str, Any]],
                                             random_seed: Optional[int] = None) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """Balance the A/B/C/D answer distribution by rearranging options.

    Questions are grouped by the letter returned from
    ``extract_answer_from_question``. With ``n`` being the number of
    questions that have a parseable answer, every letter gets a target of
    ``n // 4`` questions and the first ``n % 4`` letters (in A, B, C, D order)
    one more. Randomly chosen questions from over-represented letters are
    passed to ``shuffle_question_options`` so that their correct answer moves
    to an under-represented letter. Progress is reported with ``print``.

    Args:
        questions: Converted questions. The list itself is left untouched;
            the returned list is a shallow copy in which rebalanced entries
            are replaced.
        random_seed: When not None it is passed to ``random.seed``, which
            reseeds the module-level generator of ``random``.

    Returns:
        A tuple ``(balanced_questions, balance_info)``. ``balance_info`` has
        the keys ``original_total``, ``final_total``, ``target_per_answer``,
        ``remainder``, ``final_distribution``, ``max_deviation``,
        ``adjustments_made`` and ``perfectly_balanced`` (True when no letter
        deviates from a 25% share by more than 5 percentage points).
    """
    labels = ["A", "B", "C", "D"]

    if random_seed is not None:
        random.seed(random_seed)

    total_questions = len(questions)

    # Group (index, question) pairs by their current answer letter.
    answer_groups = {label: [] for label in labels}
    for i, question in enumerate(questions):
        answer = extract_answer_from_question(question)
        if answer in answer_groups:
            answer_groups[answer].append((i, question))

    # Only questions with a parseable answer can be rebalanced. Deriving the
    # targets from them keeps surplus and deficit equal in size; counting
    # unparseable questions would inflate the targets and leave the result
    # skewed towards whichever deficits happened to be served first.
    balanceable_total = sum(len(group) for group in answer_groups.values())
    target_per_answer, remainder = divmod(balanceable_total, 4)

    print("\n=== 答案分布平衡 (重排选项法) ===")
    print(f"总题目数: {total_questions}")
    if balanceable_total != total_questions:
        print(f"无法解析答案的题目: {total_questions - balanceable_total} 道 (不参与平衡)")
    print(f"标准分配: 每个选项 {target_per_answer} 道题")
    if remainder > 0:
        print(f"余数: {remainder} 道题 (将分配给前{remainder}个选项)")

    print("\n当前答案分布:")
    for answer in labels:
        count = len(answer_groups[answer])
        ratio = count / balanceable_total if balanceable_total > 0 else 0
        print(f" {answer}: {count} ({ratio*100:.1f}%)")

    # The first ``remainder`` letters each take one extra question.
    target_counts = {
        answer: target_per_answer + (1 if i < remainder else 0)
        for i, answer in enumerate(labels)
    }

    print("\n目标分配:")
    for answer in labels:
        print(f" {answer}: {target_counts[answer]} 道题")

    surplus_questions = []  # (question_index, question, from_answer)
    deficit_needed = []     # one target letter per question still needed
    for answer in labels:
        difference = len(answer_groups[answer]) - target_counts[answer]
        if difference > 0:
            # Over-represented: pick random questions to move away.
            print(f" {answer}: 多 {difference} 道题")
            for q_idx, q in random.sample(answer_groups[answer], difference):
                surplus_questions.append((q_idx, q, answer))
        elif difference < 0:
            # Under-represented: record one slot per missing question.
            needed = -difference
            print(f" {answer}: 少 {needed} 道题")
            deficit_needed.extend([answer] * needed)

    # Shuffle both sides so the pairing below does not favour any letter.
    random.shuffle(surplus_questions)
    random.shuffle(deficit_needed)

    balanced_questions = questions[:]
    print(f"\n开始重新分配 {len(surplus_questions)} 道题:")
    adjustments_made = 0
    for (q_idx, question, from_answer), to_answer in zip(surplus_questions, deficit_needed):
        new_question = shuffle_question_options(question, to_answer)
        # The shuffle returns the question unchanged when it cannot rearrange
        # the options; only count and report moves that really happened.
        if extract_answer_from_question(new_question) != to_answer:
            continue
        balanced_questions[q_idx] = new_question
        adjustments_made += 1
        print(f"{adjustments_made}次调整: 题目{q_idx+1} 答案从 {from_answer} 改为 {to_answer}")

    # Re-count from the final list to verify the outcome.
    final_counter = Counter(
        answer
        for answer in map(extract_answer_from_question, balanced_questions)
        if answer
    )

    print("\n平衡后答案分布:")
    max_deviation = 0
    target_ratio = 0.25
    for answer in labels:
        count = final_counter.get(answer, 0)
        ratio = count / balanceable_total if balanceable_total > 0 else 0
        max_deviation = max(max_deviation, abs(ratio - target_ratio))
        print(f" {answer}: {count} ({ratio*100:.1f}%)")

    balance_info = {
        "original_total": total_questions,
        "final_total": total_questions,  # rebalancing never adds or drops questions
        "target_per_answer": target_per_answer,
        "remainder": remainder,
        "final_distribution": dict(final_counter),
        "max_deviation": max_deviation,
        "adjustments_made": adjustments_made,
        "perfectly_balanced": max_deviation <= 0.05
    }

    if balance_info["perfectly_balanced"]:
        print(f"✅ 完美平衡!最大偏差: {max_deviation*100:.1f}%")
    else:
        print(f"📊 接近平衡,最大偏差: {max_deviation*100:.1f}%")
    print(f"总共调整了 {balance_info['adjustments_made']} 道题的答案")

    return balanced_questions, balance_info
def classify_questions_by_difficulty(questions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
"""
按难度分类题目
@@ -142,29 +314,34 @@ def select_questions_by_ratio(difficulty_groups: Dict[str, List[Dict[str, Any]]]
def batch_convert_questions_with_difficulty_filter(input_file: str,
output_file: str,
selection_ratios: Dict[str, float],
balance_answers: bool = True,
random_seed: Optional[int] = None) -> None:
"""
批量转换题目格式,支持按难度筛选
批量转换题目格式,支持按难度筛选和答案平衡
Args:
input_file: 输入文件路径
output_file: 输出文件路径
selection_ratios: 各难度等级的选择比例
balance_answers: 是否平衡答案分布
random_seed: 随机种子
"""
print("正在加载数据...")
print("=== 批量转换题目(难度筛选 + 答案平衡)===")
print(f"输入文件: {input_file}")
print(f"输出文件: {output_file}")
print(f"答案平衡: {'开启' if balance_answers else '关闭'}")
print(f"随机种子: {random_seed}")
# 判断输入文件格式
# 加载数据
print("\n正在加载数据...")
with open(input_file, 'r', encoding='utf-8') as f:
data = json.load(f)
# 处理两种可能的输入格式
if isinstance(data, dict) and "questions" in data:
# 格式:{"questions": [...], "其他字段": ...}
source_questions = data["questions"]
print(f"检测到完整格式数据,包含其他元数据")
elif isinstance(data, list):
# 格式:[{题目1}, {题目2}, ...]
source_questions = data
print(f"检测到题目列表格式")
else:
@@ -173,7 +350,7 @@ def batch_convert_questions_with_difficulty_filter(input_file: str,
print(f"加载了 {len(source_questions)} 道题目")
# 按难度分类题目
print("正在按难度分类题目...")
print("\n正在按难度分类题目...")
difficulty_groups = classify_questions_by_difficulty(source_questions)
print("题目难度分布:")
@@ -247,42 +424,48 @@ def batch_convert_questions_with_difficulty_filter(input_file: str,
print(f"{i+1}题转换失败: {e}")
conversion_stats["failed"] += 1
print(f"转换完成: {conversion_stats['converted']} 道题目成功转换")
# 对转换后的题目进行答案分布平衡
balance_info = None
if balance_answers and converted_questions:
print("\n正在对转换后的题目进行答案分布平衡...")
balanced_questions, balance_info = balance_answer_distribution_by_shuffling(
converted_questions,
random_seed=random_seed
)
converted_questions = balanced_questions
conversion_stats["final_count"] = len(converted_questions)
# 保存结果
print("正在保存转换结果...")
output_data = {
"questions": converted_questions,
"metadata": {
"total_original_questions": len(source_questions),
"selection_ratios": selection_ratios,
"selection_stats": selection_stats,
"conversion_stats": conversion_stats,
"random_seed": random_seed
}
}
with open(output_file, 'w', encoding='utf-8') as f:
json.dump(converted_questions, f, ensure_ascii=False, indent=2)
# 打印最终统计信息
print(f"\n转换完成!")
print(f"\n=== 转换完成!===")
print(f"选中题目数: {conversion_stats['selected']}")
print(f"单选题: {conversion_stats['multiple_choice']}")
print(f"判断题: {conversion_stats['true_false']}")
print(f"其他类型: {conversion_stats['other']}")
print(f"成功转换: {conversion_stats['converted']}")
print(f"转换失败: {conversion_stats['failed']}")
print(f"最终转换率: {conversion_stats['converted']/conversion_stats['selected']*100:.1f}%")
if balance_answers and balance_info:
print(f"答案平衡后: {conversion_stats.get('final_count', conversion_stats['converted'])}")
print(f"调整题目数: {balance_info['adjustments_made']}")
print(f"最终转换率: {conversion_stats.get('final_count', conversion_stats['converted'])/conversion_stats['selected']*100:.1f}%")
else:
print(f"最终转换率: {conversion_stats['converted']/conversion_stats['selected']*100:.1f}%")
print(f"结果已保存到: {output_file}")
def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, int]:
"""
验证转换后的题目格式
Args:
questions: 转换后的题目列表
Returns:
验证统计信息
"""
stats = {
"total": len(questions),
@@ -323,29 +506,9 @@ def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, i
stats["valid"] += 1
else:
stats["invalid"] += 1
print(f"{i+1}题格式无效")
return stats
def create_difficulty_config_template():
    """Print a sample selection-ratio configuration and return it.

    Returns:
        Dict mapping each difficulty key to the fraction (0.0-1.0) of
        questions to select for that difficulty.
    """
    template = dict(
        hard_early_stop=1.0,   # hard questions: select 100%
        easy_all_correct=0.1,  # easy questions: select 10%
        mixed=0.5,             # mixed questions: select 50%
        unknown=0.0,           # unknown difficulty: select 0%
    )

    print("难度选择比例配置模板:")
    print(json.dumps(template, indent=2))

    # Legend explaining how the ratio values are to be read.
    legend = (
        "\n说明:",
        "- 1.0 = 100% (全部选择)",
        "- 0.5 = 50% (选择一半)",
        "- 0.1 = 10% (选择10%)",
        "- 0.0 = 0% (不选择)",
    )
    for line in legend:
        print(line)

    return template
def main():
"""主函数"""
# 文件路径配置
@@ -353,17 +516,19 @@ def main():
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
# 难度选择比例配置
# 可以根据需要调整这些比例
SELECTION_RATIOS = {
"hard_early_stop": 1.0, # 困难题选择100% (全部)
"easy_all_correct": 0.0, # 简单题选择0%
"mixed": 0.0, # 混合题选择0%
"hard_early_stop": 1.0, # 困难题选择100%
"easy_all_correct": 0.35, # 简单题选择35%
"mixed": 0.0, # 混合题选择0%
"unknown": 0.0 # 未知难度不选择
}
# 随机种子,保证结果可复现
RANDOM_SEED = 42
# 是否启用答案平衡
BALANCE_ANSWERS = True
try:
# 显示配置信息
print("=== 难度筛选配置 ===")
@@ -371,14 +536,16 @@ def main():
for difficulty, ratio in SELECTION_RATIOS.items():
print(f" {difficulty}: {ratio*100:.1f}%")
print(f"随机种子: {RANDOM_SEED}")
print(f"启用答案平衡: {BALANCE_ANSWERS}")
print()
# 批量转换(包含难度筛选)
# 批量转换(包含难度筛选和答案平衡
batch_convert_questions_with_difficulty_filter(
INPUT_FILE,
OUTPUT_FILE,
SELECTION_RATIOS,
RANDOM_SEED
balance_answers=BALANCE_ANSWERS,
random_seed=RANDOM_SEED
)
# 验证转换结果
@@ -386,19 +553,7 @@ def main():
with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
result_data = json.load(f)
# 检查输出文件格式
if "questions" in result_data:
converted_questions = result_data["questions"]
metadata = result_data.get("metadata", {})
print("\n=== 元数据信息 ===")
if metadata:
print(f"原始题目总数: {metadata.get('total_original_questions', 'N/A')}")
print(f"随机种子: {metadata.get('random_seed', 'N/A')}")
else:
converted_questions = result_data
validation_stats = validate_converted_questions(converted_questions)
validation_stats = validate_converted_questions(result_data)
print(f"\n=== 验证结果 ===")
print(f"总题目数: {validation_stats['total']}")
@@ -412,85 +567,27 @@ def main():
print(f"格式正确率: {validation_stats['valid']/validation_stats['total']*100:.1f}%")
# 验证最终答案分布
if BALANCE_ANSWERS:
print(f"\n=== 最终答案分布验证 ===")
final_answers = []
for q in result_data:
answer = extract_answer_from_question(q)
if answer:
final_answers.append(answer)
final_counter = Counter(final_answers)
total = len(final_answers)
for answer in ["A", "B", "C", "D"]:
count = final_counter.get(answer, 0)
ratio = count / total if total > 0 else 0
print(f" {answer}: {count} ({ratio*100:.1f}%)")
except Exception as e:
print(f"程序执行失败: {e}")
import traceback
traceback.print_exc()
def interactive_config():
    """Ask on stdin for the selection percentage of every difficulty level.

    Each prompt is repeated until the reply parses as a number between 0 and
    100; a single trailing ``%`` is accepted and ignored.

    Returns:
        Dict mapping each difficulty key to its ratio as a fraction
        (the entered percentage divided by 100).
    """
    print("=== 交互式难度选择配置 ===")

    # Insertion order of this mapping is the order in which levels are asked.
    display_names = {
        "hard_early_stop": "困难题(答错早停)",
        "easy_all_correct": "简单题(全部答对)",
        "mixed": "混合题(部分对错)",
        "unknown": "未知难度题"
    }

    chosen = {}
    for key, label in display_names.items():
        prompt = f"请输入{label}的选择比例 (0-100%): "
        answered = False
        while not answered:
            try:
                raw = input(prompt).strip()
                percent = float(raw[:-1] if raw.endswith('%') else raw)
            except ValueError:
                print("请输入有效的数值")
                continue
            if not 0 <= percent <= 100:
                print("请输入0-100之间的数值")
                continue
            chosen[key] = percent / 100.0
            answered = True

    print("\n配置结果:")
    for key, value in chosen.items():
        print(f" {display_names.get(key, key)}: {value*100:.1f}%")
    return chosen
def test_difficulty_distribution(input_file: str):
    """Print how the questions in a JSON file are spread over difficulty groups.

    The file may contain either a list of questions or a dict with a
    ``"questions"`` list. For every group returned by
    ``classify_questions_by_difficulty`` the total count, the number of
    multiple-choice questions and the share of all questions are printed.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to analyse.

    Returns:
        None. Any other top-level JSON shape is reported and the function
        returns early.
    """
    print(f"正在分析文件难度分布: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Accept both supported input layouts.
    if isinstance(data, dict) and "questions" in data:
        questions = data["questions"]
    elif isinstance(data, list):
        questions = data
    else:
        print("不支持的文件格式")
        return

    difficulty_groups = classify_questions_by_difficulty(questions)
    total = len(questions)

    print("\n难度分布分析:")
    print(f"总题目数: {total}")
    for difficulty, question_list in difficulty_groups.items():
        # ``or {}`` tolerates an explicit ``"generated_options": null``.
        mc_count = sum(
            1 for q in question_list
            if (q.get("generated_options") or {}).get("question_type") == "multiple_choice"
        )
        # Guard the share against an empty question list, which would
        # otherwise divide by zero if the classifier still yields groups.
        share = len(question_list) / total * 100 if total > 0 else 0
        print(f" {difficulty}:")
        print(f" 总数: {len(question_list)}")
        print(f" 单选题: {mc_count}")
        print(f" 占比: {share:.1f}%")
if __name__ == "__main__":
    # Optionally inspect the difficulty distribution first
    # test_difficulty_distribution("/path/to/your/input/file.json")
    # Optionally choose the selection ratios interactively
    # ratios = interactive_config()
    # Run the main program
    main()
    # Show the configuration template
    # create_difficulty_config_template()