格式转换

This commit is contained in:
lzy
2025-05-29 20:18:57 +08:00
parent 1156bfdd7c
commit 6c87af5614
14 changed files with 11996 additions and 13 deletions

View File

@@ -0,0 +1,244 @@
import json
from typing import Dict, Any, List, Optional
def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """Convert one source question record into the target question format.

    Only records whose ``generated_options["question_type"]`` equals
    ``"multiple_choice"`` are converted; every other record is rejected.

    Args:
        source_data: Source record. The fields read are ``choice_question``
            and ``generated_options`` (its ``question_type``, ``options``
            and ``correct_answer`` entries).

    Returns:
        A dict with the keys ``question``, ``choices`` (parallel ``text`` and
        ``label`` lists), ``answer`` and ``prompt``; or ``None`` when the
        record is not a well-formed four-option multiple-choice question.
    """
    labels = ["A", "B", "C", "D"]

    if not isinstance(source_data, dict):
        return None

    # A missing or non-dict value (e.g. JSON null) cannot be converted;
    # reject it instead of failing on ``.get`` below.
    generated_options = source_data.get("generated_options")
    if not isinstance(generated_options, dict):
        return None

    # Only multiple-choice questions are handled; true/false and any other
    # type are skipped.
    if generated_options.get("question_type") != "multiple_choice":
        return None

    question = source_data.get("choice_question", "")
    if not question:
        return None

    # Require exactly the keys A-D. Checking only the number of options would
    # let a mapping such as {A, B, C, E} through and emit an empty choice text.
    options = generated_options.get("options", {})
    if not isinstance(options, dict) or set(options) != set(labels):
        return None

    correct_answer = generated_options.get("correct_answer", "")
    if correct_answer not in labels:
        return None

    return {
        "question": question,
        "choices": {
            "text": [options[label] for label in labels],
            "label": list(labels),
        },
        "answer": f"[ANSWER]{correct_answer}[/ANSWER]",
        "prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
    }
def batch_convert_questions(input_file: str, output_file: str) -> None:
    """Convert every question in a JSON file and save the converted list.

    The input file is loaded as JSON and iterated item by item. Each item is
    passed to ``convert_to_target_format``; truthy results are collected and
    written to ``output_file`` as a JSON list. Per-type counts and the
    conversion totals are printed at the end.

    Items for which the converter returns a falsy value, or for which any
    exception is raised, are counted as "failed".

    Args:
        input_file: Path of the source JSON file.
        output_file: Path the converted JSON list is written to.
    """
    print("正在加载数据...")
    with open(input_file, 'r', encoding='utf-8') as f:
        source_questions = json.load(f)
    print(f"加载了 {len(source_questions)} 道题目")

    converted_questions = []
    conversion_stats = {
        "total": len(source_questions),
        "multiple_choice": 0,
        "true_false": 0,
        "other": 0,
        "converted": 0,
        "failed": 0
    }

    for i, question in enumerate(source_questions):
        try:
            # Tally the question type before attempting the conversion.
            generated_options = question.get("generated_options", {})
            question_type = generated_options.get("question_type", "unknown")
            if question_type == "multiple_choice":
                conversion_stats["multiple_choice"] += 1
            elif question_type == "true_false":
                conversion_stats["true_false"] += 1
            else:
                conversion_stats["other"] += 1

            converted = convert_to_target_format(question)
            if converted:
                converted_questions.append(converted)
                conversion_stats["converted"] += 1
            else:
                conversion_stats["failed"] += 1
        except Exception as e:
            # Best effort: one malformed record must not abort the whole batch.
            print(f"{i+1}题转换失败: {e}")
            conversion_stats["failed"] += 1

    print("正在保存转换结果...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_questions, f, ensure_ascii=False, indent=2)

    # An empty input has no meaningful rate; report 0.0% instead of raising
    # ZeroDivisionError after the output has already been written.
    total = conversion_stats['total']
    conversion_rate = conversion_stats['converted'] / total * 100 if total else 0.0

    print(f"\n转换完成!")
    print(f"总题目数: {conversion_stats['total']}")
    print(f"单选题: {conversion_stats['multiple_choice']}")
    print(f"判断题: {conversion_stats['true_false']}")
    print(f"其他类型: {conversion_stats['other']}")
    print(f"成功转换: {conversion_stats['converted']}")
    print(f"转换失败: {conversion_stats['failed']}")
    print(f"转换率: {conversion_rate:.1f}%")
    print(f"结果已保存到: {output_file}")
def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, int]:
    """Check each converted question against the target format.

    A question is valid when it has a non-blank ``question`` string, a
    ``choices`` dict holding four non-blank ``text`` entries with ``label``
    equal to ``["A", "B", "C", "D"]``, and an ``answer`` of the form
    ``[ANSWER]X[/ANSWER]`` where ``X`` is one of those labels. A message is
    printed for every invalid question.

    Args:
        questions: Converted question dicts.

    Returns:
        Counters keyed by ``total``, ``valid``, ``invalid``,
        ``missing_question``, ``invalid_choices`` and ``invalid_answer``.
    """
    expected_labels = ["A", "B", "C", "D"]
    open_tag = "[ANSWER]"
    close_tag = "[/ANSWER]"

    counters = dict.fromkeys(
        ("valid", "invalid", "missing_question", "invalid_choices", "invalid_answer"),
        0,
    )
    counters = {"total": len(questions), **counters}

    for position, item in enumerate(questions, start=1):
        failures = []

        # The question text must be present and not blank.
        if not item.get("question", "").strip():
            failures.append("missing_question")

        # Choices: four texts, the canonical labels, and no blank text.
        choice_block = item.get("choices", {})
        texts = choice_block.get("text", [])
        labels = choice_block.get("label", [])
        choices_ok = (
            len(texts) == 4
            and len(labels) == 4
            and labels == expected_labels
            and all(str(entry).strip() for entry in texts)
        )
        if not choices_ok:
            failures.append("invalid_choices")

        # The answer must be a single label wrapped in the answer tags.
        tagged = item.get("answer", "")
        answer_ok = (
            tagged.startswith(open_tag)
            and tagged.endswith(close_tag)
            and tagged[len(open_tag):-len(close_tag)] in expected_labels
        )
        if not answer_ok:
            failures.append("invalid_answer")

        for reason in failures:
            counters[reason] += 1

        if failures:
            counters["invalid"] += 1
            print(f"{position}题格式无效")
        else:
            counters["valid"] += 1

    return counters
def main(input_file: Optional[str] = None, output_file: Optional[str] = None) -> None:
    """Run the batch conversion, then reload the output file and validate it.

    Any exception raised along the way is caught and reported with a single
    printed message, so this entry point never propagates an error.

    Args:
        input_file: Path of the source JSON file. Defaults to the built-in
            ``INPUT_FILE`` path when omitted.
        output_file: Path the converted JSON is written to and then read back
            from. Defaults to the built-in ``OUTPUT_FILE`` path when omitted.
    """
    # Default file locations, used when the caller passes no paths.
    INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions.json"
    OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
    if input_file is None:
        input_file = INPUT_FILE
    if output_file is None:
        output_file = OUTPUT_FILE

    try:
        batch_convert_questions(input_file, output_file)

        # Validate what was actually written by reading the file back.
        print("\n正在验证转换结果...")
        with open(output_file, 'r', encoding='utf-8') as f:
            converted_questions = json.load(f)
        validation_stats = validate_converted_questions(converted_questions)

        print(f"\n验证结果:")
        print(f"总题目数: {validation_stats['total']}")
        print(f"格式正确: {validation_stats['valid']}")
        print(f"格式错误: {validation_stats['invalid']}")
        if validation_stats['invalid'] > 0:
            print(f"  缺少题目: {validation_stats['missing_question']}")
            print(f"  选项格式错误: {validation_stats['invalid_choices']}")
            print(f"  答案格式错误: {validation_stats['invalid_answer']}")

        # With no converted questions there is no ratio to compute; report
        # 0.0% rather than failing with a division by zero.
        total = validation_stats['total']
        valid_rate = validation_stats['valid'] / total * 100 if total else 0.0
        print(f"格式正确率: {valid_rate:.1f}%")
    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"程序执行失败: {e}")
def test_single_conversion():
    """Convert one hard-coded sample record and print the outcome.

    Prints a success message followed by the converted record as indented
    JSON, or a failure message when the converter returns a falsy value.
    """
    # Generated multiple-choice payload attached to the sample record.
    sample_options = {
        "question_type": "multiple_choice",
        "options": {"A": "80%", "B": "90%", "C": "50%", "D": "75%"},
        "correct_answer": "B",
        "explanation": "正确答案90%基于1) fcc中四面体间隙数量是阳离子的2倍2) 20 mol% CaO掺杂产生20%氧空位3) 被占据间隙位比例=(原始占据数-空位数)/总间隙位数。",
    }

    sample_record = {
        "idx": 3154,
        "question": "In stable ZrO2 material, cations form an fcc structure, and anions occupy tetrahedral interstitial sites. If 20 mol% CaO is added, calculate the percentage of occupied tetrahedral interstitial sites.",
        "answer": "Zr4+ and Ca2+ cations occupy the face-centered cubic lattice sites. 100 cations can form 25 unit cells, with a total of 25×8=200 tetrahedral interstitial sites. Therefore, the percentage of occupied tetrahedral interstitial sites is 180÷200=90%.",
        "question_type": "calculation",
        "correct_option": "90%",
        "choice_question": "In stable ZrO2 material, cations form an fcc structure, and anions occupy tetrahedral interstitial sites. If 20 mol% CaO is added, what is the percentage of occupied tetrahedral interstitial sites?",
        "generated_options": sample_options,
        "generation_status": "success",
    }

    converted = convert_to_target_format(sample_record)
    if not converted:
        print("转换失败!")
        return

    print("转换成功!")
    print(json.dumps(converted, ensure_ascii=False, indent=2))
if __name__ == "__main__":
    # Script entry point: run the full batch conversion and validation.
    # For a quick single-record check, call test_single_conversion() here
    # instead.
    main()