Files
MatBench/layer2/process/step4.py
2025-05-28 11:00:24 +08:00

79 lines
3.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
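Process the 821 English questions.
1. Determine whether a question contains several sub-questions and split it into complete sub-questions (drop the reasoning process, keep only the final result).
2. Determine the question type.
3. Turn each question into a multiple-choice question.
For calculation questions, randomly generate three similar values near the numeric answer as the wrong options.
For short-answer questions, use the answers of other questions that are closest to the reference answer as the three wrong options.
4. Randomly shuffle the correct and wrong options to build the A/B/C/D multiple-choice form.
5. Add the prompt and wrap the multiple-choice answer in [ANSWER]<answer>[/ANSWER] tags.
6. Score the model.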
"""
import json
import random
from typing import List, Dict
# Instruction text wrapped around every question. Kept as plain constants so
# the prompt is assembled without nesting quoted literals (and a backslash
# escape) inside an f-string, which only parses on Python 3.12+.
# NOTE(review): there is no separator between the prefix and the question, nor
# between the last option and the suffix; the emitted text is kept exactly as
# before so existing datasets stay reproducible -- confirm whether a space or
# newline was intended.
_PROMPT_PREFIX = "The following is a question about Fundamentals of Materials Science"
_PROMPT_SUFFIX = (
    "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER].\n"
    "For example, '[ANSWER]<answer>[/ANSWER]', where <answer> is comma- or space-separated list of the correct letters. "
    "Always answer in exactly this format of comma-separated letters between the two tags, even if you are unsure. "
    "We require this because we use automatic parsing."
)
_WRONG_ANSWER_KEYS = ('wrong_answers_1', 'wrong_answers_2', 'wrong_answers_3')


def _has_content(value: object) -> bool:
    """Return True when *value* can be shown as an option.

    Numbers are always kept (a numeric answer of 0 is legitimate); every other
    value is kept only when it is truthy, so None and '' are rejected.
    """
    if isinstance(value, (int, float)) and not isinstance(value, bool):
        return True
    return bool(value)


def process_json_file(file_path: str) -> List[Dict]:
    """Load a JSON list of questions and turn each one into a multiple-choice item.

    For every item the correct answer and the non-empty values of
    'wrong_answers_1' .. 'wrong_answers_3' are shuffled and labelled (A), (B), ...
    The item's 'question' becomes prompt prefix + original question + options +
    prompt suffix, its 'answer' becomes '[ANSWER]<letter>[/ANSWER]', and the
    'wrong_answers_*' keys are removed. Items are modified in place.

    Args:
        file_path: Path of a UTF-8 JSON file holding a list of dicts, each with
            at least 'question' and 'answer'.

    Returns:
        The loaded list with every item rewritten as described above.

    Raises:
        KeyError: If an item has no 'question' or 'answer' key.
        ValueError: If an item's 'answer' is empty (None, '' or similar).
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    for item in data:
        answer = item['answer']
        if not _has_content(answer):
            raise ValueError(f"item has an empty 'answer': {item!r}")

        # Carry an explicit "is correct" flag with every option instead of
        # searching for the answer by value after the shuffle: the lookup then
        # cannot be confused by a distractor that equals the answer.
        options = [(True, answer)]
        options.extend(
            (False, wrong)
            for wrong in (item.get(key, '') for key in _WRONG_ANSWER_KEYS)
            if _has_content(wrong)
        )
        # The permutation depends only on the list length, so a given random
        # seed yields the same option order as shuffling the bare values.
        random.shuffle(options)

        correct_index = next(
            index for index, (is_correct, _) in enumerate(options) if is_correct
        )
        correct_answer_letter = chr(ord('A') + correct_index)  # A, B, C, or D

        options_text = " ".join(
            f"({chr(ord('A') + index)}){text}"
            for index, (_, text) in enumerate(options)
        )

        item['question'] = (
            f"{_PROMPT_PREFIX}{item['question']} {options_text}{_PROMPT_SUFFIX}"
        )
        item['answer'] = f"[ANSWER]{correct_answer_letter}[/ANSWER]"

        # The distractors now live inside the question text.
        for key in _WRONG_ANSWER_KEYS:
            item.pop(key, None)

    return data
def save_processed_data(data: List[Dict], output_path: str) -> None:
    """Save the processed items to a new JSON file.

    The file is written as UTF-8 with a two-space indent and non-ASCII
    characters kept as-is (not escaped).

    Args:
        data: The processed question items to serialize.
        output_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If *data* contains a value that JSON cannot represent. The
            destination file is left untouched in that case.
    """
    # Serialize before opening the file: mode 'w' truncates on open, so a
    # failure while streaming with json.dump would wipe a previous result and
    # leave a partial document behind.
    serialized = json.dumps(data, ensure_ascii=False, indent=2)
    with open(output_path, 'w', encoding='utf-8') as f:
        f.write(serialized)
# Usage example: python step4.py [input.json] [output.json]
if __name__ == "__main__":
    import sys

    # Both paths can be overridden on the command line; without arguments the
    # script falls back to the values it has always used.
    input_file = sys.argv[1] if len(sys.argv) > 1 else "/home/ubuntu/50T/fsy/5_1.json"
    output_file = sys.argv[2] if len(sys.argv) > 2 else "output.json"
    try:
        processed_data = process_json_file(input_file)
        save_processed_data(processed_data, output_file)
    except Exception as e:
        # Top-level boundary: report the failure on stderr and exit non-zero
        # so shells and pipelines can tell the run did not succeed.
        print(f"处理过程中出现错误: {e}", file=sys.stderr)
        sys.exit(1)
    else:
        print(f"处理完成!结果已保存到 {output_file}")