# File: MatBench/layer1/ALL/sciq-process.py
# Last modified: 2025-05-28 10:55:34 +08:00
#
# 53 lines, 1.8 KiB, Python

import json
import random
def _build_record(item, rng):
    """Convert one source item into a shuffled multiple-choice record.

    Args:
        item: Mapping with the keys ``question``, ``correct_answer``,
            ``distractor1``, ``distractor2`` and ``distractor3``.
        rng: Object providing ``shuffle(list)`` (the ``random`` module or a
            ``random.Random`` instance).

    Returns:
        A dict with the keys ``question``, ``choices`` (``text`` and
        ``label`` lists), ``answer`` and ``prompt``.

    Raises:
        KeyError: If ``item`` lacks one of the required keys.
    """
    options = [
        item['correct_answer'],
        item['distractor1'],
        item['distractor2'],
        item['distractor3'],
    ]
    # Shuffle so the correct answer does not always sit in position A.
    rng.shuffle(options)

    # Locate the correct answer after shuffling and map 0..3 -> 'A'..'D'.
    correct_letter = chr(ord('A') + options.index(item['correct_answer']))
    labels = [chr(ord('A') + i) for i in range(len(options))]

    return {
        "question": item["question"],
        "choices": {"text": options, "label": labels},
        "answer": f"[ANSWER]{correct_letter}[/ANSWER]",
        "prompt": "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing.",
    }


def process_json_file(input_file, output_file, seed=None):
    """Rewrite a JSON list of question items as shuffled multiple-choice records.

    Reads ``input_file`` (a JSON array of items), shuffles the four options of
    every item, records which letter holds the correct answer, writes the
    resulting list to ``output_file`` as indented UTF-8 JSON and returns it.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write.
        seed: Optional seed for the option shuffle. When given, a dedicated
            ``random.Random(seed)`` is used so repeated runs produce the same
            output. When ``None`` (the default), the module-level ``random``
            state is used.

    Returns:
        The list of transformed records that was written to ``output_file``.

    Raises:
        KeyError: If an item lacks one of the required keys.
    """
    # Both the random module and random.Random expose shuffle().
    rng = random if seed is None else random.Random(seed)

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_json = [_build_record(item, rng) for item in data]

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(new_json, f, indent=2, ensure_ascii=False)
    return new_json
# Example usage. Optional positional command-line arguments override the
# default paths: argv[1] is the input file, argv[2] is the output file.
if __name__ == "__main__":
    import sys

    input_file = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/sciq-val-mat.json"
    output_file = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/15-sciq-val.json"
    if len(sys.argv) > 1:
        input_file = sys.argv[1]
    if len(sys.argv) > 2:
        output_file = sys.argv[2]
    processed_data = process_json_file(input_file, output_file)