# MatBench/layer1/ALL/sciq-process.py

import json
import random

def process_json_file(input_file, output_file, *, seed=None):
    """Convert question records into a shuffled multiple-choice format.

    The input file must contain a JSON array of objects, each providing the
    keys ``question``, ``correct_answer``, ``distractor1``, ``distractor2``
    and ``distractor3``. For every record the four answer texts are shuffled,
    labelled ``A``-``D``, and written out together with the letter of the
    correct answer wrapped in ``[ANSWER]...[/ANSWER]`` tags and a fixed
    instruction prompt.

    Args:
        input_file: Path of the UTF-8 JSON file to read.
        output_file: Path of the UTF-8 JSON file to write (overwritten).
        seed: Optional seed for the option shuffle. When given, a private
            ``random.Random(seed)`` is used so repeated runs produce the same
            option order. When ``None`` (the default), the module-level
            ``random.shuffle`` is used, exactly as before.

    Returns:
        The list of transformed records that was written to ``output_file``.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    # Loop-invariant instruction text attached to every record.
    prompt = "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing."

    # Keep using the global generator unless the caller asks for a
    # reproducible order, so callers that seed ``random`` themselves still work.
    shuffle = random.shuffle if seed is None else random.Random(seed).shuffle

    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    new_json = []
    for item in data:
        correct_answer = item['correct_answer']
        options = [
            correct_answer,
            item['distractor1'],
            item['distractor2'],
            item['distractor3'],
        ]
        # Shuffle the option order in place.
        shuffle(options)

        # One label per option: A, B, C, D.
        labels = [chr(ord('A') + i) for i in range(len(options))]

        # Letter of the correct answer after shuffling. If a distractor has
        # the same text as the correct answer, the first occurrence wins.
        correct_letter = labels[options.index(correct_answer)]

        new_json.append({
            "question": item["question"],
            "choices": {"text": options, "label": labels},
            "answer": f"[ANSWER]{correct_letter}[/ANSWER]",
            "prompt": prompt,
        })

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(new_json, f, indent=2, ensure_ascii=False)

    return new_json

# Example usage: process the hard-coded input file when run as a script.
if __name__ == "__main__":
    input_file = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/sciq-val-mat.json"
    output_file = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/15-sciq-val.json"

    # The transformed records are also kept in memory for interactive inspection.
    processed_data = process_json_file(
        input_file=input_file,
        output_file=output_file,
    )