# MatBench/layer1/ALL/QASC-process.py

import json

# Helpers that convert QASC records (loaded from a JSON file) into the benchmark format.
def transform_choices(old_choices):
    """Turn a list of per-choice dicts into a dict of parallel lists.

    Args:
        old_choices: A re-iterable collection (it is traversed twice) of
            mappings, each providing a ``"text"`` and a ``"label"`` entry.

    Returns:
        A dict with two keys, ``"text"`` and ``"label"`` (in that order),
        each mapped to the list of corresponding values in input order.

    Raises:
        KeyError: If any choice lacks ``"text"`` or ``"label"``.
    """
    columns = {}
    # Collect every "text" value first, then every "label" value.
    for field in ("text", "label"):
        columns[field] = [entry[field] for entry in old_choices]
    return columns

def transform_json(file_path):
    """Load records from a JSON file and reshape each into a benchmark entry.

    The file is opened as UTF-8 text and parsed with ``json.load``. Each
    element of the parsed top-level value is read via
    ``item["question"]["stem"]``, ``item["question"]["choices"]`` and
    ``item["answerKey"]``.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        A list with one dict per input element, holding the keys
        ``"question"`` (the stem), ``"choices"`` (the result of
        ``transform_choices``), ``"answer"`` (the answer key wrapped in
        ``[ANSWER]...[/ANSWER]``) and ``"prompt"`` (a fixed instruction
        string shared by all entries).
    """
    # The instruction text is identical for every entry, so define it once.
    instruction = "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing."

    def convert(record):
        # Read the source fields in a fixed order: stem, choices, answer key.
        body = record["question"]
        stem = body["stem"]
        raw_choices = body["choices"]
        key = record["answerKey"]

        # Build the new entry; key order here is the order written to JSON.
        return {
            "question": stem,
            "choices": transform_choices(raw_choices),
            "answer": f"[ANSWER]{key}[/ANSWER]",
            "prompt": instruction,
        }

    with open(file_path, 'r', encoding='utf-8') as source:
        records = json.load(source)

    return [convert(record) for record in records]

# Source QASC file and destination for the converted benchmark file.
input_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/QASC-dev-mat.json'
output_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/10-QASC.json'

# Convert every record, then write the result as indented, non-ASCII-escaped JSON.
transformed_data = transform_json(input_path)

with open(output_path, mode='w', encoding='utf-8') as out_file:
    json.dump(
        transformed_data,
        out_file,
        ensure_ascii=False,
        indent=2,
    )