# MatBench/layer1/ALL/SciEval-process.py

import json

# Convert SciEval multiple-choice items into the question/choices/answer layout.
def _split_choices(question, label_order=("A", "B", "C", "D")):
    """Split a raw question string into its stem and its option texts.

    The question is expected to look like
    ``"<stem>\\n\\nA. <text>\\nB. <text>\\nC. <text>\\nD. <text>\\n\\nAnswer:"``.

    Args:
        question: Raw question string containing the stem and the options.
        label_order: Option labels, in the order they appear in the question.

    Returns:
        A ``(stem, texts)`` tuple where ``texts`` holds one stripped option
        text per label in ``label_order``.

    Raises:
        ValueError: If the marker of any option label cannot be found.
    """
    # partition() cuts only at the FIRST marker, so a later repetition of the
    # marker inside an option text is kept instead of being silently dropped.
    stem, marker, remainder = question.partition(f"\n\n{label_order[0]}.")
    if not marker:
        raise ValueError(
            f"option {label_order[0]!r} not found in question: {question!r}"
        )

    texts = []
    for next_label in label_order[1:]:
        # Everything before the next label's marker is the current option.
        # The current label itself was already consumed by the previous
        # partition, so no label-stripping replace() is needed (a global
        # replace would also eat e.g. "A." at the end of "DNA.").
        option, marker, remainder = remainder.partition(f"\n{next_label}.")
        if not marker:
            raise ValueError(
                f"option {next_label!r} not found in question: {question!r}"
            )
        texts.append(option.strip())

    # The last option runs to the end of the string; drop the trailing
    # "\n\nAnswer:" cue first, then strip, so no stray whitespace is left.
    texts.append(remainder.replace("\n\nAnswer:", "").strip())
    return stem.strip(), texts


def transform_json(file_path):
    """Load a SciEval JSON file and convert every item to the target format.

    Args:
        file_path: Path to a UTF-8 JSON file holding a list of objects, each
            with a ``"question"`` string (stem followed by options A-D) and
            an ``"answer"`` whose first element is the correct letter.

    Returns:
        A list of dicts with the keys ``"question"``, ``"choices"`` (parallel
        ``"text"`` and ``"label"`` lists), ``"answer"`` (the letter wrapped in
        ``[ANSWER]...[/ANSWER]``) and ``"prompt"``.

    Raises:
        ValueError: If an item's question does not contain all four option
            markers; the message includes the index of the offending item.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    label_order = ("A", "B", "C", "D")
    new_json = []

    for index, item in enumerate(data):
        try:
            new_question, texts = _split_choices(item["question"], label_order)
        except ValueError as err:
            raise ValueError(f"item {index}: {err}") from err

        # First element of a list answer, or first character of a string one.
        answer = item["answer"][0]

        transformed_data = {
            "question": new_question,
            "choices": {
                "text": texts,
                "label": list(label_order),
            },
            "answer": f"[ANSWER]{answer}[/ANSWER]",
            "prompt": "You MUST include the letter(s) of the correct answer (separated by comma if there are many) within the following tags: [ANSWER] and [/ANSWER]. No explanations and other information. Only return the '[ANSWER]<answer>[/ANSWER]'. We require this because we use automatic parsing."
        }
        new_json.append(transformed_data)

    return new_json

# Location of the raw SciEval split and of the converted benchmark file.
input_path = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/SciEval-valid-mat.json"
output_path = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/13-Scieval-val.json"

# Convert the whole dataset first; the output file is only opened afterwards.
transformed_data = transform_json(input_path)

# Keep non-ASCII characters readable in the output instead of \u-escaping them.
with open(output_path, mode="w", encoding="utf-8") as out_file:
    json.dump(transformed_data, out_file, indent=2, ensure_ascii=False)