# MatBench/layer2/process/step0.py

# step0: split each text segment into a question part and an answer part
import json

# Load the raw QA segments; the rest of the script iterates over `data`
# and reads the "idx" and "segment" fields of each record.
input_file_path = '/home/ubuntu/50T/fsy/benchmark/dataset-ours/[Solution]qa_segment_all.json'
with open(input_file_path, mode='r', encoding='utf-8') as source:
    data = json.loads(source.read())

# Walk every record and split its segment into a question and an answer.
def _split_qa(segment, markers=("Solution", "Answer")):
    """Split *segment* into ``(question, answer)`` at the first marker found.

    Markers are tried in the given order, so "Solution" takes priority over
    "Answer" even when "Answer" occurs earlier in the text. Matching is a
    plain, case-sensitive substring search, and the split happens at the
    first occurrence of the chosen marker. Both parts are whitespace-stripped
    and the marker itself is dropped.

    Returns the placeholder pair ``("000", "000")`` when no marker occurs.
    """
    for marker in markers:
        head, found, tail = segment.partition(marker)
        if found:  # partition() yields an empty separator when marker is absent
            return head.strip(), tail.strip()
    return "000", "000"


processed_data = []
for item in data:
    # `or ""` also covers an explicit JSON null for "segment", which would
    # otherwise make the substring search raise TypeError; such records get
    # the same "000" placeholder as records without any marker.
    question, answer = _split_qa(item.get("segment") or "")
    processed_data.append({
        "idx": item.get("idx"),
        "question": question,
        "answer": answer,
    })

# Persist the processed records as pretty-printed UTF-8 JSON. The path is
# relative, so the file is created in the current working directory.
output_file_path = '[Solution]qa_segment.json'
with open(output_file_path, mode='w', encoding='utf-8') as sink:
    json.dump(processed_data, sink, indent=4, ensure_ascii=False)

# Bare expression: echoes the path in an interactive session; it has no
# effect when the file is run as a script.
output_file_path