"""Convert SciEval-style multiple-choice records into a question/choices layout.

Each input record holds the question stem and its lettered options in a single
``"question"`` string. This script splits that string into the stem and the
individual option texts and writes the result as a new JSON file.
"""

import json

# Option letters, in the order they are expected to appear in the question text.
_LABEL_ORDER = ("A", "B", "C", "D")

# Marker that separates the question stem from the first option.
_FIRST_OPTION_MARKER = "\n\nA."

# Trailing cue that follows the last option and is not part of its text.
_ANSWER_CUE = "\n\nAnswer:"

# Instruction stored verbatim with every transformed record.
_PROMPT = (
    "You MUST include the letter(s) of the correct answer (separated by "
    "comma if there are many) within the following tags: [ANSWER] and "
    "[/ANSWER]. No explanations and other information. Only return the "
    "'[ANSWER][/ANSWER]'. We require this because we use automatic parsing."
)


def _split_options(options_block):
    """Split an options block into one text per label in ``_LABEL_ORDER``.

    Args:
        options_block: Text that starts with ``"A."`` and contains each
            following option introduced by a newline plus ``"<label>."``.

    Returns:
        A list of option texts, label prefix removed and surrounding
        whitespace stripped, in label order.

    Raises:
        ValueError: If the marker for one of the expected labels is missing.
    """
    texts = []
    remaining = options_block
    last_index = len(_LABEL_ORDER) - 1

    for index, label in enumerate(_LABEL_ORDER):
        prefix_len = len(label) + 1  # the label letter plus the trailing "."

        if index == last_index:
            # Nothing follows the last option except the answer cue.
            option = remaining[prefix_len:].strip()
            option = option.replace(_ANSWER_CUE, "")
        else:
            next_label = _LABEL_ORDER[index + 1]
            # Split on the first marker only, so a later repeat of the same
            # marker stays in the remaining text instead of being discarded.
            head, marker, tail = remaining.partition(f"\n{next_label}.")
            if not marker:
                raise ValueError(
                    f"option {next_label!r} not found in: {options_block!r}"
                )
            # Slice off only the leading label; the option text itself may
            # legitimately contain the same "<label>." sequence.
            option = head[prefix_len:].strip()
            remaining = f"{next_label}.{tail}"

        texts.append(option)

    return texts


def _transform_item(item):
    """Build the transformed record for one input item.

    Args:
        item: Mapping with a ``"question"`` string (stem followed by the
            options) and an ``"answer"`` value whose first element is used
            as the answer (presumably a list of letters; verify against
            the dataset).

    Returns:
        A dict with ``"question"``, ``"choices"``, ``"answer"`` and
        ``"prompt"`` keys.

    Raises:
        ValueError: If the question text has no option block or is missing
            one of the expected option markers.
    """
    question = item["question"]
    answer = item["answer"][0]

    stem, marker, rest = question.partition(_FIRST_OPTION_MARKER)
    if not marker:
        raise ValueError(f"no 'A.' option block found in: {question!r}")

    return {
        "question": stem.strip(),
        "choices": {
            "text": _split_options("A." + rest),
            "label": list(_LABEL_ORDER),
        },
        "answer": f"[ANSWER]{answer}[/ANSWER]",
        "prompt": _PROMPT,
    }


def transform_json(file_path):
    """Read a JSON file of question records and return the transformed list.

    Args:
        file_path: Path to a UTF-8 JSON file containing a list of records,
            each with ``"question"`` and ``"answer"`` entries.

    Returns:
        A list with one transformed dict per input record, in input order.

    Raises:
        ValueError: If a record's question text cannot be split into a stem
            and the four options A-D.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    return [_transform_item(item) for item in data]


if __name__ == "__main__":
    input_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/SciEval-valid-mat.json'
    output_path = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL/13-Scieval-val.json'

    transformed_data = transform_json(input_path)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(transformed_data, f, ensure_ascii=False, indent=2)