Files
MatBench/layer1/ALL-merge/merge.py
2025-05-28 10:55:34 +08:00

59 lines
2.5 KiB
Python

import os
import json
import glob
def _is_qa_record(item):
    """Return True if *item* is a dict holding both 'question' and 'answer'."""
    return isinstance(item, dict) and 'question' in item and 'answer' in item


def merge_json_files(input_directory, output_file):
    """Merge the question/answer records of every ``*.json`` file in a directory.

    Each input file may contain either a single record (a dict with both a
    ``'question'`` and an ``'answer'`` key) or a list of such records.
    Records are copied through unchanged, including any extra keys. List
    items that are not valid records are skipped with a warning, and files
    that cannot be read or parsed are reported and skipped.

    Args:
        input_directory: Directory that is searched (non-recursively) for
            ``*.json`` files.
        output_file: Path of the JSON file that receives the merged list.
            If it lives inside ``input_directory`` it is excluded from the
            inputs, so re-running the merge does not duplicate records.

    Returns:
        None. Progress, warnings and errors are printed to stdout; a failure
        to write the output file is reported rather than raised.
    """
    # glob returns files in arbitrary order; sort so the merged output is
    # reproducible from run to run.
    candidates = sorted(glob.glob(os.path.join(input_directory, "*.json")))

    # A previous run may have left the output file inside the input
    # directory; merging it again would duplicate every record.
    output_path = os.path.realpath(output_file)
    json_files = [
        path for path in candidates
        if os.path.realpath(path) != output_path
    ]

    all_data = []
    merged_file_count = 0
    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"错误: 文件 {json_file} 不是有效的JSON格式")
            continue
        except Exception as e:
            # Best-effort merge: one unreadable file must not abort the rest.
            print(f"处理文件 {json_file} 时出错: {str(e)}")
            continue

        if isinstance(data, list):
            # Keep only the items that look like question/answer records.
            for item in data:
                if _is_qa_record(item):
                    all_data.append(item)
                else:
                    print(f"警告: 跳过不符合格式的项 {item}")
        elif _is_qa_record(data):
            all_data.append(data)
        else:
            # Unsupported top-level shape: warn and do not report success.
            print(f"警告: 无法处理文件 {json_file} 中的数据,格式不符合预期")
            continue

        merged_file_count += 1
        print(f"成功处理: {json_file}")

    # Write the merged records to the output file.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2, ensure_ascii=False)
        print(f"成功将 {merged_file_count} 个JSON文件中的 {len(all_data)} 个问答对合并到 {output_file}")
    except Exception as e:
        print(f"写入输出文件时出错: {str(e)}")
# Command-line entry point / usage example.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(
        description="Merge the question/answer JSON files of a directory "
                    "into a single JSON file."
    )
    # Both arguments are optional; with no arguments the script uses the
    # same paths it was originally hard-coded with.
    parser.add_argument(
        "input_dir",
        nargs="?",
        default="/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL-merge",
        help="directory containing the *.json files to merge",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default=None,
        help="path of the merged JSON file "
             "(default: <input_dir>/merged.json)",
    )
    args = parser.parse_args()

    input_dir = args.input_dir
    output_file = args.output_file or os.path.join(input_dir, "merged.json")
    merge_json_files(input_dir, output_file)