59 lines
2.5 KiB
Python
59 lines
2.5 KiB
Python
import os
|
|
import json
|
|
import glob
|
|
|
|
def _is_qa_record(candidate):
    """Return True if *candidate* is a dict with 'question' and 'answer' keys."""
    return (
        isinstance(candidate, dict)
        and 'question' in candidate
        and 'answer' in candidate
    )


def merge_json_files(input_directory, output_file):
    """Merge question/answer records from the JSON files in a directory.

    Every ``*.json`` file directly inside ``input_directory`` is parsed. A file
    may hold either a list of records or a single record, where a record is a
    dict containing both a ``'question'`` and an ``'answer'`` key. Matching
    records are collected unchanged (any extra keys are kept) and written to
    ``output_file`` as a single JSON list.

    Problems are reported with ``print`` and never abort the merge: files that
    cannot be read or parsed are skipped, and so are list items that are not
    valid records.

    Args:
        input_directory: Directory searched (non-recursively) for ``*.json``
            files.
        output_file: Path of the merged JSON file. It is overwritten if it
            already exists. If it is located inside ``input_directory`` it is
            not read back as an input.

    Returns:
        None. The result is written to ``output_file``.
    """
    all_data = []

    # The output file may live in the input directory. Reading it back on a
    # re-run would feed the previous merge result into the new one and
    # duplicate every record, so it is excluded from the inputs.
    output_path = os.path.realpath(output_file)

    # glob returns matches in arbitrary order; sort so the merged output is
    # reproducible from run to run.
    json_files = sorted(
        path
        for path in glob.glob(os.path.join(input_directory, "*.json"))
        if os.path.realpath(path) != output_path
    )

    for json_file in json_files:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except json.JSONDecodeError:
            print(f"错误: 文件 {json_file} 不是有效的JSON格式")
            continue
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not stop the
            # whole merge.
            print(f"处理文件 {json_file} 时出错: {str(e)}")
            continue

        if isinstance(data, list):
            # A list of records: keep the valid ones, report the rest.
            for item in data:
                if _is_qa_record(item):
                    all_data.append(item)
                else:
                    print(f"警告: 跳过不符合格式的项 {item}")
        elif _is_qa_record(data):
            # A single record stored as the top-level object.
            all_data.append(data)
        else:
            print(f"警告: 无法处理文件 {json_file} 中的数据,格式不符合预期")
            # Nothing was taken from this file, so do not report success.
            continue

        print(f"成功处理: {json_file}")

    # Write the merged records to the output file.
    try:
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(all_data, f, indent=2, ensure_ascii=False)
        print(f"成功将 {len(json_files)} 个JSON文件中的 {len(all_data)} 个问答对合并到 {output_file}")
    except Exception as e:
        print(f"写入输出文件时出错: {str(e)}")
|
|
|
|
# Usage example: run with no arguments to use the default paths below, or pass
# an input directory and an output file path on the command line.
if __name__ == "__main__":
    # Imported here because argument parsing is only needed when this file is
    # executed as a script.
    import argparse

    # Defaults are the original hard-coded locations, so running the script
    # without arguments behaves as before.
    default_input_dir = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL-merge"
    default_output_file = "/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL-merge/merged.json"

    parser = argparse.ArgumentParser(
        description="Merge question/answer JSON files from a directory into one JSON file."
    )
    parser.add_argument(
        "input_dir",
        nargs="?",
        default=default_input_dir,
        help="directory containing the *.json files to merge",
    )
    parser.add_argument(
        "output_file",
        nargs="?",
        default=default_output_file,
        help="path of the merged JSON file to write",
    )
    args = parser.parse_args()

    input_dir = args.input_dir
    output_file = args.output_file

    merge_json_files(input_dir, output_file)