Files
wl-hydrophilic-polymer/code/merge_qa2jsonl.py
2025-05-08 11:32:28 +08:00

71 lines
3.0 KiB
Python

# import os
# import json
# import random
# # 设定目录路径
# directory_path = '/home/ubuntu/50T/fsy/wl/task1/task1-qa' # 替换为你的目录
# output_file_1 = '/home/ubuntu/50T/fsy/wl/task1/task1_train_dataset_new.jsonl' # 输出的第一个 jsonl 文件名
# output_file_2 = '/home/ubuntu/50T/fsy/wl/task1/task1_val_dataset_new.jsonl' # 输出的第二个 jsonl 文件名
# # 获取目录下所有 JSON 文件
# json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
# random.shuffle(json_files) # 打乱文件顺序
# # 随机选择 30 个文件
# num_random_files = 30
# if len(json_files) < num_random_files:
# num_random_files = len(json_files)
# random_files = json_files[:num_random_files]
# remaining_files = json_files[num_random_files:]
# # 保存到 jsonl 文件
# def save_to_jsonl(file_list, output_file):
# with open(output_file, 'w', encoding='utf-8') as f:
# for json_file in file_list:
# with open(os.path.join(directory_path, json_file), 'r', encoding='utf-8') as json_f:
# data = json.load(json_f)
# data_done = {"messages": [{"role": "system", "content": "You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers."}, {"role": "user", "content": data["design_question"]}, {"role": "assistant", "content": data["design_answer"]}]}
# # data_done = {"system":"You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers.", "conversation":[{"prompt":data["design_question"],"response":data["design_answer"]}]}
# f.write(json.dumps(data_done) + '\n')
# # 将文件保存到对应的 jsonl 文件
# save_to_jsonl(remaining_files, output_file_1)
# save_to_jsonl(random_files, output_file_2)
# print(f'已将 {len(remaining_files)} 个文件保存到 {output_file_1}')
# print(f'已将 {len(random_files)} 个文件保存到 {output_file_2}')
import json
import random
# Hard-coded locations of the two JSONL files to merge and of the merged result.
input_file_1 = '/home/ubuntu/50T/fsy/wl/task1_val_dataset_new.jsonl' # name of the first input JSONL file
input_file_2 = '/home/ubuntu/50T/fsy/wl/task2_val_dataset_new.jsonl' # name of the second input JSONL file
output_file = '/home/ubuntu/50T/fsy/wl/val_dataset.jsonl' # name of the JSONL file written after merging
def read_jsonl(file_name):
    """Read a JSON Lines file and return its records as a list.

    Args:
        file_name: Path of the UTF-8 encoded JSONL file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in the
        order the lines appear in the file. An empty file yields ``[]``.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_name, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (typically a trailing empty line at
        # the end of the file) carry no record, and json.loads would raise on
        # them, so they are skipped instead of aborting the whole read.
        return [json.loads(line) for line in f if line.strip()]
# Load both input files (first, then second) and concatenate their records.
data1, data2 = read_jsonl(input_file_1), read_jsonl(input_file_2)
merged_data = [*data1, *data2]

# Shuffle in place so records from the two sources are interleaved randomly.
random.shuffle(merged_data)

# Emit the shuffled records as JSON Lines: one serialized record per line.
with open(output_file, 'w', encoding='utf-8') as out:
    out.writelines(json.dumps(record) + '\n' for record in merged_data)

print(f'已将 {len(merged_data)} 条记录合并并打乱,保存到 {output_file}')