# import os # import json # import random # # 设定目录路径 # directory_path = '/home/ubuntu/50T/fsy/wl/task1/task1-qa' # 替换为你的目录 # output_file_1 = '/home/ubuntu/50T/fsy/wl/task1/task1_train_dataset_new.jsonl' # 输出的第一个 jsonl 文件名 # output_file_2 = '/home/ubuntu/50T/fsy/wl/task1/task1_val_dataset_new.jsonl' # 输出的第二个 jsonl 文件名 # # 获取目录下所有 JSON 文件 # json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')] # random.shuffle(json_files) # 打乱文件顺序 # # 随机选择 30 个文件 # num_random_files = 30 # if len(json_files) < num_random_files: # num_random_files = len(json_files) # random_files = json_files[:num_random_files] # remaining_files = json_files[num_random_files:] # # 保存到 jsonl 文件 # def save_to_jsonl(file_list, output_file): # with open(output_file, 'w', encoding='utf-8') as f: # for json_file in file_list: # with open(os.path.join(directory_path, json_file), 'r', encoding='utf-8') as json_f: # data = json.load(json_f) # data_done = {"messages": [{"role": "system", "content": "You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers."}, {"role": "user", "content": data["design_question"]}, {"role": "assistant", "content": data["design_answer"]}]} # # data_done = {"system":"You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers.", "conversation":[{"prompt":data["design_question"],"response":data["design_answer"]}]} # f.write(json.dumps(data_done) + '\n') # # 将文件保存到对应的 jsonl 文件 # save_to_jsonl(remaining_files, output_file_1) # save_to_jsonl(random_files, output_file_2) # print(f'已将 {len(remaining_files)} 个文件保存到 {output_file_1}') # print(f'已将 {len(random_files)} 个文件保存到 {output_file_2}') import json import random # 假设这是你的两个输入 JSONL 文件 input_file_1 = '/home/ubuntu/50T/fsy/wl/task1_val_dataset_new.jsonl' # 第一个 JSONL 文件名 input_file_2 = '/home/ubuntu/50T/fsy/wl/task2_val_dataset_new.jsonl' # 第二个 JSONL 文件名 output_file = '/home/ubuntu/50T/fsy/wl/val_dataset.jsonl' # 合并后输出的 JSONL 文件名 # 读取 JSONL 文件并存储到列表中 def read_jsonl(file_name): data = [] with open(file_name, 'r', encoding='utf-8') as f: for line in f: data.append(json.loads(line)) return data # 读取两个 JSONL 文件 data1 = read_jsonl(input_file_1) data2 = read_jsonl(input_file_2) # 合并数据 merged_data = data1 + data2 # 随机打乱合并后的数据 random.shuffle(merged_data) # 将打乱后的数据写入新的 JSONL 文件 with open(output_file, 'w', encoding='utf-8') as f: for entry in merged_data: f.write(json.dumps(entry) + '\n') print(f'已将 {len(merged_data)} 条记录合并并打乱,保存到 {output_file}')