first commit code

2025-05-08 11:32:28 +08:00
commit bccb974250
9 changed files with 15155 additions and 0 deletions
--- a/code/merge_qa2jsonl.py
+++ b/code/merge_qa2jsonl.py
@@ -0,0 +1,71 @@
+# import os
+# import json
+# import random
+
+# # 设定目录路径
+# directory_path = '/home/ubuntu/50T/fsy/wl/task1/task1-qa'  # 替换为你的目录
+# output_file_1 = '/home/ubuntu/50T/fsy/wl/task1/task1_train_dataset_new.jsonl'    # 输出的第一个 jsonl 文件名
+# output_file_2 = '/home/ubuntu/50T/fsy/wl/task1/task1_val_dataset_new.jsonl'    # 输出的第二个 jsonl 文件名
+
+# # 获取目录下所有 JSON 文件
+# json_files = [f for f in os.listdir(directory_path) if f.endswith('.json')]
+# random.shuffle(json_files)  # 打乱文件顺序
+
+# # 随机选择 30 个文件
+# num_random_files = 30
+# if len(json_files) < num_random_files:
+#     num_random_files = len(json_files)
+
+# random_files = json_files[:num_random_files]
+# remaining_files = json_files[num_random_files:]
+
+# # 保存到 jsonl 文件
+# def save_to_jsonl(file_list, output_file):
+#     with open(output_file, 'w', encoding='utf-8') as f:
+#         for json_file in file_list:
+#             with open(os.path.join(directory_path, json_file), 'r', encoding='utf-8') as json_f:
+#                 data = json.load(json_f)
+#                 data_done = {"messages": [{"role": "system", "content": "You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers."}, {"role": "user", "content": data["design_question"]}, {"role": "assistant", "content": data["design_answer"]}]}
+#                 # data_done = {"system":"You are a seasoned professor in the field of materials science, with primary research focusing on the monomer synthesis of hydrophilic polymers.", "conversation":[{"prompt":data["design_question"],"response":data["design_answer"]}]}
+#                 f.write(json.dumps(data_done) + '\n')
+
+# # 将文件保存到对应的 jsonl 文件
+# save_to_jsonl(remaining_files, output_file_1)
+# save_to_jsonl(random_files, output_file_2)
+
+# print(f'已将 {len(remaining_files)} 个文件保存到 {output_file_1}')
+# print(f'已将 {len(random_files)} 个文件保存到 {output_file_2}')
+
+
+import json
+import random
+
+# 假设这是你的两个输入 JSONL 文件
+input_file_1 = '/home/ubuntu/50T/fsy/wl/task1_val_dataset_new.jsonl'  # 第一个 JSONL 文件名
+input_file_2 = '/home/ubuntu/50T/fsy/wl/task2_val_dataset_new.jsonl'  # 第二个 JSONL 文件名
+output_file = '/home/ubuntu/50T/fsy/wl/val_dataset.jsonl'    # 合并后输出的 JSONL 文件名
+
+# 读取 JSONL 文件并存储到列表中
+def read_jsonl(file_name):
+    data = []
+    with open(file_name, 'r', encoding='utf-8') as f:
+        for line in f:
+            data.append(json.loads(line))
+    return data
+
+# 读取两个 JSONL 文件
+data1 = read_jsonl(input_file_1)
+data2 = read_jsonl(input_file_2)
+
+# 合并数据
+merged_data = data1 + data2
+
+# 随机打乱合并后的数据
+random.shuffle(merged_data)
+
+# 将打乱后的数据写入新的 JSONL 文件
+with open(output_file, 'w', encoding='utf-8') as f:
+    for entry in merged_data:
+        f.write(json.dumps(entry) + '\n')
+
+print(f'已将 {len(merged_data)} 条记录合并并打乱，保存到 {output_file}')