layer2 commit

2025-05-28 11:00:24 +08:00
parent 6a6b09ae20
commit 9f5318c23d
66 changed files with 286574 additions and 0 deletions
--- a/layer2/PGEE/code/step0_xlsx2json.py
+++ b/layer2/PGEE/code/step0_xlsx2json.py
@@ -0,0 +1,46 @@
+"""
+0. 将问题从xls提取为json
+1. 将问题进行拆分
+2. 翻译成英文
+3. 去重
+4. 使用大模型进行难度评估和筛选
+"""
+import pandas as pd
+import json
+import os
+
+def process_excel_files(directory):
+    all_data = []
+    
+    # 获取目录下所有xlsx文件
+    excel_files = [f for f in os.listdir(directory) if f.endswith('.xlsx')]
+    
+    for excel_file in excel_files:
+           
+        file_path = os.path.join(directory, excel_file)
+
+        df = pd.read_excel(file_path)
+        
+        if 'Question' in df.columns and 'Answer' in df.columns:
+            # 将每行转换为字典并添加到列表中
+            for _, row in df.iterrows():
+                data_item = {
+                    'question': str(row['Question']).strip(),
+                    'answer': str(row['Answer']).strip()
+                }
+                all_data.append(data_item)
+        else:
+            print(f"警告: {excel_file} 缺少必要的列 (question/answer)")
+
+    # 将数据保存为JSON文件
+    output_file = os.path.join(directory, 'qa_data.json')
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(all_data, f, ensure_ascii=False, indent=2)
+    
+    print(f"处理完成！共处理了 {len(all_data)} 条数据")
+    print(f"数据已保存到: {output_file}")
+
+if __name__ == '__main__':
+    # 指定Excel文件所在的目录
+    directory = os.path.dirname(os.path.abspath(__file__))
+    process_excel_files(directory)