layer2 commit
This commit is contained in:
46
layer2/PGEE/code/step0_xlsx2json.py
Normal file
46
layer2/PGEE/code/step0_xlsx2json.py
Normal file
@@ -0,0 +1,46 @@
|
||||
"""
|
||||
0. 将问题从xls提取为json
|
||||
1. 将问题进行拆分
|
||||
2. 翻译成英文
|
||||
3. 去重
|
||||
4. 使用大模型进行难度评估和筛选
|
||||
"""
|
||||
import pandas as pd
|
||||
import json
|
||||
import os
|
||||
|
||||
def process_excel_files(directory):
    """Collect question/answer pairs from every .xlsx file in a directory.

    Each workbook is read with ``pandas.read_excel``. A workbook that has
    both a ``Question`` and an ``Answer`` column contributes one record per
    row; a workbook missing either column is reported and skipped. All
    records are written to ``qa_data.json`` inside ``directory`` as a JSON
    list of ``{"question": ..., "answer": ...}`` objects, and a short
    summary is printed.

    Rows whose ``Question`` cell is empty are dropped, and an empty
    ``Answer`` cell is stored as an empty string.

    Args:
        directory: Path of the folder that holds the Excel files and that
            receives the generated ``qa_data.json``.
    """
    all_data = []

    # Collect every .xlsx file in the directory.
    # - sorted(): os.listdir returns names in arbitrary order, so sort to
    #   keep the output reproducible between runs.
    # - lower(): accept ".XLSX" as well as ".xlsx".
    # - "~$" prefix: lock files Excel creates next to an open workbook;
    #   they are not real workbooks and would make read_excel fail.
    excel_files = sorted(
        name for name in os.listdir(directory)
        if name.lower().endswith('.xlsx') and not name.startswith('~$')
    )

    for excel_file in excel_files:
        file_path = os.path.join(directory, excel_file)
        df = pd.read_excel(file_path)

        if 'Question' not in df.columns or 'Answer' not in df.columns:
            # The column check is case-sensitive, so name the columns exactly.
            print(f"警告: {excel_file} 缺少必要的列 (Question/Answer)")
            continue

        # Turn every row into a dict and add it to the result list.
        for question, answer in zip(df['Question'], df['Answer']):
            # Empty cells are read as NaN; str(NaN) would store the literal
            # text "nan" instead of marking the value as missing.
            if pd.isna(question):
                continue
            all_data.append({
                'question': str(question).strip(),
                'answer': '' if pd.isna(answer) else str(answer).strip(),
            })

    # Save the collected records as a JSON file.
    output_file = os.path.join(directory, 'qa_data.json')
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(all_data, f, ensure_ascii=False, indent=2)

    print(f"处理完成!共处理了 {len(all_data)} 条数据")
    print(f"数据已保存到: {output_file}")
|
||||
|
||||
if __name__ == '__main__':
    # The Excel files are expected to sit in the same folder as this script,
    # so resolve the script's absolute path and keep only its directory part.
    directory = os.path.split(os.path.abspath(__file__))[0]
    process_excel_files(directory)
|
||||
Reference in New Issue
Block a user