Files
datapipe/clean/stp1_excel2sql.py
2025-01-18 17:09:51 +08:00

193 lines
6.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import mysql.connector
# Destination table for every record imported by this script.
TABLE_NAME = 'crispr_papers_info'
# Pause so the operator can eyeball the target table; the reply is discarded.
input('你确定TABLE_NAME是{}吗?'.format(TABLE_NAME))
# phosphorus_synthesis  (presumably a table name from an earlier run -- TODO confirm)

# Input locations are resolved relative to the directory holding this script.
_SCRIPT_DIR = os.path.dirname(__file__)
excels_dir = os.path.join(_SCRIPT_DIR, 'CRISPR/CRISPR_engineered')
if_file_path = os.path.join(_SCRIPT_DIR, 'CRISPR/2023JCR.xlsx')
# Second pause: confirm the folder the Excel exports are read from.
input('你确定导入文件夹是{}吗?'.format(excels_dir))

# MySQL connection setup
# NOTE(review): host and credentials are hard-coded in source; consider
# loading them from the environment or a config file kept out of the repo.
_DB_SETTINGS = {
    'host': '100.84.94.73',
    'user': 'metadata_mat_papers',
    'password': 'siat-mic',
    'database': 'metadata_mat_papers',
}
connection = mysql.connector.connect(**_DB_SETTINGS)
# Single module-level cursor shared by all helper functions below.
cursor = connection.cursor()
# Function to check if a table exists
def check_table_exists(table_name):
    """Return True if ``table_name`` exists in the currently selected database.

    The name is bound as a query parameter rather than interpolated into the
    SQL text, so quotes or other special characters in it cannot alter the
    statement.

    Args:
        table_name: Name of the table to look up.

    Returns:
        bool: True when exactly one matching table is listed in
        ``information_schema.tables`` for the current schema.
    """
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
        AND table_name = %s
        """,
        (table_name,),
    )
    return cursor.fetchone()[0] == 1
# Function to create the table if it doesn't exist
def create_table(table_name):
    """Create the paper-metadata table ``table_name`` unless it already exists.

    The existence check is done first via ``check_table_exists``; the DDL
    itself additionally carries ``IF NOT EXISTS``.

    Args:
        table_name: Name of the table to create; it is placed inside
            backticks in the DDL statement.
    """
    if check_table_exists(table_name):
        # Nothing to do: the table is already present.
        return

    # ``doi`` is the primary key, so each DOI can be stored at most once.
    ddl = f"""
    CREATE TABLE IF NOT EXISTS `{table_name}` (
        doi VARCHAR(255) PRIMARY KEY,
        unique_id VARCHAR(255),
        author TEXT,
        title TEXT,
        journal VARCHAR(255),
        year INT,
        volume VARCHAR(50),
        number VARCHAR(50),
        pages VARCHAR(50),
        month VARCHAR(50),
        issn VARCHAR(50),
        eissn VARCHAR(50),
        researcher_id TEXT,
        if2023 VARCHAR(50),
        if5 VARCHAR(50),
        journal_index VARCHAR(50),
        jcr_quartile VARCHAR(50),
        orcid TEXT,
        early_access_date VARCHAR(50),
        scihub_downlowded VARCHAR(50),
        convert2md VARCHAR(50),
        pdf_url TEXT,
        md_url TEXT,
        abstract TEXT,
        image_url JSON,
        en_text_content LONGTEXT,
        cited_reference_count INT,
        doi_link TEXT,
        research_areas TEXT,
        unique_wos_id VARCHAR(255)
    );
    """
    cursor.execute(ddl)
def record_exists(doi, table_name):
    """Tell whether a row with the given DOI is already stored.

    Args:
        doi: DOI value to look for; bound as a query parameter.
        table_name: Table to search; placed inside backticks in the query.

    Returns:
        bool: True if at least one row in ``table_name`` has this ``doi``.
    """
    cursor.execute(
        f"SELECT COUNT(*) FROM `{table_name}` WHERE doi = %s",
        (doi,),
    )
    row = cursor.fetchone()
    return row[0] > 0
# Function to insert a record into the MySQL database
def insert_record(entry, table_name):
    """Insert one paper record into ``table_name``.

    Every column is always listed in the INSERT; keys missing from ``entry``
    are sent as ``None`` (SQL NULL). The statement is executed on the
    module-level cursor and is not committed here.

    Args:
        entry: Mapping holding the record's fields. Most keys equal the
            column name; the exceptions are spelled out in the table below.
        table_name: Destination table; placed inside backticks in the query.
    """
    # (table column, key looked up in ``entry``) in insertion order.
    column_sources = (
        ('doi', 'doi'),
        ('unique_id', 'unique-id'),
        ('author', 'author'),
        ('title', 'title'),
        ('journal', 'journal'),
        ('year', 'year'),
        ('volume', 'volume'),
        ('number', 'number'),
        ('pages', 'pages'),
        ('month', 'month'),
        ('issn', 'issn'),
        ('eissn', 'eissn'),
        ('researcher_id', 'researcherid-numbers'),
        ('if2023', 'if2023'),
        ('if5', 'if5'),
        ('journal_index', 'journal_index'),
        ('jcr_quartile', 'jcr_quartile'),
        ('orcid', 'orcid-numbers'),
        ('early_access_date', 'earlyaccessdate'),
        ('scihub_downlowded', 'scihub_downlowded'),
        ('convert2md', 'convert2md'),
        ('pdf_url', 'pdf_url'),
        ('md_url', 'md_url'),
        ('abstract', 'abstract'),
        ('image_url', 'image_url'),
        # The table created by create_table() names this column
        # ``en_text_content``; inserting into ``text_content`` fails with an
        # unknown-column error on such a table.
        ('en_text_content', 'text_content'),
        ('cited_reference_count', 'cited_reference_count'),
        ('doi_link', 'doi_link'),
        ('research_areas', 'research_areas'),
        ('unique_wos_id', 'unique_wos_id'),
    )

    row = {column: entry.get(key) for column, key in column_sources}
    if row['orcid'] is None:
        # Earlier revisions read the misspelled key; keep honouring it so
        # callers that still populate it do not silently lose the value.
        row['orcid'] = entry.get('ocrid-numbers')

    # Build the parameterised INSERT statement.
    columns = list(row)
    placeholders = ', '.join(['%s'] * len(columns))
    query = f"""
    INSERT INTO `{table_name}` ({', '.join(columns)})
    VALUES ({placeholders})
    """
    cursor.execute(query, tuple(row.values()))
# Load the JCR journal-metrics spreadsheet with pandas.
import pandas as pd


def _clean(value):
    """Return None for a missing cell (None / NaN / NA), else the value itself."""
    return None if pd.isna(value) else value


def _as_text(value):
    """Return the cell as ``str``, or None when the cell is missing."""
    value = _clean(value)
    return None if value is None else str(value)


def _as_int(value):
    """Return the cell as ``int``, or None when the cell is missing."""
    value = _clean(value)
    return None if value is None else int(value)


try:
    jcr_df = pd.read_excel(if_file_path)

    # Index the journal metrics once by lower-cased name so every paper is a
    # dictionary lookup instead of a full DataFrame scan. ``setdefault`` keeps
    # the first row when a journal name appears more than once.
    journal_metrics = {}
    for jcr_row in jcr_df.to_dict('records'):
        journal_name = jcr_row.get('JournalName')
        if isinstance(journal_name, str):
            journal_metrics.setdefault(journal_name.lower(), jcr_row)

    # Create the table if it doesn't exist
    create_table(TABLE_NAME)

    # Only legacy ``.xls`` exports in the import folder are processed.
    excel_paths = [
        os.path.join(excels_dir, file_name)
        for file_name in os.listdir(excels_dir)  # os.listdir('溶剂热文献-230505-swx-V3')
        if file_name.endswith('.xls')
    ]

    for file_path in excel_paths:
        print(file_path)
        # Read the Excel export.
        excel_df = pd.read_excel(file_path)

        for record in excel_df.to_dict('records'):
            # Missing cells become None (SQL NULL) instead of the strings
            # 'None' / 'nan', and missing counts no longer crash int().
            entry = {
                'doi': _as_text(record['DOI']),
                'title': _as_text(record['Article Title']),
                'journal': _as_text(record['Source Title']),
                'abstract': _as_text(record['Abstract']),
                'cited_reference_count': _as_int(record['Cited Reference Count']),
                'year': _as_int(record['Publication Year']),
                'doi_link': _as_text(record['DOI Link']),
                'research_areas': _as_text(record['Research Areas']),
                'unique_wos_id': _as_text(record['UT (Unique WOS ID)']),
            }

            journal = entry['journal']
            if journal is not None:
                # Case-insensitive match of the journal name against the JCR sheet.
                metrics = journal_metrics.get(journal.lower())
                if metrics is not None:
                    entry['if2023'] = _clean(metrics['IF2023'])
                    entry['if5'] = _clean(metrics['IF5'])
                    entry['journal_index'] = _clean(metrics['INDEX'])
                    entry['jcr_quartile'] = _clean(metrics['Quartile'])

            doi = entry['doi']
            # The DOI must be present; only then ask the database whether the
            # record is already stored.
            if doi is not None and not record_exists(doi, TABLE_NAME):
                insert_record(entry, TABLE_NAME)

    # Commit the changes only after every file was processed successfully.
    connection.commit()
finally:
    # Always release the cursor and connection, even when a row failed.
    cursor.close()
    connection.close()
print("Data has been inserted into the database!")