第一次合并clean代码

This commit is contained in:
2025-01-18 17:09:51 +08:00
parent e33a8b069e
commit a0f5ca9a35
21 changed files with 2252 additions and 375 deletions

193
clean/stp1_excel2sql.py Normal file
View File

@@ -0,0 +1,193 @@
import os
import mysql.connector
# Target table for this import run; the operator confirms it interactively
# below before anything touches the database.
TABLE_NAME = 'crispr_papers_info'
input('你确定TABLE_NAME是{}吗?'.format(TABLE_NAME))
# phosphorus_synthesis
# Folder holding the exported .xls files, and the journal-metrics workbook
# used to enrich each paper; both are resolved relative to this script.
excels_dir = os.path.join(os.path.dirname(__file__), 'CRISPR/CRISPR_engineered')
if_file_path = os.path.join(os.path.dirname(__file__), 'CRISPR/2023JCR.xlsx')
input('你确定导入文件夹是{}吗?'.format(excels_dir))
# MySQL connection setup.
# Each setting can be overridden through an environment variable so the script
# can be pointed at another server without editing the source; the literals are
# only the historical defaults.
# NOTE(review): the default password is committed to version control -- rotate
# it and drop the literal once deployments set PAPERS_DB_PASSWORD.
connection = mysql.connector.connect(
    host=os.environ.get('PAPERS_DB_HOST', '100.84.94.73'),
    user=os.environ.get('PAPERS_DB_USER', 'metadata_mat_papers'),
    password=os.environ.get('PAPERS_DB_PASSWORD', 'siat-mic'),
    database=os.environ.get('PAPERS_DB_NAME', 'metadata_mat_papers'),
)
cursor = connection.cursor()
# Function to check if a table exists
def check_table_exists(table_name):
    """Return True when *table_name* exists in the currently selected database.

    The name is sent as a bound parameter instead of being pasted into the
    SQL text, so a quote character in the name cannot alter the statement.

    Args:
        table_name: Name of the table to look up.

    Returns:
        bool: True if ``information_schema.tables`` lists exactly one table
        with that name in the current schema.
    """
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
        AND table_name = %s
        """,
        (table_name,),
    )
    return cursor.fetchone()[0] == 1
# Create the paper-metadata table on first use.
def create_table(table_name):
    """Create the table *table_name* unless it is already present.

    The table is keyed by ``doi`` and holds bibliographic fields, journal
    metrics, download/conversion status columns and the extracted text.

    Args:
        table_name: Name of the table to create; it is placed into the
            statement as a back-quoted identifier.
    """
    # Nothing to do when the table is already there.
    if check_table_exists(table_name):
        return

    ddl = f"""
        CREATE TABLE IF NOT EXISTS `{table_name}` (
            doi VARCHAR(255) PRIMARY KEY,
            unique_id VARCHAR(255),
            author TEXT,
            title TEXT,
            journal VARCHAR(255),
            year INT,
            volume VARCHAR(50),
            number VARCHAR(50),
            pages VARCHAR(50),
            month VARCHAR(50),
            issn VARCHAR(50),
            eissn VARCHAR(50),
            researcher_id TEXT,
            if2023 VARCHAR(50),
            if5 VARCHAR(50),
            journal_index VARCHAR(50),
            jcr_quartile VARCHAR(50),
            orcid TEXT,
            early_access_date VARCHAR(50),
            scihub_downlowded VARCHAR(50),
            convert2md VARCHAR(50),
            pdf_url TEXT,
            md_url TEXT,
            abstract TEXT,
            image_url JSON,
            en_text_content LONGTEXT,
            cited_reference_count INT,
            doi_link TEXT,
            research_areas TEXT,
            unique_wos_id VARCHAR(255)
        );
    """
    cursor.execute(ddl)
def record_exists(doi, table_name):
    """Tell whether *table_name* already holds a row whose ``doi`` equals *doi*.

    Args:
        doi: DOI value to look for; sent as a bound parameter.
        table_name: Table to search; used as a back-quoted identifier.

    Returns:
        bool: True when at least one matching row exists.
    """
    cursor.execute(
        f"SELECT COUNT(*) FROM `{table_name}` WHERE doi = %s",
        (doi,),
    )
    row = cursor.fetchone()
    return row[0] > 0
# Function to insert a record into the MySQL database
def insert_record(entry, table_name):
    """Insert one paper described by *entry* as a new row of *table_name*.

    Args:
        entry: Dict of paper fields. Any field that is absent is stored as
            NULL.
        table_name: Target table; placed into the statement as a back-quoted
            identifier, so it must come from trusted configuration.

    The statement is executed on the module-level cursor; committing is left
    to the caller.
    """
    # One (table column, entry key) pair per column. Keeping both sides in a
    # single sequence guarantees the column list and the value list can never
    # get out of step.
    column_sources = (
        ('doi', 'doi'),
        ('unique_id', 'unique-id'),
        ('author', 'author'),
        ('title', 'title'),
        ('journal', 'journal'),
        ('year', 'year'),
        ('volume', 'volume'),
        ('number', 'number'),
        ('pages', 'pages'),
        ('month', 'month'),
        ('issn', 'issn'),
        ('eissn', 'eissn'),
        ('researcher_id', 'researcherid-numbers'),
        ('if2023', 'if2023'),
        ('if5', 'if5'),
        ('journal_index', 'journal_index'),
        ('jcr_quartile', 'jcr_quartile'),
        ('orcid', 'orcid-numbers'),
        ('early_access_date', 'earlyaccessdate'),
        ('scihub_downlowded', 'scihub_downlowded'),
        ('convert2md', 'convert2md'),
        ('pdf_url', 'pdf_url'),
        ('md_url', 'md_url'),
        ('abstract', 'abstract'),
        ('image_url', 'image_url'),
        # The schema created by create_table() names this column
        # en_text_content; inserting into "text_content" fails with an
        # unknown-column error. The entry key is unchanged.
        ('en_text_content', 'text_content'),
        ('cited_reference_count', 'cited_reference_count'),
        ('doi_link', 'doi_link'),
        ('research_areas', 'research_areas'),
        ('unique_wos_id', 'unique_wos_id'),
    )

    fields = dict(entry)
    # The ORCID list was previously read from the misspelled key
    # 'ocrid-numbers'. Presumably the real key is 'orcid-numbers' (matching
    # 'researcherid-numbers') -- TODO confirm against the entry producer.
    # The old spelling is still honoured as a fallback.
    fields.setdefault('orcid-numbers', fields.get('ocrid-numbers'))

    columns = [column for column, _ in column_sources]
    placeholders = ', '.join(['%s'] * len(columns))
    query = f"""
        INSERT INTO `{table_name}` ({', '.join(columns)})
        VALUES ({placeholders})
    """
    values = tuple(fields.get(key) for _, key in column_sources)
    cursor.execute(query, values)
# Open the Excel workbooks with pandas.
import pandas as pd


def _cell_to_str(value):
    """Return a spreadsheet cell as ``str``, or ``None`` when the cell is empty.

    A bare ``str()`` turns a missing cell into the text ``'None'`` or
    ``'nan'``, which would be stored as real data and would defeat the
    later ``is None`` checks on the DOI and journal.
    """
    if pd.isna(value):
        return None
    return str(value)


def _cell_to_int(value):
    """Return a spreadsheet cell as ``int``, or ``None`` when the cell is empty.

    ``int()`` raises on a missing value, which would abort the whole import
    on the first row that lacks, e.g., a publication year.
    """
    if pd.isna(value):
        return None
    return int(value)


def _to_native(value):
    """Return *value* as a built-in Python object, or ``None`` when missing.

    Items taken from ``.values`` can be NumPy scalars, which the database
    driver may not know how to convert; ``.item()`` unwraps them.
    """
    if pd.isna(value):
        return None
    return value.item() if hasattr(value, 'item') else value


try:
    # Journal-metrics workbook used to enrich every paper.
    df = pd.read_excel(if_file_path)
    # Replace all missing values with None.
    df = df.replace({pd.NA: None})
    # Lower-case the journal names once instead of once per imported row.
    jcr_names_lower = df['JournalName'].str.lower()

    # Create the table if it doesn't exist
    create_table(TABLE_NAME)

    excel_names = [name for name in os.listdir(excels_dir) if name.endswith('.xls')]
    for excel_name in excel_names:
        # Path of the workbook being imported.
        file_path = os.path.join(excels_dir, excel_name)
        print(file_path)
        # Read the workbook and replace all missing values with None.
        excel_df = pd.read_excel(file_path)
        excel_df = excel_df.replace({pd.NA: None})

        for _, row in excel_df.iterrows():
            entry = {
                'doi': _cell_to_str(row['DOI']),
                'title': _cell_to_str(row['Article Title']),
                'journal': _cell_to_str(row['Source Title']),
                'abstract': _cell_to_str(row['Abstract']),
                'cited_reference_count': _cell_to_int(row['Cited Reference Count']),
                'year': _cell_to_int(row['Publication Year']),
                'doi_link': _cell_to_str(row['DOI Link']),
                'research_areas': _cell_to_str(row['Research Areas']),
                'unique_wos_id': _cell_to_str(row['UT (Unique WOS ID)']),
            }

            # The DOI is the primary key: skip rows without one before
            # doing any lookup or database round-trip.
            doi = entry['doi']
            if doi is None:
                continue

            journal = entry['journal']
            if journal is not None:
                # Case-insensitive match of the journal in the metrics table.
                matching_journal = df[jcr_names_lower == journal.lower()]
                if not matching_journal.empty:
                    entry['if2023'] = _to_native(matching_journal['IF2023'].values[0])
                    entry['if5'] = _to_native(matching_journal['IF5'].values[0])
                    entry['journal_index'] = _to_native(matching_journal['INDEX'].values[0])
                    entry['jcr_quartile'] = _to_native(matching_journal['Quartile'].values[0])

            # Insert only papers that are not stored yet.
            if not record_exists(doi, TABLE_NAME):
                insert_record(entry, TABLE_NAME)

    # Commit the changes once every workbook has been processed.
    connection.commit()
    print("Data has been inserted into the database!")
finally:
    # Always release the cursor and connection, even when the import fails.
    cursor.close()
    connection.close()