First merge of the clean code
160 clean/stp1_bib2sql.py Normal file
@@ -0,0 +1,160 @@
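"""stp1_bib2sql: import BibTeX records into MySQL.

Summary (derived from the code below): read every .bib file in the
synthesis23-25 folder, look up each journal in 2023JCR.xlsx to attach impact
factor and JCR quartile information, and insert the entries into the
`phosphorus_synthesis_info` table, skipping records that have no DOI or that
are already present.
"""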
import os
import glob
import mysql.connector
import bibtexparser
import tqdm
import pandas as pd


TABLE_NAME = 'phosphorus_synthesis_info'
input('Are you sure TABLE_NAME is {}?'.format(TABLE_NAME))

# phosphorus_synthesis
bibs_dir = os.path.join(os.path.dirname(__file__), 'synthesis23-25')
if_file_path = os.path.join(os.path.dirname(__file__), '2023JCR.xlsx')
input('Are you sure the import folder is {}?'.format(bibs_dir))

# MySQL connection setup
connection = mysql.connector.connect(
    host='localhost',
    user='metadata_mat_papers',
    password='siat-mic',
    database='metadata_mat_papers'
)
cursor = connection.cursor()


# Function to check if a table exists
def check_table_exists(table_name):
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
          AND table_name = %s
        """,
        (table_name,)
    )
    return cursor.fetchone()[0] == 1

# Function to create the table if it doesn't exist
def create_table(table_name):
    if not check_table_exists(table_name):
        query = f"""
        CREATE TABLE IF NOT EXISTS `{table_name}` (
            doi VARCHAR(255) PRIMARY KEY,
            unique_id VARCHAR(255),
            author TEXT,
            title TEXT,
            journal VARCHAR(255),
            year INT,
            volume VARCHAR(50),
            number VARCHAR(50),
            pages VARCHAR(50),
            month VARCHAR(50),
            issn VARCHAR(50),
            eissn VARCHAR(50),
            researcher_id TEXT,
            if2023 VARCHAR(50),
            if5 VARCHAR(50),
            journal_index VARCHAR(50),
            jcr_quartile VARCHAR(50),
            orcid TEXT,
            early_access_date VARCHAR(50),
            scihub_downlowded VARCHAR(50),
            convert2md VARCHAR(50),
            pdf_url TEXT,
            md_url TEXT,
            abstract TEXT,
            image_url JSON,
            text_content LONGTEXT
        );
        """
        cursor.execute(query)

def record_exists(doi, table_name):
    query = f"SELECT COUNT(*) FROM `{table_name}` WHERE doi = %s"
    cursor.execute(query, (doi,))
    count = cursor.fetchone()[0]
    return count > 0

# Function to insert a record into the MySQL database
def insert_record(entry, table_name):
    # Column names, in insertion order
    columns = [
        'doi', 'unique_id', 'author', 'title', 'journal', 'year', 'volume',
        'number', 'pages', 'month', 'issn', 'eissn', 'researcher_id', 'if2023', 'if5', 'journal_index', 'jcr_quartile',
        'orcid', 'early_access_date', 'scihub_downlowded', 'convert2md', 'pdf_url', 'md_url', 'abstract', 'image_url', 'text_content'
    ]

    # Build the INSERT statement
    placeholders = ', '.join(['%s'] * len(columns))
    query = f"""
        INSERT INTO `{table_name}` ({', '.join(columns)})
        VALUES ({placeholders})
    """

    values = (
        entry.get('doi'),
        entry.get('unique-id'),
        entry.get('author'),
        entry.get('title'),
        entry.get('journal'),
        entry.get('year'),
        entry.get('volume'),
        entry.get('number', None),
        entry.get('pages', None),
        entry.get('month', None),
        entry.get('issn', None),
        entry.get('eissn', None),
        entry.get('researcherid-numbers', None),
        entry.get('if2023', None),
        entry.get('if5', None),
        entry.get('journal_index', None),
        entry.get('jcr_quartile', None),
        entry.get('orcid-numbers', None),  # ORCID numbers field from the BibTeX export
        entry.get('earlyaccessdate', None),
        entry.get('scihub_downlowded', None),
        entry.get('convert2md', None),
        entry.get('pdf_url', None),
        entry.get('md_url', None),
        entry.get('abstract', None),
        entry.get('image_url', None),
        entry.get('text_content', None)
    )
    cursor.execute(query, values)
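

# The JCR spreadsheet loaded below is assumed to contain at least the columns
# 'JournalName', 'IF2023', 'IF5', 'INDEX' and 'Quartile'; the journal-matching
# step further down relies on these exact headers.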
# Open the Excel file with pandas
df = pd.read_excel(if_file_path)
# Replace all NaN/NA values with None so they become SQL NULLs on insert
df = df.astype(object).where(pd.notnull(df), None)

# Create the table if it doesn't exist
create_table(TABLE_NAME)

bib_files = sorted(glob.glob(os.path.join(bibs_dir, '*.bib')))
for bib_file in tqdm.tqdm(bib_files):
    # Read and parse the .bib file
    with open(bib_file, 'r') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    for entry in bib_database.entries:
        # Lower-case all field names so lookups are case-insensitive
        entry = {k.lower(): v for k, v in entry.items()}
        journal = entry.get('journal')
        if journal is not None:
            journal_lower = journal.lower()  # lower-case the journal name for case-insensitive matching
            matching_journal = df[df['JournalName'].str.lower() == journal_lower]  # look up the journal in the DataFrame
            if not matching_journal.empty:
                entry['if2023'] = matching_journal['IF2023'].values[0]
                entry['if5'] = matching_journal['IF5'].values[0]
                entry['journal_index'] = matching_journal['INDEX'].values[0]
                entry['jcr_quartile'] = matching_journal['Quartile'].values[0]

        doi = entry.get('doi')
        # Insert only when the DOI is present and the record does not already exist
        if doi is not None and not record_exists(doi, TABLE_NAME):
            insert_record(entry, TABLE_NAME)

# Commit the changes and close the connection
connection.commit()
cursor.close()
connection.close()
print("Data has been inserted into the database!")