Files
datapipe/clean/stp1_bib2sql.py
2025-01-18 17:09:51 +08:00

160 lines
5.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import os
import glob
import mysql.connector
import bibtexparser
import tqdm
# Name of the MySQL table that receives the imported records.
TABLE_NAME = 'phosphorus_synthesis_info'


def _confirm(prompt):
    """Show `prompt` and stop the script if the operator answers negatively.

    Pressing Enter (or typing anything that is not an explicit "no") continues,
    which matches the previous behaviour where the answer was ignored.

    Raises:
        SystemExit: when the answer is one of the recognised negative replies.
    """
    answer = input(prompt).strip().lower()
    if answer in ('n', 'no', '否', '不', '不是'):
        raise SystemExit('Aborted by user.')


_confirm('你确定TABLE_NAME是{}吗?'.format(TABLE_NAME))
# phosphorus_synthesis
# NOTE(review): the line above looks like an alternative TABLE_NAME value — confirm.
# Directory holding the .bib exports, located next to this script.
bibs_dir = os.path.join(os.path.dirname(__file__), 'synthesis23-25')
# Excel sheet with journal metrics, located next to this script.
if_file_path = os.path.join(os.path.dirname(__file__), '2023JCR.xlsx')
_confirm('你确定导入文件夹是{}吗?'.format(bibs_dir))
# MySQL connection setup
# Each setting defaults to the value this script has always used, but can be
# overridden through an environment variable so that real credentials do not
# have to live in source control.
connection = mysql.connector.connect(
    host=os.environ.get('DATAPIPE_DB_HOST', 'localhost'),
    user=os.environ.get('DATAPIPE_DB_USER', 'metadata_mat_papers'),
    password=os.environ.get('DATAPIPE_DB_PASSWORD', 'siat-mic'),
    database=os.environ.get('DATAPIPE_DB_NAME', 'metadata_mat_papers'),
)
# Module-level cursor shared by every helper function below.
cursor = connection.cursor()
# Function to check if a table exists
def check_table_exists(table_name):
    """Return True if `table_name` exists in the currently selected database.

    The name is sent as a bound parameter instead of being interpolated into
    the SQL text, so quotes or other special characters in it cannot change
    the statement.

    Args:
        table_name: Name of the table to look up.

    Returns:
        bool: True when information_schema.tables reports exactly one table
        with that name in the schema returned by DATABASE().
    """
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
        AND table_name = %s
        """,
        (table_name,),
    )
    return cursor.fetchone()[0] == 1
# Function to create the table if it doesn't exist
def create_table(table_name):
    """Create the paper-metadata table `table_name` when it is not present yet.

    The table is keyed on `doi` and holds the bibliographic fields, journal
    metrics, download/conversion markers, URLs, abstract, image URLs (JSON)
    and the full text content.

    Args:
        table_name: Name of the table to create.
    """
    # Nothing to do when the table is already there.
    if check_table_exists(table_name):
        return

    ddl = f"""
CREATE TABLE IF NOT EXISTS `{table_name}` (
doi VARCHAR(255) PRIMARY KEY,
unique_id VARCHAR(255),
author TEXT,
title TEXT,
journal VARCHAR(255),
year INT,
volume VARCHAR(50),
number VARCHAR(50),
pages VARCHAR(50),
month VARCHAR(50),
issn VARCHAR(50),
eissn VARCHAR(50),
researcher_id TEXT,
if2023 VARCHAR(50),
if5 VARCHAR(50),
journal_index VARCHAR(50),
jcr_quartile VARCHAR(50),
orcid TEXT,
early_access_date VARCHAR(50),
scihub_downlowded VARCHAR(50),
convert2md VARCHAR(50),
pdf_url TEXT,
md_url TEXT,
abstract TEXT,
image_url JSON,
text_content LONGTEXT
);
"""
    cursor.execute(ddl)
def record_exists(doi, table_name):
    """Tell whether `table_name` already contains a row with the given DOI.

    Args:
        doi: DOI value to look for; bound as a query parameter.
        table_name: Table to search.

    Returns:
        bool: True when at least one matching row is counted.
    """
    cursor.execute(
        f"SELECT COUNT(*) FROM `{table_name}` WHERE doi = %s",
        (doi,),
    )
    row = cursor.fetchone()
    return row[0] > 0
# Function to insert a record into the MySQL database
def insert_record(entry, table_name):
    """Insert one bibliography entry as a row of `table_name`.

    Args:
        entry: Mapping of field name to value for a single record. Keys are
            looked up exactly as listed in `field_map` below; any key that is
            absent is passed to the database as None.
        table_name: Table that receives the row.
    """
    # (table column, key in `entry`) pairs. Keeping both in one place ensures
    # the column list and the value tuple can never get out of step.
    field_map = (
        ('doi', 'doi'),
        ('unique_id', 'unique-id'),
        ('author', 'author'),
        ('title', 'title'),
        ('journal', 'journal'),
        ('year', 'year'),
        ('volume', 'volume'),
        ('number', 'number'),
        ('pages', 'pages'),
        ('month', 'month'),
        ('issn', 'issn'),
        ('eissn', 'eissn'),
        ('researcher_id', 'researcherid-numbers'),
        ('if2023', 'if2023'),
        ('if5', 'if5'),
        ('journal_index', 'journal_index'),
        ('jcr_quartile', 'jcr_quartile'),
        ('orcid', 'orcid-numbers'),
        ('early_access_date', 'earlyaccessdate'),
        ('scihub_downlowded', 'scihub_downlowded'),
        ('convert2md', 'convert2md'),
        ('pdf_url', 'pdf_url'),
        ('md_url', 'md_url'),
        ('abstract', 'abstract'),
        ('image_url', 'image_url'),
        ('text_content', 'text_content'),
    )

    # The ORCID key used to be read as 'ocrid-numbers' (a misspelling), which
    # left the column empty. Read the correctly spelled key, but still honour
    # the old one if a caller supplies it. Work on a copy so the caller's
    # dict is not modified.
    row = dict(entry)
    row.setdefault('orcid-numbers', entry.get('ocrid-numbers'))

    # Build the SQL statement
    column_list = ', '.join(column for column, _ in field_map)
    placeholders = ', '.join(['%s'] * len(field_map))
    query = f"""
INSERT INTO `{table_name}` ({column_list})
VALUES ({placeholders})
"""
    values = tuple(row.get(key) for _, key in field_map)
    cursor.execute(query, values)
# Open the Excel file with pandas
import pandas as pd


def _to_native(value):
    """Return a NumPy scalar as its plain Python equivalent; pass anything else through."""
    # NOTE(review): mysql.connector is reported to reject NumPy scalar types
    # such as numpy.float64 — confirm against the connector version in use.
    return value.item() if hasattr(value, 'item') else value


df = pd.read_excel(if_file_path)
# Replace every missing value (NaN) with None
df = df.replace({pd.NA: None})
# Lower-case the journal names once, instead of once per bibliography entry,
# for the case-insensitive match below.
journal_names_lower = df['JournalName'].str.lower()

try:
    # Create the table if it doesn't exist
    create_table(TABLE_NAME)
    bib_files = sorted(glob.glob(os.path.join(bibs_dir, '*.bib')))
    for bib_file in tqdm.tqdm(bib_files):
        # Read and parse the .bib file. The encoding is given explicitly so
        # the result does not depend on the platform's locale default.
        with open(bib_file, 'r', encoding='utf-8') as bibtex_file:
            bib_database = bibtexparser.load(bibtex_file)
        for entry in bib_database.entries:
            entry = {k.lower(): v for k, v in entry.items()}
            journal = entry.get('journal')
            if journal is not None:
                # Case-insensitive lookup of the journal in the metrics sheet
                matching_journal = df[journal_names_lower == journal.lower()]
                if not matching_journal.empty:
                    # Use the first matching row when several rows match.
                    entry['if2023'] = _to_native(matching_journal['IF2023'].values[0])
                    entry['if5'] = _to_native(matching_journal['IF5'].values[0])
                    entry['journal_index'] = _to_native(matching_journal['INDEX'].values[0])
                    entry['jcr_quartile'] = _to_native(matching_journal['Quartile'].values[0])
            doi = entry.get('doi')
            # Skip entries without a DOI before touching the database, then
            # insert only when no row with that DOI exists yet.
            if doi is not None and not record_exists(doi, TABLE_NAME):
                insert_record(entry, TABLE_NAME)
    # Commit the changes
    connection.commit()
finally:
    # Always release the cursor and the connection, even if the import failed.
    cursor.close()
    connection.close()
print("Data has been inserted into the database!")