First merge of the clean code
160 clean/stp1_bib2sql.py Normal file
@@ -0,0 +1,160 @@
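"""stp1_bib2sql: import BibTeX records into MySQL.

Summary (derived from the code below): read every .bib file in the
synthesis23-25 folder, look up each journal in 2023JCR.xlsx to attach impact
factor and JCR quartile information, and insert the entries into the
`phosphorus_synthesis_info` table, skipping records that have no DOI or that
are already present.
"""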
import os
import glob
import mysql.connector
import bibtexparser
import tqdm
import pandas as pd


TABLE_NAME = 'phosphorus_synthesis_info'
input('Are you sure TABLE_NAME is {}?'.format(TABLE_NAME))

# phosphorus_synthesis
bibs_dir = os.path.join(os.path.dirname(__file__), 'synthesis23-25')
if_file_path = os.path.join(os.path.dirname(__file__), '2023JCR.xlsx')
input('Are you sure the import folder is {}?'.format(bibs_dir))

# MySQL connection setup
connection = mysql.connector.connect(
    host='localhost',
    user='metadata_mat_papers',
    password='siat-mic',
    database='metadata_mat_papers'
)
cursor = connection.cursor()


# Function to check if a table exists
def check_table_exists(table_name):
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE()
          AND table_name = %s
        """,
        (table_name,)
    )
    return cursor.fetchone()[0] == 1

# Function to create the table if it doesn't exist
def create_table(table_name):
    if not check_table_exists(table_name):
        query = f"""
        CREATE TABLE IF NOT EXISTS `{table_name}` (
            doi VARCHAR(255) PRIMARY KEY,
            unique_id VARCHAR(255),
            author TEXT,
            title TEXT,
            journal VARCHAR(255),
            year INT,
            volume VARCHAR(50),
            number VARCHAR(50),
            pages VARCHAR(50),
            month VARCHAR(50),
            issn VARCHAR(50),
            eissn VARCHAR(50),
            researcher_id TEXT,
            if2023 VARCHAR(50),
            if5 VARCHAR(50),
            journal_index VARCHAR(50),
            jcr_quartile VARCHAR(50),
            orcid TEXT,
            early_access_date VARCHAR(50),
            scihub_downlowded VARCHAR(50),
            convert2md VARCHAR(50),
            pdf_url TEXT,
            md_url TEXT,
            abstract TEXT,
            image_url JSON,
            text_content LONGTEXT
        );
        """
        cursor.execute(query)

def record_exists(doi, table_name):
    query = f"SELECT COUNT(*) FROM `{table_name}` WHERE doi = %s"
    cursor.execute(query, (doi,))
    count = cursor.fetchone()[0]
    return count > 0

# Function to insert a record into the MySQL database
def insert_record(entry, table_name):
    # Column names, in insertion order
    columns = [
        'doi', 'unique_id', 'author', 'title', 'journal', 'year', 'volume',
        'number', 'pages', 'month', 'issn', 'eissn', 'researcher_id', 'if2023', 'if5', 'journal_index', 'jcr_quartile',
        'orcid', 'early_access_date', 'scihub_downlowded', 'convert2md', 'pdf_url', 'md_url', 'abstract', 'image_url', 'text_content'
    ]

    # Build the INSERT statement
    placeholders = ', '.join(['%s'] * len(columns))
    query = f"""
        INSERT INTO `{table_name}` ({', '.join(columns)})
        VALUES ({placeholders})
    """

    values = (
        entry.get('doi'),
        entry.get('unique-id'),
        entry.get('author'),
        entry.get('title'),
        entry.get('journal'),
        entry.get('year'),
        entry.get('volume'),
        entry.get('number', None),
        entry.get('pages', None),
        entry.get('month', None),
        entry.get('issn', None),
        entry.get('eissn', None),
        entry.get('researcherid-numbers', None),
        entry.get('if2023', None),
        entry.get('if5', None),
        entry.get('journal_index', None),
        entry.get('jcr_quartile', None),
        entry.get('orcid-numbers', None),  # ORCID numbers field from the BibTeX export
        entry.get('earlyaccessdate', None),
        entry.get('scihub_downlowded', None),
        entry.get('convert2md', None),
        entry.get('pdf_url', None),
        entry.get('md_url', None),
        entry.get('abstract', None),
        entry.get('image_url', None),
        entry.get('text_content', None)
    )
    cursor.execute(query, values)
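

# The JCR spreadsheet loaded below is assumed to contain at least the columns
# 'JournalName', 'IF2023', 'IF5', 'INDEX' and 'Quartile'; the journal-matching
# step further down relies on these exact headers.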
# Open the Excel file with pandas
df = pd.read_excel(if_file_path)
# Replace all NaN/NA values with None so they become SQL NULLs on insert
df = df.astype(object).where(pd.notnull(df), None)

# Create the table if it doesn't exist
create_table(TABLE_NAME)

bib_files = sorted(glob.glob(os.path.join(bibs_dir, '*.bib')))
for bib_file in tqdm.tqdm(bib_files):
    # Read and parse the .bib file
    with open(bib_file, 'r') as bibtex_file:
        bib_database = bibtexparser.load(bibtex_file)
    for entry in bib_database.entries:
        # Lower-case all field names so lookups are case-insensitive
        entry = {k.lower(): v for k, v in entry.items()}
        journal = entry.get('journal')
        if journal is not None:
            journal_lower = journal.lower()  # lower-case the journal name for case-insensitive matching
            matching_journal = df[df['JournalName'].str.lower() == journal_lower]  # look up the journal in the DataFrame
            if not matching_journal.empty:
                entry['if2023'] = matching_journal['IF2023'].values[0]
                entry['if5'] = matching_journal['IF5'].values[0]
                entry['journal_index'] = matching_journal['INDEX'].values[0]
                entry['jcr_quartile'] = matching_journal['Quartile'].values[0]

        doi = entry.get('doi')
        # Insert only when the DOI is present and the record does not already exist
        if doi is not None and not record_exists(doi, TABLE_NAME):
            insert_record(entry, TABLE_NAME)

# Commit the changes and close the connection
connection.commit()
cursor.close()
connection.close()
print("Data has been inserted into the database!")