"""Import paper metadata from Excel exports into a MySQL table.

The script reads every ``.xls`` file in ``excels_dir``, enriches each row with
journal metrics looked up (case-insensitively, by journal name) in the JCR
workbook at ``if_file_path``, and inserts the rows whose DOI is not yet present
in ``TABLE_NAME``.  All inserts are committed once, at the end of the run.
"""
import os

import mysql.connector
import pandas as pd

TABLE_NAME = 'crispr_papers_info'
# The prompts only pause the run so the operator can abort (Ctrl-C) when the
# target is wrong; the typed answer is not inspected.
input('你确定TABLE_NAME是{}吗?'.format(TABLE_NAME))  # phosphorus_synthesis

excels_dir = os.path.join(os.path.dirname(__file__), 'CRISPR/CRISPR_engineered')
if_file_path = os.path.join(os.path.dirname(__file__), 'CRISPR/2023JCR.xlsx')
input('你确定导入文件夹是{}吗?'.format(excels_dir))

# MySQL connection setup.
# NOTE(review): credentials are hard-coded in source; consider moving them to
# environment variables or a config file that is kept out of version control.
connection = mysql.connector.connect(
    host='100.84.94.73',
    user='metadata_mat_papers',
    password='siat-mic',
    database='metadata_mat_papers'
)
cursor = connection.cursor()

# Table columns written by insert_record(), in schema order.
INSERT_COLUMNS = (
    'doi', 'unique_id', 'author', 'title', 'journal', 'year', 'volume',
    'number', 'pages', 'month', 'issn', 'eissn', 'researcher_id', 'if2023',
    'if5', 'journal_index', 'jcr_quartile', 'orcid', 'early_access_date',
    'scihub_downlowded', 'convert2md', 'pdf_url', 'md_url', 'abstract',
    'image_url', 'en_text_content', 'cited_reference_count', 'doi_link',
    'research_areas', 'unique_wos_id',
)

# Entry keys that differ from the column name; candidates are tried in order.
# Columns not listed here are read from the entry key of the same name.
ENTRY_KEY_ALIASES = {
    'unique_id': ('unique-id',),
    'researcher_id': ('researcherid-numbers',),
    # 'ocrid-numbers' is the (misspelled) key the script has always read; the
    # correctly spelled key is accepted as well.
    'orcid': ('orcid-numbers', 'ocrid-numbers'),
    'early_access_date': ('earlyaccessdate',),
    # The table defines `en_text_content`; entries carry the text under
    # 'text_content'.
    'en_text_content': ('en_text_content', 'text_content'),
}


def _quote_identifier(name):
    """Return *name* as a backtick-quoted MySQL identifier.

    Identifiers cannot be bound as query parameters, so embedded backticks
    are doubled to keep the name from terminating the quoted identifier.
    """
    return '`' + str(name).replace('`', '``') + '`'


def _cell(value):
    """Return None for a missing cell (None/NaN/NA/NaT), else *value* unchanged."""
    return None if pd.isna(value) else value


def _text(value):
    """Return the cell as ``str``, or None when the cell is missing."""
    value = _cell(value)
    return None if value is None else str(value)


def _integer(value):
    """Return the cell as ``int``, or None when the cell is missing.

    Raises:
        ValueError: if the cell holds a value ``int()`` cannot convert.
    """
    value = _cell(value)
    return None if value is None else int(value)


def check_table_exists(table_name):
    """Return True if *table_name* exists in the currently selected database."""
    # The table name is a value in this query, so it is bound as a parameter
    # rather than interpolated into the SQL text.
    cursor.execute(
        """
        SELECT COUNT(*)
        FROM information_schema.tables
        WHERE table_schema = DATABASE() AND table_name = %s
        """,
        (table_name,),
    )
    return cursor.fetchone()[0] == 1


def create_table(table_name):
    """Create the paper-metadata table *table_name* if it does not exist yet."""
    if check_table_exists(table_name):
        return
    query = f"""
        CREATE TABLE IF NOT EXISTS {_quote_identifier(table_name)} (
            doi VARCHAR(255) PRIMARY KEY,
            unique_id VARCHAR(255),
            author TEXT,
            title TEXT,
            journal VARCHAR(255),
            year INT,
            volume VARCHAR(50),
            number VARCHAR(50),
            pages VARCHAR(50),
            month VARCHAR(50),
            issn VARCHAR(50),
            eissn VARCHAR(50),
            researcher_id TEXT,
            if2023 VARCHAR(50),
            if5 VARCHAR(50),
            journal_index VARCHAR(50),
            jcr_quartile VARCHAR(50),
            orcid TEXT,
            early_access_date VARCHAR(50),
            scihub_downlowded VARCHAR(50),
            convert2md VARCHAR(50),
            pdf_url TEXT,
            md_url TEXT,
            abstract TEXT,
            image_url JSON,
            en_text_content LONGTEXT,
            cited_reference_count INT,
            doi_link TEXT,
            research_areas TEXT,
            unique_wos_id VARCHAR(255)
        );
    """
    cursor.execute(query)


def record_exists(doi, table_name):
    """Return True if a row with primary key *doi* is already in *table_name*."""
    query = f"SELECT COUNT(*) FROM {_quote_identifier(table_name)} WHERE doi = %s"
    cursor.execute(query, (doi,))
    return cursor.fetchone()[0] > 0


def insert_record(entry, table_name):
    """Insert one paper *entry* (a dict) into *table_name*.

    Args:
        entry: mapping of metadata keys to values.  Keys equal the column
            names except for those listed in ``ENTRY_KEY_ALIASES``.
        table_name: target table.

    Only columns for which the entry carries a non-None value are named in
    the INSERT statement; the remaining columns are left to their column
    defaults.  The statement is not committed here.
    """
    row = {}
    for column in INSERT_COLUMNS:
        for key in ENTRY_KEY_ALIASES.get(column, (column,)):
            value = entry.get(key)
            if value is not None:
                row[column] = value
                break

    column_sql = ', '.join(_quote_identifier(column) for column in row)
    placeholders = ', '.join(['%s'] * len(row))
    query = (
        f"INSERT INTO {_quote_identifier(table_name)} ({column_sql}) "
        f"VALUES ({placeholders})"
    )
    cursor.execute(query, tuple(row.values()))


def _load_journal_metrics(jcr_df):
    """Index the JCR sheet by lower-cased journal name.

    Returns a dict mapping each lower-cased ``JournalName`` to the entry
    fields derived from that row.  When a name occurs more than once the
    first row wins.  Rows whose name is not a string are skipped.
    """
    metrics = {}
    columns = ('JournalName', 'IF2023', 'IF5', 'INDEX', 'Quartile')
    rows = zip(*(jcr_df[column] for column in columns))
    for name, if2023, if5, journal_index, quartile in rows:
        if not isinstance(name, str):
            continue
        key = name.lower()
        if key in metrics:
            continue
        metrics[key] = {
            'if2023': _cell(if2023),
            'if5': _cell(if5),
            'journal_index': _cell(journal_index),
            'jcr_quartile': _cell(quartile),
        }
    return metrics


def _build_entry(row, journal_metrics):
    """Convert one exported spreadsheet row (a dict) into an insertable entry.

    Missing cells become None instead of the strings 'nan'/'None', so that
    absent values are stored as NULL and a missing DOI can be detected.
    """
    entry = {
        'doi': _text(row['DOI']),
        'title': _text(row['Article Title']),
        'journal': _text(row['Source Title']),
        'abstract': _text(row['Abstract']),
        'cited_reference_count': _integer(row['Cited Reference Count']),
        'year': _integer(row['Publication Year']),
        'doi_link': _text(row['DOI Link']),
        'research_areas': _text(row['Research Areas']),
        'unique_wos_id': _text(row['UT (Unique WOS ID)']),
    }
    journal = entry['journal']
    if journal is not None:
        # Case-insensitive match against the JCR sheet; no match adds nothing.
        entry.update(journal_metrics.get(journal.lower(), {}))
    return entry


try:
    # Load the JCR workbook once and index it, instead of scanning the whole
    # sheet again for every imported row.
    journal_metrics = _load_journal_metrics(pd.read_excel(if_file_path))

    # Create the table if it doesn't exist
    create_table(TABLE_NAME)

    # Sorted so the import order does not depend on the directory listing.
    excel_names = sorted(
        name for name in os.listdir(excels_dir) if name.endswith('.xls')
    )
    for excel_name in excel_names:
        file_path = os.path.join(excels_dir, excel_name)
        print(file_path)
        for row in pd.read_excel(file_path).to_dict('records'):
            entry = _build_entry(row, journal_metrics)
            doi = entry['doi']
            # The DOI is the primary key: skip rows without one before
            # touching the database, then skip rows already imported.
            if not doi:
                continue
            if not record_exists(doi, TABLE_NAME):
                insert_record(entry, TABLE_NAME)

    # Commit the changes only after every file was processed successfully.
    connection.commit()
finally:
    # Always release the cursor and connection, even when the import fails.
    cursor.close()
    connection.close()

print("Data has been inserted into the database!")