"""Record in MySQL which DOIs have a locally stored, parseable PDF."""

import os
import sqlite3

import mysql.connector
import PyPDF2
import tqdm


def read_dois_from_db(db_path):
    """Return every ``doi`` value stored in the ``doi_status`` table.

    Args:
        db_path: Path to the SQLite database file.

    Returns:
        A list with the first column of each row returned by
        ``SELECT doi FROM doi_status``.

    Raises:
        sqlite3.Error: If the query fails; the connection is closed
            before the error propagates.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT doi FROM doi_status;")
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Close even when the query raises, so the handle is never leaked.
        conn.close()


def main():
    """Update ``scihub_downloaded`` for every DOI whose PDF file exists.

    For each ``doi`` in the ``mp_cif_info`` table, the file
    ``<script dir>/mp_cif/pdfs/<doi>.pdf`` is looked up. When it exists,
    the column is set to ``'success'`` if PyPDF2 can open it and to NULL
    otherwise. Rows whose PDF file does not exist are left untouched.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))

    # Disabled legacy flow that worked against the SQLite status database.
    # db_path = os.path.join(cur_path, 'psk_high_cited', 'doi_status.db')
    # dois_db = read_dois_from_db(db_path)
    # for doi in tqdm.tqdm(dois_db):
    #     pdf = doi.replace('/','_').replace('<','_').replace('>','_').replace(':','_') + '.pdf'
    #     pdf_path = os.path.join(cur_path, 'psk_high_cited/pdfs', pdf)
    #     if os.path.exists(pdf_path):
    #         conn = sqlite3.connect(db_path)
    #         cursor = conn.cursor()
    #         cursor.execute(f"UPDATE doi_status SET status = 'success' WHERE doi = '{doi}';")
    #         conn.close()

    ###########################################################################

    TABLE_NAME = 'mp_cif_info'
    # The table name is a local constant, so interpolating it is safe; the
    # values themselves are always passed as bound parameters.
    update_query = (
        f"UPDATE {TABLE_NAME} SET scihub_downloaded = %s WHERE doi = %s"
    )

    # NOTE(review): credentials are hard-coded in source; consider loading
    # them from the environment or a config file kept out of version control.
    mysql_connection = mysql.connector.connect(
        host='100.84.94.73',
        user='metadata_mat_papers',
        password='siat-mic',
        database='metadata_mat_papers'
    )
    mysql_cursor = mysql_connection.cursor()

    try:
        # Fetch all DOIs.
        mysql_cursor.execute(f"SELECT doi FROM {TABLE_NAME};")
        dois = [row[0] for row in mysql_cursor.fetchall()]

        for doi in tqdm.tqdm(dois):
            if not doi:
                # A NULL/empty DOI cannot be mapped to a file name.
                continue

            # pdf = doi.replace('/','_').replace('<','_').replace('>','_').replace(':','_') + '.pdf'
            pdf = doi + '.pdf'
            # Change this to your own PDF directory.
            pdf_path = os.path.join(cur_path, 'mp_cif/pdfs', pdf)
            if not os.path.exists(pdf_path):
                continue

            # Only the parse attempt is guarded here, so that database
            # errors are not mistaken for PDF errors and reach the outer
            # mysql.connector.Error handler directly.
            try:
                with open(pdf_path, 'rb') as file:
                    # Constructing the reader raises if the file cannot
                    # be parsed.
                    PyPDF2.PdfReader(file)
            except (PyPDF2.errors.PdfReadError, PyPDF2.errors.PdfStreamError):
                # Parsing failed: store NULL (None maps to SQL NULL).
                status = None
            except Exception as e:
                # Any other failure is reported and also stored as NULL.
                print(f"处理 PDF {doi} 时出现未知错误: {e}")
                status = None
            else:
                status = 'success'

            mysql_cursor.execute(update_query, (status, doi))
            mysql_connection.commit()

    except mysql.connector.Error as error:
        print("Failed to insert record into MySQL table: {}".format(error))
        # Roll back the current transaction on failure.
        mysql_connection.rollback()

    finally:
        # Close the cursor and the connection.
        mysql_cursor.close()
        mysql_connection.close()


if __name__ == "__main__":
    main()