第一次合并clean代码
This commit is contained in:
88
clean/step1_modify_status_with_database.py
Normal file
88
clean/step1_modify_status_with_database.py
Normal file
@@ -0,0 +1,88 @@
|
||||
import os
|
||||
import tqdm
|
||||
import sqlite3
|
||||
import mysql.connector
|
||||
import PyPDF2
|
||||
|
||||
def read_dois_from_db(db_path):
    """Return every ``doi`` value stored in the ``doi_status`` table.

    Args:
        db_path: Path of the SQLite database file to open.

    Returns:
        A list holding the first column of each row fetched by
        ``SELECT doi FROM doi_status``; empty when the table has no rows.

    Raises:
        sqlite3.Error: Propagated unchanged from the query, for example
            when the ``doi_status`` table does not exist.
    """
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT doi FROM doi_status;")
        return [row[0] for row in cursor.fetchall()]
    finally:
        # Close on the error path too, so a failed query does not leak
        # the connection.
        conn.close()
|
||||
|
||||
def _pdf_is_readable(pdf_path, label):
    """Report whether PyPDF2 can open and parse the file at ``pdf_path``.

    Args:
        pdf_path: Path of the PDF file to probe.
        label: Identifier printed in the message for unexpected failures
            (``main`` passes the DOI).

    Returns:
        True when ``PyPDF2.PdfReader`` is constructed without raising,
        False when opening or parsing raises any exception.
    """
    try:
        with open(pdf_path, 'rb') as file:
            # Constructing the reader is the whole check; the reader
            # object itself is not needed afterwards.
            PyPDF2.PdfReader(file)
    except (PyPDF2.errors.PdfReadError, PyPDF2.errors.PdfStreamError):
        # The file exists but is not a parseable PDF.
        return False
    except Exception as e:
        # Any other failure while opening/parsing is reported and treated
        # as "not readable" so the remaining files are still processed.
        print(f"处理 PDF {label} 时出现未知错误: {e}")
        return False
    return True


def main():
    """Update ``scihub_downloaded`` in MySQL from the PDFs found on disk.

    Reads every ``doi`` from the ``mp_cif_info`` table. For each DOI whose
    ``<doi>.pdf`` exists under ``mp_cif/pdfs`` next to this script, the
    row's ``scihub_downloaded`` column is set to ``'success'`` when PyPDF2
    can parse the file and to NULL otherwise. Rows without a file on disk
    are left untouched. Each update is committed individually.

    A ``mysql.connector.Error`` raised by the SELECT or by any
    UPDATE/commit is printed, the open transaction is rolled back and
    processing stops. The cursor and connection are always closed.
    """
    cur_path = os.path.dirname(os.path.abspath(__file__))

    TABLE_NAME = 'mp_cif_info'
    # Single statement for both outcomes; the status value (or NULL) and
    # the DOI are bound as parameters.
    update_query = f"UPDATE {TABLE_NAME} SET scihub_downloaded = %s WHERE doi = %s"

    # NOTE(review): host and credentials are hard-coded in source; consider
    # loading them from the environment or a config file kept out of VCS.
    mysql_connection = mysql.connector.connect(
        host='100.84.94.73',
        user='metadata_mat_papers',
        password='siat-mic',
        database='metadata_mat_papers'
    )
    mysql_cursor = mysql_connection.cursor()

    try:
        # Fetch all DOIs.
        mysql_cursor.execute(f"SELECT doi FROM {TABLE_NAME};")
        dois = [row[0] for row in mysql_cursor.fetchall()]

        for doi in tqdm.tqdm(dois):
            if not doi:
                # A NULL/empty doi cannot name a file; skip it so that
                # `doi + '.pdf'` does not abort the whole run.
                continue

            # NOTE(review): the DOI is used verbatim as the file stem, so a
            # value containing '/' resolves to a nested path. Confirm the
            # column holds filename-safe values.
            pdf = doi + '.pdf'

            # Change this to your own PDF directory.
            pdf_path = os.path.join(cur_path, 'mp_cif/pdfs', pdf)

            if not os.path.exists(pdf_path):
                continue

            # PDF validation is kept apart from the database write so that
            # a MySQL failure reaches the handler below and is not mistaken
            # for a PDF problem. None is sent to MySQL as NULL.
            status = 'success' if _pdf_is_readable(pdf_path, doi) else None
            mysql_cursor.execute(update_query, (status, doi))
            mysql_connection.commit()

    except mysql.connector.Error as error:
        print("Failed to insert record into MySQL table: {}".format(error))
        # Undo the uncommitted part of the transaction on error.
        mysql_connection.rollback()

    finally:
        # Close the cursor and the connection.
        mysql_cursor.close()
        mysql_connection.close()
|
||||
|
||||
# Script entry point: run the status sync only when this file is executed
# directly, not when it is imported.
if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user