Files
datapipe/clean/step3_path_change_with_database.py
2025-01-18 17:09:51 +08:00

52 lines
1.6 KiB
Python

import sqlite3
import mysql.connector
import tqdm
import os
# Table whose pdf_url column is rewritten by this script.
TABLE_NAME = 'mp_cif_info'
# Pause so the operator can double-check the target table before anything is
# written. The typed answer is ignored; interrupt the script to abort.
input('TABLE_NAME = {} ?'.format(TABLE_NAME))
cur_dir = os.path.dirname(os.path.abspath(__file__))
# MySQL connection setup.
# Every setting can be overridden through an environment variable so the
# script can be pointed at another server without editing the source. The
# fallbacks are the values that used to be hard-coded, so behaviour is
# unchanged when no variable is set.
# NOTE(review): the fallback credentials are still stored in the source;
# provide them through the environment and drop the defaults when possible.
mysql_connection = mysql.connector.connect(
    host=os.environ.get('MP_CIF_DB_HOST', '100.84.94.73'),
    user=os.environ.get('MP_CIF_DB_USER', 'metadata_mat_papers'),
    password=os.environ.get('MP_CIF_DB_PASSWORD', 'siat-mic'),
    database=os.environ.get('MP_CIF_DB_NAME', 'metadata_mat_papers'),
)
# Point pdf_url at 'mp_cif/pdfs/<doi>.pdf' for every row of TABLE_NAME whose
# scihub_downloaded column is 'success'. Rows whose pdf_url already starts
# with the 'mp_cif/pdfs' path components are left alone, so the script can be
# re-run after an interruption.
mysql_cursor = None  # lets the cleanup below cope with a failed cursor()
try:
    mysql_cursor = mysql_connection.cursor()
    # Fetch every row whose download succeeded.
    query = f"SELECT doi, pdf_url FROM {TABLE_NAME} WHERE scihub_downloaded = 'success';"
    mysql_cursor.execute(query)
    results = mysql_cursor.fetchall()

    # A table name cannot be bound as a parameter, so it is interpolated;
    # the values go through %s placeholders so that a DOI containing quotes
    # or backslashes can neither break nor inject into the statement.
    update_query = f"UPDATE {TABLE_NAME} SET pdf_url = %s WHERE doi = %s;"

    for doi, pdf_url in tqdm.tqdm(results, total=len(results)):
        # Skip rows that were already rewritten. Comparing a slice (instead
        # of indexing [0] and [1]) avoids an IndexError when pdf_url has no
        # '/' in it.
        if pdf_url is not None and pdf_url.split('/')[:2] == ['mp_cif', 'pdfs']:
            continue
        # The DOI is used verbatim as the file name (no character
        # substitution), so any '/' in it becomes part of the stored path.
        pdf = doi + '.pdf'
        # New path. It is stored in the database and matched with
        # split('/') above, so it is built with an explicit '/' rather than
        # the platform-dependent os.path.join separator.
        pdf_path = 'mp_cif/pdfs/' + pdf
        mysql_cursor.execute(update_query, (pdf_path, doi))
        # Commit row by row so progress survives an interruption.
        mysql_connection.commit()
    # Commit whatever is still pending.
    mysql_connection.commit()
except mysql.connector.Error as error:
    print("Failed to insert record into MySQL table: {}".format(error))
    # Roll back the uncommitted part of the transaction on error.
    mysql_connection.rollback()
finally:
    # Close the cursor (if it was ever created) and the connection.
    if mysql_cursor is not None:
        mysql_cursor.close()
    mysql_connection.close()