Files
datapipe/clean/stp2.2_remove_broken_pdf.py
2025-01-18 17:09:51 +08:00

29 lines
702 B
Python

import sqlite3
import mysql.connector
import tqdm
import os
# Delete from disk every PDF that the metadata table marks as 'broken'.
#
# Flow: ask the operator to acknowledge the target table, fetch the
# `pdf_url` of every row whose `scihub_downlowded` column equals 'broken',
# then remove each of those files.

# Table whose 'broken' rows decide which PDF files get deleted.
TABLE_NAME = 'phosphorus_synthesis_info'

# Blocks until the operator presses Enter. The reply itself is ignored, so
# the only way to abort at this point is to interrupt the script (Ctrl+C).
input('TABLE_NAME = {} ?'.format(TABLE_NAME))

# Directory containing this script; pdf_url values are joined onto it.
cur_dir = os.path.dirname(os.path.abspath(__file__))

# MySQL connection setup
# NOTE(review): credentials are hard-coded in source control -- consider
# loading them from the environment or a config file kept out of the repo.
mysql_connection = mysql.connector.connect(
    host='100.84.94.73',
    user='metadata_mat_papers',
    password='siat-mic',
    database='metadata_mat_papers'
)
try:
    mysql_cursor = mysql_connection.cursor()
    try:
        # Build the query. TABLE_NAME is a module constant, not user input;
        # the column name spelling ('scihub_downlowded') is kept exactly as
        # the query has always used it.
        query = f"SELECT pdf_url FROM {TABLE_NAME} WHERE scihub_downlowded = 'broken'"
        mysql_cursor.execute(query)
        records = mysql_cursor.fetchall()
    finally:
        mysql_cursor.close()
finally:
    # All rows are already in memory, so the connection can be released
    # before the (possibly long) deletion loop starts.
    mysql_connection.close()

removed = 0
missing = 0
for record in tqdm.tqdm(records):
    pdf_url = record[0]
    if not pdf_url:
        # NULL/empty pdf_url: nothing to delete, and joining it would
        # either raise TypeError or point at the script directory itself.
        continue
    # os.path.join returns pdf_url unchanged when it is an absolute path.
    pdf_path = os.path.join(cur_dir, pdf_url)
    try:
        os.remove(pdf_path)
    except FileNotFoundError:
        # Already gone (e.g. a previous partial run); keep going so the
        # script can be re-run safely.
        missing += 1
    else:
        removed += 1

print('removed {} file(s), {} already missing'.format(removed, missing))