# datapipe/clean/stp2_down_ipidea_multi.py
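"""Batch-download PDFs for the DOIs found in a directory of BibTeX files.

DOIs are extracted from .bib files, deduplicated, and fetched through the
scidownl CLI against several Sci-Hub mirrors, with each worker process using
a rotating HTTP proxy from the ipidea proxy service. The outcome for each DOI
('success', 'broken', 'failed', 'timeout') is recorded in a local SQLite
database so that interrupted runs can be resumed without re-downloading.
"""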
import os
import re
import time
import tqdm
import requests
import subprocess
import concurrent.futures
import sqlite3
from scidownl import scihub_download
import logging
import pymupdf
NUM_PROCESSES = 32  # number of concurrent worker processes
SCIHUB_URLS = [
    "https://sci-hub.st/",
    "https://sci-hub.se/",
    "https://sci-hub.ru/"
]
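# ipidea proxy API endpoints: one request for a batch of NUM_PROCESSES proxies
# and one for a single fallback proxy. The `num` query parameter is assumed to
# control how many proxy addresses the endpoint returns; the remaining
# parameters are passed through to the service unchanged.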
PROXY_SERVICE_URL = f"http://api.proxy.ipidea.io/getProxyIp?num={NUM_PROCESSES}&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
SINGLE_PROXY_SERVICE_URL = f"http://api.proxy.ipidea.io/getProxyIp?num=1&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
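# Matches BibTeX fields of the form `DOI = {10.xxxx/...}` and captures the DOI itself.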
DOI_PATTERN = re.compile(r"DOI\s*=\s*\{(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\}", re.IGNORECASE)
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] | %(asctime)s | %(message)s')
logger = logging.getLogger(__name__)
def get_directories(bib_dir_name, output_dirname):
    current_path = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(current_path, output_dirname)
    bib_dir_path = os.path.join(current_path, bib_dir_name)
    db_path = os.path.join(current_path, "doi_status.db")
    return output_dir, bib_dir_path, db_path
def create_directory_if_not_exists(directory):
    os.makedirs(directory, exist_ok=True)
def fetch_proxies():
    proxies = []
    try:
        response = requests.get(PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
            if proxies:
                logger.info(f"Fetched proxies: {proxies}")
                return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None
def fetch_proxy():
    proxies = []
    try:
        response = requests.get(SINGLE_PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
            if proxies:
                logger.info(f"Fetched proxies: {proxies}")
                return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None
def read_dois_from_files(bib_dir_path):
    all_dois = []
    for bib_file_name in sorted(os.listdir(bib_dir_path)):
        if bib_file_name.endswith(".bib"):
            with open(os.path.join(bib_dir_path, bib_file_name), "r") as file:
                dois = DOI_PATTERN.findall(file.read())
                logger.info(f"{bib_file_name} has {len(dois)} doi(s)")
                all_dois.extend(dois)
    return list(set(all_dois))
def filter_downloaded_dois(all_dois, output_dir):
    for file_name in os.listdir(output_dir):
        if file_name.endswith(".pdf"):
            doi = file_name.replace(".pdf", "").replace("_", "/")
            if doi in all_dois:
                all_dois.remove(doi)
    return all_dois
def read_dois_from_db(db_path, status):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT doi FROM doi_status WHERE status = ?", (status,))
    dois = [row[0] for row in cursor.fetchall()]
    conn.close()
    return dois
def write_doi_to_db(db_path, doi, output_dirname, status):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT OR REPLACE INTO doi_status (doi, status, pdf_url) VALUES (?, ?, ?)",
        (doi, status, f"{output_dirname}/{doi.replace('/', '_')}.pdf"),
    )
    conn.commit()
    conn.close()
def initialize_db(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS doi_status (
            doi TEXT PRIMARY KEY,
            status TEXT,
            pdf_url TEXT
        )
    ''')
    conn.commit()
    cursor.execute("PRAGMA journal_mode=WAL")
    conn.commit()
    conn.close()
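# Status values written by download_doi: 'success' (PDF downloaded and opens in
# pymupdf), 'broken' (file exists but cannot be opened), 'failed' (scidownl
# produced no output or raised an error), 'timeout' (mirror reported "No pdf tag").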
def download_doi(doi, output_dir, proxy, scihub_urls, db_path):
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    output_dirname = os.path.basename(output_dir)
    for scihub_url in scihub_urls:
        output_path = os.path.join(output_dir, f"{doi.replace('/', '_')}.pdf")
        # The --proxy argument is passed as "<protocol>=<proxy address>".
        proxy_url = "https=" + proxy['https']
        try:
            result = subprocess.run(
                ['scidownl', 'download', '--doi', doi, '--out', output_path,
                 '--scihub-url', scihub_url, '--proxy', proxy_url],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
            logger.info(result.stderr)
            if "No pdf tag" in result.stderr:
                timeout_dois.append(doi)
                write_doi_to_db(db_path, doi, output_dirname, 'timeout')
                break
            elif ("403" in result.stderr or "Unable to connect to proxy" in result.stderr
                  or "504" in result.stderr
                  or 'crawling_failed, error: HTTPSConnectionPool' in result.stderr):
                logger.warning("Proxy error detected, fetching new proxy.")
                new_proxies = fetch_proxy()
                if new_proxies:
                    proxy = new_proxies[0]
                # time.sleep(2)
                continue
            elif result.stdout.strip() != '':
                try:
                    # Try to open the PDF to verify it is not corrupted.
                    with pymupdf.open(output_path) as pdf:
                        logger.info(f"Downloaded {doi} successfully.")
                        write_doi_to_db(db_path, doi, output_dirname, 'success')
                        success_dois.append(doi)
                except Exception:
                    write_doi_to_db(db_path, doi, output_dirname, 'broken')
                    logger.info(f"{doi}.pdf has been broken!")
                    broken_dois.append(doi)
                break
            else:
                write_doi_to_db(db_path, doi, output_dirname, 'failed')
                break
        except subprocess.CalledProcessError as e:
            logger.error(f"Error: {e}")
            failed_dois.append(doi)
            write_doi_to_db(db_path, doi, output_dirname, 'failed')
            continue
    return success_dois, broken_dois, failed_dois, timeout_dois
def download_dois(all_dois, output_dir, db_path):
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    proxies = fetch_proxies()
    if not proxies:
        logger.error("No proxies available, aborting download.")
        return
    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for i, doi in enumerate(all_dois):
            proxy = proxies[i % len(proxies)]
            futures.append(executor.submit(download_doi, doi, output_dir, proxy, SCIHUB_URLS, db_path))
        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures),
                                desc='Downloading DOIs', unit='doi'):
            result = future.result()
            if result:
                success, broken, failed, timeout = result
                success_dois.extend(success)
                broken_dois.extend(broken)
                failed_dois.extend(failed)
                timeout_dois.extend(timeout)
    logger.info(f"Success: {len(success_dois)}, Broken: {len(broken_dois)}, "
                f"Failed: {len(failed_dois)}, Timeout: {len(timeout_dois)}")
def main():
    bib_dir_name = "synthesis23-25"
    output_dirname = "synthesis23-25_pdfs"
    input(f"Are you sure the directories are '{bib_dir_name}' and '{output_dirname}'? Press Enter to continue.")
    output_dir, bib_dir_path, db_path = get_directories(bib_dir_name, output_dirname)
    create_directory_if_not_exists(output_dir)
    initialize_db(db_path)
    all_dois = read_dois_from_files(bib_dir_path)
    logger.info(f"Total {len(all_dois)} doi(s)")
    all_dois = filter_downloaded_dois(all_dois, output_dir)
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'success')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'failed')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'timeout')]
    download_dois(all_dois, output_dir, db_path)


if __name__ == "__main__":
    main()