"""Batch-download PDFs for the DOIs found in a directory of .bib files via
Sci-Hub (driven through the scidownl CLI), rotating HTTP proxies and tracking
per-DOI download status in a local SQLite database."""

import os
import re
import time
import tqdm
import requests
import subprocess
import concurrent.futures
import sqlite3
from scidownl import scihub_download  # not used directly; downloads go through the scidownl CLI
import logging
import pymupdf

NUM_PROCESSES = 32  # number of concurrent download processes

SCIHUB_URLS = [
    "https://sci-hub.st/",
    "https://sci-hub.se/",
    "https://sci-hub.ru/",
]

PROXY_SERVICE_URL = f"http://api.proxy.ipidea.io/getProxyIp?num={NUM_PROCESSES}&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
SINGLE_PROXY_SERVICE_URL = "http://api.proxy.ipidea.io/getProxyIp?num=1&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"

# Matches "DOI = {10.xxxx/...}" entries in BibTeX files.
DOI_PATTERN = re.compile(r"DOI\s*=\s*\{(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\}", re.IGNORECASE)

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] | %(asctime)s | %(message)s')
logger = logging.getLogger(__name__)


def get_directories(bib_dir_name, output_dirname):
    """Resolve the .bib directory, the PDF output directory, and the SQLite DB path."""
    current_path = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(current_path, output_dirname)
    bib_dir_path = os.path.join(current_path, bib_dir_name)
    db_path = os.path.join(current_path, "doi_status.db")
    return output_dir, bib_dir_path, db_path


def create_directory_if_not_exists(directory):
    os.makedirs(directory, exist_ok=True)


def fetch_proxies():
    """Fetch NUM_PROCESSES proxies from the proxy service; returns None on failure."""
    proxies = []
    try:
        response = requests.get(PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
            if proxies:
                logger.info(f"Fetched proxies: {proxies}")
                return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None


def fetch_proxy():
    """Fetch a single replacement proxy from the proxy service; returns None on failure."""
    proxies = []
    try:
        response = requests.get(SINGLE_PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
            if proxies:
                logger.info(f"Fetched proxies: {proxies}")
                return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None


def read_dois_from_files(bib_dir_path):
    """Collect the deduplicated set of DOIs found in all .bib files under bib_dir_path."""
    all_dois = []
    for bib_file_name in sorted(os.listdir(bib_dir_path)):
        if bib_file_name.endswith(".bib"):
            with open(os.path.join(bib_dir_path, bib_file_name), "r") as file:
                dois = DOI_PATTERN.findall(file.read())
                logger.info(f"{bib_file_name} has {len(dois)} doi(s)")
                all_dois.extend(dois)
    return list(set(all_dois))


def filter_downloaded_dois(all_dois, output_dir):
    """Drop DOIs whose PDF already exists in output_dir (file names use '_' in place of '/')."""
    for doi in os.listdir(output_dir):
        if doi.endswith(".pdf"):
            doi = doi.replace(".pdf", "").replace("_", "/")
            if doi in all_dois:
                all_dois.remove(doi)
    return all_dois


def read_dois_from_db(db_path, status):
    """Return all DOIs recorded in the DB with the given status."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute("SELECT doi FROM doi_status WHERE status = ?", (status,))
    dois = [row[0] for row in cursor.fetchall()]
    conn.close()
    return dois


def write_doi_to_db(db_path, doi, output_dirname, status):
    """Upsert the status and expected PDF path for a DOI."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT OR REPLACE INTO doi_status (doi, status, pdf_url) VALUES (?, ?, ?)",
        (doi, status, f"{output_dirname}/{doi.replace('/', '_')}.pdf"),
    )
    conn.commit()
    conn.close()


def initialize_db(db_path):
    """Create the doi_status table if needed and enable WAL mode for concurrent writers."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS doi_status (
            doi TEXT PRIMARY KEY,
            status TEXT,
            pdf_url TEXT
        )
    ''')
    conn.commit()
    cursor.execute("PRAGMA journal_mode=WAL")
    conn.commit()
    conn.close()


def download_doi(doi, output_dir, proxy, scihub_urls, db_path):
    """Try to download one DOI, rotating over Sci-Hub mirrors; record the outcome in the DB."""
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    output_dirname = os.path.basename(output_dir)
    for scihub_url in scihub_urls:
        output_path = os.path.join(output_dir, f"{doi.replace('/', '_')}.pdf")
        proxy_url = "https=" + proxy['https']  # scidownl expects "<protocol>=<proxy url>"
        try:
            result = subprocess.run(
                ['scidownl', 'download', '--doi', doi, '--out', output_path,
                 '--scihub-url', scihub_url, '--proxy', proxy_url],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
            logger.info(result.stderr)
            if "No pdf tag" in result.stderr:
                # The mirror responded but no PDF link was found on the page.
                timeout_dois.append(doi)
                write_doi_to_db(db_path, doi, output_dirname, 'timeout')
                break
            elif ("403" in result.stderr
                  or "Unable to connect to proxy" in result.stderr
                  or "504" in result.stderr
                  or 'crawling_failed, error: HTTPSConnectionPool' in result.stderr):
                logger.warning("Proxy error detected, fetching new proxy.")
                new_proxies = fetch_proxy()
                if new_proxies:
                    proxy = new_proxies[0]
                # time.sleep(2)
                continue
            elif result.stdout.strip() != '':
                try:
                    # Try to open the PDF to verify that it is not corrupted.
                    with pymupdf.open(output_path) as pdf:
                        logger.info(f"Downloaded {doi} successfully.")
                        write_doi_to_db(db_path, doi, output_dirname, 'success')
                        success_dois.append(doi)
                except Exception:
                    write_doi_to_db(db_path, doi, output_dirname, 'broken')
                    logger.info(f"{doi}.pdf is broken!")
                    broken_dois.append(doi)
                break
            else:
                write_doi_to_db(db_path, doi, output_dirname, 'failed')
                break
        except subprocess.CalledProcessError as e:
            logger.error(f"Error: {e}")
            failed_dois.append(doi)
            write_doi_to_db(db_path, doi, output_dirname, 'failed')
            continue
    return success_dois, broken_dois, failed_dois, timeout_dois


def download_dois(all_dois, output_dir, db_path):
    """Download all DOIs in parallel, assigning proxies round-robin across workers."""
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    proxies = fetch_proxies()
    if not proxies:
        logger.error("No proxies available, aborting downloads.")
        return
    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for i, doi in enumerate(all_dois):
            proxy = proxies[i % len(proxies)]
            futures.append(executor.submit(download_doi, doi, output_dir, proxy, SCIHUB_URLS, db_path))
        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures),
                                desc='Downloading DOIs', unit='doi'):
            result = future.result()
            if result:
                success, broken, failed, timeout = result
                success_dois.extend(success)
                broken_dois.extend(broken)
                failed_dois.extend(failed)
                timeout_dois.extend(timeout)
    logger.info(f"Success: {len(success_dois)}, Broken: {len(broken_dois)}, "
                f"Failed: {len(failed_dois)}, Timeout: {len(timeout_dois)}")


def main():
    bib_dir_name = "synthesis23-25"
    output_dirname = "synthesis23-25_pdfs"
    input('Are you sure the directories are {} and {}? Press Enter to continue.'.format(bib_dir_name, output_dirname))
    output_dir, bib_dir_path, db_path = get_directories(bib_dir_name, output_dirname)
    create_directory_if_not_exists(output_dir)
    initialize_db(db_path)
    all_dois = read_dois_from_files(bib_dir_path)
    logger.info(f"Total {len(all_dois)} doi(s)")
    # Skip DOIs that already have a PDF on disk or a terminal status in the DB.
    all_dois = filter_downloaded_dois(all_dois, output_dir)
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'success')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'failed')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'timeout')]
    download_dois(all_dois, output_dir, db_path)


if __name__ == "__main__":
    main()
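

# Optional helper (a sketch; nothing above calls it): the doi_status table
# written by write_doi_to_db() can also be queried to build a retry list for a
# later pass. Treating 'broken' and 'timeout' as retryable is an assumption;
# adjust retry_statuses to taste.
def read_retryable_dois(db_path, retry_statuses=('broken', 'timeout')):
    """Return DOIs whose recorded status suggests another attempt may succeed."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    placeholders = ", ".join("?" for _ in retry_statuses)
    cursor.execute(
        f"SELECT doi FROM doi_status WHERE status IN ({placeholders})",
        tuple(retry_statuses),
    )
    dois = [row[0] for row in cursor.fetchall()]
    conn.close()
    return dois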