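"""Bulk-download paper PDFs from Sci-Hub for every DOI found in a folder of
BibTeX files.

The script shells out to the scidownl CLI behind a pool of rotating HTTP
proxies, verifies each downloaded PDF with PyMuPDF, and records per-DOI
status (success / broken / failed / timeout) in a local SQLite database so
interrupted runs can be resumed."""
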
import os
import re
import time
import logging
import sqlite3
import subprocess
import concurrent.futures

import requests
import tqdm
import pymupdf  # scidownl is invoked via its CLI (subprocess), not its Python API


NUM_PROCESSES = 32  # number of concurrent download processes

SCIHUB_URLS = [
    "https://sci-hub.st/",
    "https://sci-hub.se/",
    "https://sci-hub.ru/",
]
PROXY_SERVICE_URL = f"http://api.proxy.ipidea.io/getProxyIp?num={NUM_PROCESSES}&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
SINGLE_PROXY_SERVICE_URL = "http://api.proxy.ipidea.io/getProxyIp?num=1&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
DOI_PATTERN = re.compile(r"DOI\s*=\s*\{(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\}", re.IGNORECASE)
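# Illustrative example: given a BibTeX field `DOI = {10.1000/xyz123}`,
# DOI_PATTERN captures "10.1000/xyz123" (10.1000/xyz123 is a made-up DOI).
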
logging.basicConfig(level=logging.INFO, format='[%(levelname)s] | %(asctime)s | %(message)s')
logger = logging.getLogger(__name__)


def get_directories(bib_dir_name, output_dirname):
    """Resolve the bib folder, the PDF output folder, and the SQLite database
    path relative to this script's location."""
    current_path = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(current_path, output_dirname)
    bib_dir_path = os.path.join(current_path, bib_dir_name)
    db_path = os.path.join(current_path, "doi_status.db")
    return output_dir, bib_dir_path, db_path


def create_directory_if_not_exists(directory):
    os.makedirs(directory, exist_ok=True)


def _get_proxy_list(service_url):
    """Request proxies from the proxy service and return them as a list of
    requests-style proxy dicts, or None if nothing could be fetched."""
    proxies = []
    try:
        response = requests.get(service_url, timeout=10)  # timeout guards against a hanging service
        if response.status_code == 200:
            for proxy in response.text.strip().split('\r\n'):
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
        if proxies:
            logger.info(f"Fetched proxies: {proxies}")
            return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None


def fetch_proxies():
    """Fetch one proxy per worker process."""
    return _get_proxy_list(PROXY_SERVICE_URL)


def fetch_proxy():
    """Fetch a single replacement proxy."""
    return _get_proxy_list(SINGLE_PROXY_SERVICE_URL)


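# A returned proxy entry is a requests-style mapping, e.g. (made-up address):
#   {"http": "http://1.2.3.4:8000", "https": "http://1.2.3.4:8000"}
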
def read_dois_from_files(bib_dir_path):
    """Collect the deduplicated list of DOIs from every .bib file in bib_dir_path."""
    all_dois = []
    for bib_file_name in sorted(os.listdir(bib_dir_path)):
        if bib_file_name.endswith(".bib"):
            with open(os.path.join(bib_dir_path, bib_file_name), "r", encoding="utf-8") as file:
                dois = DOI_PATTERN.findall(file.read())
                logger.info(f"{bib_file_name} has {len(dois)} doi(s)")
                all_dois.extend(dois)
    return list(set(all_dois))


def filter_downloaded_dois(all_dois, output_dir):
    """Drop DOIs whose PDFs already exist in output_dir. Note that the mapping
    is lossy for DOIs that themselves contain underscores."""
    for filename in os.listdir(output_dir):
        if filename.endswith(".pdf"):
            doi = filename.replace(".pdf", "").replace("_", "/")
            if doi in all_dois:
                all_dois.remove(doi)
    return all_dois


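# Illustrative filename mapping used above (made-up DOI):
#   "10.1000_xyz123.pdf"  <->  DOI "10.1000/xyz123"
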
def read_dois_from_db(db_path, status):
    """Return all DOIs recorded in the database with the given status."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Parameterized query instead of f-string interpolation into SQL.
    cursor.execute("SELECT doi FROM doi_status WHERE status = ?", (status,))
    dois = [row[0] for row in cursor.fetchall()]
    conn.close()
    return dois


def write_doi_to_db(db_path, doi, output_dirname, status):
    """Upsert the status and expected PDF path for a DOI."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT OR REPLACE INTO doi_status (doi, status, pdf_url) VALUES (?, ?, ?)",
        (doi, status, f"{output_dirname}/{doi.replace('/', '_')}.pdf"),
    )
    conn.commit()
    conn.close()


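# Illustrative row written by write_doi_to_db (made-up DOI):
#   ('10.1000/xyz123', 'success', 'synthesis23-25_pdfs/10.1000_xyz123.pdf')
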
def initialize_db(db_path):
    """Create the doi_status table if needed and enable WAL mode so the
    concurrent worker processes can write without blocking each other."""
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS doi_status (
            doi TEXT PRIMARY KEY,
            status TEXT,
            pdf_url TEXT
        )
    ''')
    conn.commit()
    cursor.execute("PRAGMA journal_mode=WAL")
    conn.commit()
    conn.close()


def download_doi(doi, output_dir, proxy, scihub_urls, db_path):
    """Try to download a single DOI, falling back across Sci-Hub mirrors.

    Each outcome is persisted to the database and returned as four lists:
    success / broken / failed / timeout."""
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    output_dirname = os.path.basename(output_dir)
    for scihub_url in scihub_urls:
        output_path = os.path.join(output_dir, f"{doi.replace('/', '_')}.pdf")
        proxy_url = "https=" + proxy['https']

        try:
            result = subprocess.run(
                ['scidownl', 'download', '--doi', doi, '--out', output_path,
                 '--scihub-url', scihub_url, '--proxy', proxy_url],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
            logger.info(result.stderr)

            if "No pdf tag" in result.stderr:
                # The mirror answered but exposed no PDF link; record and stop retrying.
                timeout_dois.append(doi)
                write_doi_to_db(db_path, doi, output_dirname, 'timeout')
                break
            elif ("403" in result.stderr
                  or "Unable to connect to proxy" in result.stderr
                  or "504" in result.stderr
                  or 'crawling_failed, error: HTTPSConnectionPool' in result.stderr):
                logger.warning("Proxy error detected, fetching new proxy.")
                new_proxies = fetch_proxy()
                if new_proxies:  # fetch_proxy() returns None on failure; keep the old proxy then
                    proxy = new_proxies[0]
                # time.sleep(2)
                continue
            elif result.stdout.strip() != '':
                try:
                    # Open the PDF with PyMuPDF to verify it is not corrupted.
                    with pymupdf.open(output_path):
                        logger.info(f"Downloaded {doi} successfully.")
                        write_doi_to_db(db_path, doi, output_dirname, 'success')
                        success_dois.append(doi)
                except Exception:
                    write_doi_to_db(db_path, doi, output_dirname, 'broken')
                    logger.info(f"{doi}.pdf is broken!")
                    broken_dois.append(doi)
                break
            else:
                write_doi_to_db(db_path, doi, output_dirname, 'failed')
                break

        except subprocess.CalledProcessError as e:
            logger.error(f"Error: {e}")
            failed_dois.append(doi)
            write_doi_to_db(db_path, doi, output_dirname, 'failed')
            continue

    return success_dois, broken_dois, failed_dois, timeout_dois


def download_dois(all_dois, output_dir, db_path):
    """Fan the DOI list out over a process pool, assigning proxies round-robin,
    and aggregate the per-DOI results."""
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    proxies = fetch_proxies()
    if not proxies:
        logger.error("No proxies available, aborting.")
        return

    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for i, doi in enumerate(all_dois):
            proxy = proxies[i % len(proxies)]  # round-robin over the proxy pool
            futures.append(executor.submit(download_doi, doi, output_dir, proxy, SCIHUB_URLS, db_path))

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures),
                                desc='Downloading DOIs', unit='doi'):
            result = future.result()
            if result:
                success, broken, failed, timeout = result
                success_dois.extend(success)
                broken_dois.extend(broken)
                failed_dois.extend(failed)
                timeout_dois.extend(timeout)

    logger.info(f"Success: {len(success_dois)}, Broken: {len(broken_dois)}, "
                f"Failed: {len(failed_dois)}, Timeout: {len(timeout_dois)}")


def main():
    bib_dir_name = "synthesis23-25"
    output_dirname = "synthesis23-25_pdfs"
    input(f"Confirm the folders are '{bib_dir_name}' and '{output_dirname}' (press Enter to continue): ")
    output_dir, bib_dir_path, db_path = get_directories(bib_dir_name, output_dirname)
    create_directory_if_not_exists(output_dir)

    initialize_db(db_path)

    all_dois = read_dois_from_files(bib_dir_path)
    logger.info(f"Total {len(all_dois)} doi(s)")

    all_dois = filter_downloaded_dois(all_dois, output_dir)

    # Skip DOIs already resolved in a previous run.
    for status in ('success', 'failed', 'timeout'):
        done = set(read_dois_from_db(db_path, status))
        all_dois = [doi for doi in all_dois if doi not in done]

    download_dois(all_dois, output_dir, db_path)


if __name__ == "__main__":
    main()