First merge of the clean code
clean/stp2_down_ipidea_multi.py (Normal file, 211 lines added)
@@ -0,0 +1,211 @@
import os
import re
import time
import tqdm
import requests
import subprocess
import concurrent.futures
import sqlite3
from scidownl import scihub_download  # unused: the script shells out to the scidownl CLI instead
import logging
import pymupdf


NUM_PROCESSES = 32  # number of concurrent download processes
SCIHUB_URLS = [
    "https://sci-hub.st/",
    "https://sci-hub.se/",
    "https://sci-hub.ru/"
]
PROXY_SERVICE_URL = f"http://api.proxy.ipidea.io/getProxyIp?num={NUM_PROCESSES}&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
SINGLE_PROXY_SERVICE_URL = "http://api.proxy.ipidea.io/getProxyIp?num=1&tag=static_balance&return_type=txt&lb=1&sb=0&flow=1&protocol=http"
DOI_PATTERN = re.compile(r"DOI\s*=\s*\{(10\.\d{4,9}/[-._;()/:A-Z0-9]+)\}", re.IGNORECASE)
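
# Illustrative example of the BibTeX field DOI_PATTERN is meant to capture
# (the DOI below is made up):
#   DOI = {10.1021/acs.example.4c01234}
# Only the DOI itself (the "10.xxxx/suffix" part) is kept by the capture group.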

logging.basicConfig(level=logging.INFO, format='[%(levelname)s] | %(asctime)s | %(message)s')
logger = logging.getLogger(__name__)


def get_directories(bib_dir_name, output_dirname):
    current_path = os.path.dirname(os.path.abspath(__file__))
    output_dir = os.path.join(current_path, output_dirname)
    bib_dir_path = os.path.join(current_path, bib_dir_name)
    db_path = os.path.join(current_path, "doi_status.db")
    return output_dir, bib_dir_path, db_path


def create_directory_if_not_exists(directory):
    os.makedirs(directory, exist_ok=True)


def fetch_proxies():
    proxies = []
    try:
        response = requests.get(PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
        if proxies:
            logger.info(f"Fetched proxies: {proxies}")
            return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None
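
# Note: fetch_proxies() and fetch_proxy() assume the ipidea endpoint returns plain text
# with one "ip:port" entry per line, separated by "\r\n"; each entry is wrapped into a
# requests-style proxy dict. Both return None when the request fails or yields nothing,
# so callers should expect that case.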

def fetch_proxy():
    proxies = []
    try:
        response = requests.get(SINGLE_PROXY_SERVICE_URL)
        if response.status_code == 200:
            proxy_list = response.text.strip().split('\r\n')
            for proxy in proxy_list:
                proxies.append({
                    "http": f"http://{proxy}",
                    "https": f"http://{proxy}",
                })
        if proxies:
            logger.info(f"Fetched proxies: {proxies}")
            return proxies
    except Exception as e:
        logger.error(f"Error fetching proxies: {e}")
    return None


def read_dois_from_files(bib_dir_path):
    all_dois = []
    for bib_file_name in sorted(os.listdir(bib_dir_path)):
        if bib_file_name.endswith(".bib"):
            with open(os.path.join(bib_dir_path, bib_file_name), "r") as file:
                dois = DOI_PATTERN.findall(file.read())
                logger.info(f"{bib_file_name} has {len(dois)} doi(s)")
                all_dois.extend(dois)
    return list(set(all_dois))


def filter_downloaded_dois(all_dois, output_dir):
    for doi in os.listdir(output_dir):
        if doi.endswith(".pdf"):
            doi = doi.replace(".pdf", "").replace("_", "/")
            if doi in all_dois:
                all_dois.remove(doi)
    return all_dois


def read_dois_from_db(db_path, status):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    # Parameterized query instead of string interpolation.
    cursor.execute("SELECT doi FROM doi_status WHERE status = ?", (status,))
    dois = [row[0] for row in cursor.fetchall()]
    conn.close()
    return dois


def write_doi_to_db(db_path, doi, output_dirname, status):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute(
        "INSERT OR REPLACE INTO doi_status (doi, status, pdf_url) VALUES (?, ?, ?)",
        (doi, status, f"{output_dirname}/{doi.replace('/', '_')}.pdf")
    )
    conn.commit()
    conn.close()
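
# Note: because doi is the PRIMARY KEY and the insert uses INSERT OR REPLACE, re-running
# the script simply overwrites a DOI's previous row, so the table always holds the latest
# status and pdf_url for each DOI.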

def initialize_db(db_path):
    conn = sqlite3.connect(db_path)
    cursor = conn.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS doi_status (
            doi TEXT PRIMARY KEY,
            status TEXT,
            pdf_url TEXT
        )
    ''')
    conn.commit()
    cursor.execute("PRAGMA journal_mode=WAL")
    conn.commit()
    conn.close()
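
# Illustrative way to inspect the status database from a shell (not part of the script):
#   sqlite3 doi_status.db "SELECT status, COUNT(*) FROM doi_status GROUP BY status;"
# WAL journal mode is enabled above to reduce lock contention while the worker
# processes write their results concurrently.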

def download_doi(doi, output_dir, proxy, scihub_urls, db_path):
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    output_dirname = output_dir.split("/")[-1]
    for scihub_url in scihub_urls:
        output_path = os.path.join(output_dir, f"{doi.replace('/', '_')}.pdf")
        proxy_url = "https=" + proxy['https']

        try:
            result = subprocess.run(
                ['scidownl', 'download', '--doi', doi, '--out', output_path, '--scihub-url', scihub_url, '--proxy', proxy_url],
                check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True
            )
            logger.info(result.stderr)

            if "No pdf tag" in result.stderr:
                timeout_dois.append(doi)
                write_doi_to_db(db_path, doi, output_dirname, 'timeout')
                break
            elif "403" in result.stderr or "Unable to connect to proxy" in result.stderr or "504" in result.stderr or 'crawling_failed, error: HTTPSConnectionPool' in result.stderr:
                logger.warning("Proxy error detected, fetching new proxy.")
                proxy = fetch_proxy()[0]
                # time.sleep(2)
                continue
            elif result.stdout.strip() != '':
                try:
                    # Try to open the PDF to verify the download is not corrupted.
                    with pymupdf.open(output_path) as pdf:
                        logger.info(f"Downloaded {doi} successfully.")
                        write_doi_to_db(db_path, doi, output_dirname, 'success')
                        success_dois.append(doi)
                except Exception:
                    write_doi_to_db(db_path, doi, output_dirname, 'broken')
                    logger.info(f"{doi}.pdf is broken!")
                    broken_dois.append(doi)
                break
            else:
                write_doi_to_db(db_path, doi, output_dirname, 'failed')
                break

        except subprocess.CalledProcessError as e:
            logger.error(f"Error: {e}")
            failed_dois.append(doi)
            write_doi_to_db(db_path, doi, output_dirname, 'failed')
            continue

    return success_dois, broken_dois, failed_dois, timeout_dois
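
# Summary: download_doi() tries each Sci-Hub mirror in turn for a single DOI, classifies the
# outcome as 'success', 'broken' (the PDF cannot be opened), 'failed', or 'timeout' ("No pdf
# tag" in scidownl's output), records it in SQLite, and swaps in a fresh proxy whenever the
# current one reports a connection or HTTP error.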

def download_dois(all_dois, output_dir, db_path):
    success_dois, broken_dois, failed_dois, timeout_dois = [], [], [], []
    proxies = fetch_proxies()

    with concurrent.futures.ProcessPoolExecutor(max_workers=NUM_PROCESSES) as executor:
        futures = []
        for i, doi in enumerate(all_dois):
            proxy = proxies[i % len(proxies)]
            futures.append(executor.submit(download_doi, doi, output_dir, proxy, SCIHUB_URLS, db_path))

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures), desc='Downloading DOIs', unit='doi'):
            result = future.result()
            if result:
                success, broken, failed, timeout = result
                success_dois.extend(success)
                broken_dois.extend(broken)
                failed_dois.extend(failed)
                timeout_dois.extend(timeout)

    logger.info(f"Success: {len(success_dois)}, Broken: {len(broken_dois)}, Failed: {len(failed_dois)}, Timeout: {len(timeout_dois)}")

def main():
    bib_dir_name = "synthesis23-25"
    output_dirname = "synthesis23-25_pdfs"
    input('Are you sure the folders are {} and {}?'.format(bib_dir_name, output_dirname))
    output_dir, bib_dir_path, db_path = get_directories(bib_dir_name, output_dirname)
    create_directory_if_not_exists(output_dir)

    initialize_db(db_path)

    all_dois = read_dois_from_files(bib_dir_path)
    logger.info(f"Total {len(all_dois)} doi(s)")

    all_dois = filter_downloaded_dois(all_dois, output_dir)

    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'success')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'failed')]
    all_dois = [doi for doi in all_dois if doi not in read_dois_from_db(db_path, 'timeout')]

    download_dois(all_dois, output_dir, db_path)


if __name__ == "__main__":
    main()
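
# Typical invocation (the script resolves the .bib and output folders relative to its own
# location, so it can be run from anywhere):
#   python clean/stp2_down_ipidea_multi.py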