# datapipe/clean/preprocess_mineru.py

import re
import os
import json
import copy
import requests
import time
import sqlite3
import PyPDF2
import multiprocessing
import mysql.connector
from loguru import logger
from glob import glob
from tqdm import tqdm
from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config
model_config.__use_inside_model__ = True
# Image bed (LSKY Pro) configuration
IMGBED_URL = "http://localhost:40027/"
# Make sure the image bed URL ends with '/'
if not IMGBED_URL.endswith('/'):
    IMGBED_URL += '/'
token_endpoint = f"{IMGBED_URL}api/v1/tokens"
upload_endpoint = f"{IMGBED_URL}api/v1/upload"
# A token can be obtained as follows:
# curl -X POST http://localhost:40027/api/v1/tokens -H "Content-Type: application/json" -d '{"email":"yt.li2@siat.ac.cn", "password":"lyt20000414."}'
IMGBED_TOKEN = "6|QsBh5H7txY3Hd7ju1nzYKOBSdFQeL0YberydSFIH"
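# A minimal sketch of fetching the token programmatically instead of
# hard-coding it; this assumes the LSKY Pro v1 response carries the token
# under data["token"], which the curl command above does not confirm:
#
#     def fetch_imgbed_token(email: str, password: str) -> str:
#         resp = requests.post(token_endpoint, json={"email": email, "password": password})
#         resp.raise_for_status()
#         return resp.json()["data"]["token"]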
def replace_image_links(md_content: str, images_urls: dict) -> str:
    # Match Markdown image links of the form ![alt text](image_path)
    pattern = r'!\[(.*?)\]\((.*?)\)'

    def replace_link(match):
        # Extract the image path from the current match
        image_path = match.group(2)
        # If the path has an uploaded counterpart, swap in the new URL
        if image_path in images_urls:
            new_url = images_urls[image_path]
            return f"![]({new_url})"
        return match.group(0)

    # Apply the replacement across the whole document
    updated_md_content = re.sub(pattern, replace_link, md_content)
    return updated_md_content
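# Example: with images_urls = {"images/fig1.png": "http://host/i/fig1.png"}
# (URL illustrative), "![Figure 1](images/fig1.png)" becomes
# "![](http://host/i/fig1.png)"; the alt text is intentionally dropped, and
# links whose path is not in the mapping are left untouched.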
# Upload images to LSKY Pro
def upload_image(img_dir):
    headers = {
        "Authorization": f"Bearer {IMGBED_TOKEN}",
        'Accept': 'application/json'
    }
    image_urls = {}
    os.makedirs(img_dir, exist_ok=True)
    img_names = os.listdir(img_dir)
    for image_name in img_names:
        retry = 0
        image_path = os.path.join(img_dir, image_name)
        while retry < 5:  # maximum number of retries
            try:
                with open(image_path, 'rb') as image_file:  # keep the file open during the upload
                    files = {'file': image_file}
                    # Upload the file
                    response = requests.post(upload_endpoint, headers=headers, files=files)
                if response.status_code == 200:
                    result = response.json()
                    if result['status']:
                        image_url = result['data']['links']['url']
                        image_urls['images/' + image_name] = image_url
                        break  # upload succeeded, leave the retry loop
                    else:
                        raise Exception(f"Image upload failed: {result['message']}")
                elif response.status_code == 429:
                    # On 429, wait a while before retrying
                    wait_time = min(2 ** retry, 60)  # exponential backoff, capped at 60 seconds
                    logger.warning(f"Too many requests, waiting {wait_time} seconds...")
                    time.sleep(wait_time)
                else:
                    raise Exception(f"HTTP request failed: {response.status_code}")
                retry += 1  # increase the retry count
                time.sleep(1)  # brief pause before the next attempt
            except FileNotFoundError:
                logger.error(f"File {image_path} does not exist; check the path")
                return image_urls  # give back what has been uploaded so far
    return image_urls
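# The returned dict maps the relative path that MinerU writes into the
# markdown ("images/<file name>") to the public image bed URL, which is the
# exact key format replace_image_links() looks up.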
def json_md_dump(
    pipe,
    md_writer,
    pdf_name,
    content_list,
    md_content,
):
    # Write the model output to model.json
    orig_model_list = copy.deepcopy(pipe.model_list)
    md_writer.write(
        content=json.dumps(orig_model_list, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_model.json"
    )
    # Write intermediate results to middle.json
    md_writer.write(
        content=json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_middle.json"
    )
    # Write the text results to content_list.json
    md_writer.write(
        content=json.dumps(content_list, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_content_list.json"
    )
    # Write the markdown content to a .md file
    md_writer.write(
        content=md_content,
        path=f"{pdf_name}.md"
    )
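# For a PDF named foo.pdf, the writer above produces in the output folder:
#     foo_model.json         raw model output
#     foo_middle.json        intermediate pipeline data
#     foo_content_list.json  unified text blocks
#     foo.md                 final markdown with image bed links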
def pdf_parse_main(
    pdf_path: str,
    parse_method: str = 'auto',
    model_json_path: str = None,
    is_json_md_dump: bool = True,
    output_dir: str = None
):
    """
    Convert a PDF to json and md; outputs are written to the PDF's directory (or output_dir).

    :param pdf_path: path to the .pdf file, relative or absolute
    :param parse_method: parsing method, one of auto / ocr / txt; defaults to auto. Try ocr if the results are poor.
    :param model_json_path: path to existing model output. If empty, the built-in model is used; the pdf and model_json must correspond.
    :param is_json_md_dump: whether to dump the parsed data to .json and .md files, default True. Data from the different stages is written to three separate .json files, and the markdown content to a .md file.
    :param output_dir: output directory; a folder named after the PDF is created there to hold all results.
    """
    try:
        pdf_name = os.path.basename(pdf_path).split("/")[-1].replace(".pdf", "")
        pdf_path_parent = os.path.dirname(pdf_path)

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            output_path = os.path.join(pdf_path_parent, pdf_name)

        output_image_path = os.path.join(output_path, 'images')
        # Parent of the image directory, used as the relative path stored in the .md and content_list.json files
        image_path_parent = os.path.basename(output_image_path)

        pdf_bytes = open(pdf_path, "rb").read()  # read the raw bytes of the PDF

        if model_json_path:
            # Load the raw json output (a list) of a PDF that was already parsed by the model
            model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
        else:
            model_json = []

        # Set up the writers for the parsing step
        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

        # Choose the parsing method
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            exit(1)

        # Classify the document
        pipe.pipe_classify()

        # If no model data was passed in, parse with the built-in model
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()  # model inference
            else:
                logger.error("need model list input")
                exit(1)

        # Parse
        pipe.pipe_parse()

        # Produce the text and md results
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")

        # Upload the images to the image bed and rewrite the links
        image_urls = upload_image(output_image_path)
        md_content = replace_image_links(md_content, image_urls)

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
        return 'success'
    except Exception as e:
        logger.exception(e)
        return 'error'
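# A usage sketch (file names hypothetical) that reuses a previously dumped
# model result so the built-in model inference is skipped:
#
#     pdf_parse_main("paper.pdf", parse_method="auto",
#                    model_json_path="paper_model.json", output_dir="out")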
def init_worker(devices, pdfs, gpu_index):
    """
    Initialize a worker process to process a chunk of PDFs with a specific GPU.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_index)
    process_pdf_chunk(pdfs, gpu_index)

def process_pdf_chunk(pdf_paths, worker_id):
    # Note: output_dir is the module-level global set in the __main__ block;
    # the workers inherit it when the processes are forked.
    for pdf_path in tqdm(pdf_paths, desc=f"Worker {worker_id} Progress"):
        try:
            # Open the PDF once with PyPDF2 to weed out broken files early
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
            print(os.path.basename(pdf_path).replace(".pdf", "").replace('_', '/'))
            status = pdf_parse_main(pdf_path, parse_method='auto', output_dir=output_dir)
        except PyPDF2.errors.PdfReadError:
            logger.error(f"{pdf_path} is broken")
        except Exception as e:
            logger.error(f"{pdf_path} has an error: {e}")
def multiprocessing_setup(pdf_paths, num_gpus):
    num_processes_per_gpu = 2
    chunk_size = len(pdf_paths) // (num_gpus * num_processes_per_gpu)
    processes = []

    # Create processes for each GPU
    for gpu_id in range(num_gpus):
        for process_id in range(num_processes_per_gpu):
            start_idx = (gpu_id * num_processes_per_gpu + process_id) * chunk_size
            end_idx = None if (gpu_id == num_gpus - 1 and process_id == num_processes_per_gpu - 1) else start_idx + chunk_size
            chunk = pdf_paths[start_idx:end_idx]
            p = multiprocessing.Process(target=init_worker, args=([gpu_id], chunk, gpu_id))
            processes.append(p)
            p.start()

    # Ensure all processes have completed
    for p in processes:
        p.join()
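# Worked example of the chunking above: with 100 PDFs, num_gpus = 2 and
# num_processes_per_gpu = 2, chunk_size = 100 // 4 = 25, and the workers get
# the slices [0:25], [25:50], [50:75] and [75:None]; the trailing None lets
# the very last worker absorb any remainder left by the integer division.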
if __name__ == "__main__":
    _cur_dir = os.path.dirname(os.path.abspath(__file__))
    # Change the paths here
    pdf_dir = os.path.join(_cur_dir, "black_phosphorus_wulie/黑磷文献/黑磷文献-任务1-推荐官能团")
    output_dir = os.path.join(_cur_dir, "black_phosphorus_wulie/黑磷文献-任务1-推荐官能团_pdf2md")
    os.makedirs(output_dir, exist_ok=True)
    pdf_paths = sorted(glob(os.path.join(pdf_dir, "*.pdf")))
    print("number of PDFs:", len(pdf_paths))
    # Number of GPUs
    num_gpus = 8
    # Set up multiprocessing to handle PDFs across multiple GPUs
    # multiprocessing_setup(pdf_paths, num_gpus)
    pdf_path = "/home/ubuntu/sas0/LYT/paper_dataset/black_phosphorus_wulie/黑磷文献/黑磷文献-任务1-推荐官能团/P-O,P-O-PSupporting_information.pdf"
    pdf_parse_main(pdf_path, parse_method='auto', output_dir=output_dir)