First merge of the clean code
273
clean/preprocess_mineru.py
Normal file
@@ -0,0 +1,273 @@
import re
import os
import json
import copy
import requests
import time
import sqlite3
import PyPDF2
import multiprocessing
import mysql.connector

from loguru import logger
from glob import glob
from tqdm import tqdm

from magic_pdf.pipe.UNIPipe import UNIPipe
from magic_pdf.pipe.OCRPipe import OCRPipe
from magic_pdf.pipe.TXTPipe import TXTPipe
from magic_pdf.rw.DiskReaderWriter import DiskReaderWriter
import magic_pdf.model as model_config

model_config.__use_inside_model__ = True
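# With __use_inside_model__ set to True, pipe_analyze() below can fall back to magic_pdf's
# built-in model whenever no pre-computed model_json is passed to pdf_parse_main.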

# Image-hosting (imgbed) configuration
IMGBED_URL = "http://localhost:40027/"
# Make sure the imgbed URL ends with a trailing '/'
if not IMGBED_URL.endswith('/'):
    IMGBED_URL += '/'
token_endpoint = f"{IMGBED_URL}api/v1/tokens"
upload_endpoint = f"{IMGBED_URL}api/v1/upload"

# The token can be obtained as follows:
# curl -X POST http://localhost:40027/api/v1/tokens -H "Content-Type: application/json" -d '{"email":"yt.li2@siat.ac.cn", "password":"lyt20000414."}'
IMGBED_TOKEN = "6|QsBh5H7txY3Hd7ju1nzYKOBSdFQeL0YberydSFIH"

def replace_image_links(md_content: str, images_urls: dict) -> str:
    # Match Markdown image links of the form ![alt](path)
    pattern = r'!\[(.*?)\]\((.*?)\)'

    def replace_link(match):
        # Extract the image path from the current match
        image_path = match.group(2)
        # Check whether this path is in the dict
        if image_path in images_urls:
            # Get the new URL from the dict
            new_url = images_urls[image_path]
            return f"![{match.group(1)}]({new_url})"
        return match.group(0)

    # Replace all matches with re.sub
    updated_md_content = re.sub(pattern, replace_link, md_content)
    return updated_md_content
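
# Example (hypothetical names): with images_urls = {"images/fig_1.jpg": "http://localhost:40027/i/abc.jpg"},
# "![](images/fig_1.jpg)" in the markdown becomes "![](http://localhost:40027/i/abc.jpg)".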

# Upload images to LSKY Pro
def upload_image(img_dir):
    headers = {
        "Authorization": f"Bearer {IMGBED_TOKEN}",
        'Accept': 'application/json'
    }

    image_urls = {}
    os.makedirs(img_dir, exist_ok=True)
    img_names = os.listdir(img_dir)
    for image_name in img_names:
        retry = 0
        image_path = os.path.join(img_dir, image_name)
        while retry < 5:  # maximum number of retries
            try:
                with open(image_path, 'rb') as image_file:  # keep the file open while uploading
                    files = {'file': image_file}

                    # Upload the file
                    response = requests.post(upload_endpoint, headers=headers, files=files)
                    if response.status_code == 200:
                        result = response.json()
                        if result['status']:
                            image_url = result['data']['links']['url']
                            image_urls['images/' + image_name] = image_url
                            break  # upload succeeded, exit the retry loop
                        else:
                            raise Exception(f"Image upload failed: {result['message']}")
                    elif response.status_code == 429:
                        # 429 response: wait a while before retrying
                        wait_time = min(2 ** retry, 60)  # exponential backoff, wait at most 60 seconds
                        logger.warning(f"Too many requests, waiting {wait_time} seconds...")
                        time.sleep(wait_time)
                    else:
                        raise Exception(f"HTTP request error: {response.status_code}")

                    retry += 1  # increment the retry count
                    time.sleep(1)  # wait a moment after a failed attempt

            except FileNotFoundError:
                logger.error(f"File {image_path} does not exist, please check the path")
                return

    return image_urls
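
# The keys of the returned dict use the relative "images/<name>" form that the markdown refers to,
# so replace_image_links can look each local path up and swap in the hosted URL.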

def json_md_dump(
        pipe,
        md_writer,
        pdf_name,
        content_list,
        md_content,
):
    # Write the model results to model.json
    orig_model_list = copy.deepcopy(pipe.model_list)
    md_writer.write(
        content=json.dumps(orig_model_list, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_model.json"
    )

    # Write the intermediate results to middle.json
    md_writer.write(
        content=json.dumps(pipe.pdf_mid_data, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_middle.json"
    )

    # Write the text results to content_list.json
    md_writer.write(
        content=json.dumps(content_list, ensure_ascii=False, indent=4),
        path=f"{pdf_name}_content_list.json"
    )

    # Write the markdown result to the .md file
    md_writer.write(
        content=md_content,
        path=f"{pdf_name}.md"
    )
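
# All four dumps go through md_writer (a DiskReaderWriter rooted at the per-PDF output folder),
# so every PDF ends up with its own model.json / middle.json / content_list.json / .md set.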

def pdf_parse_main(
        pdf_path: str,
        parse_method: str = 'auto',
        model_json_path: str = None,
        is_json_md_dump: bool = True,
        output_dir: str = None
):
    """
    Run the pdf -> json/md conversion; by default the md and json files are written to the directory the pdf lives in.

    :param pdf_path: path to the .pdf file, relative or absolute
    :param parse_method: parsing method, one of auto, ocr, txt; default auto. If the result is poor, try ocr
    :param model_json_path: path to an existing model-inference result file; if empty, the built-in model is used. The pdf and model_json must correspond to each other
    :param is_json_md_dump: whether to write the parsed data to .json and .md files, default True. Data from the different stages is written to three separate .json files, and the markdown content is saved to a .md file
    :param output_dir: output directory; a folder named after the pdf file is created there and all results are saved inside it
    """
    try:
        pdf_name = os.path.basename(pdf_path).split("/")[-1].replace(".pdf", "")
        pdf_path_parent = os.path.dirname(pdf_path)

        if output_dir:
            output_path = os.path.join(output_dir, pdf_name)
        else:
            output_path = os.path.join(pdf_path_parent, pdf_name)

        output_image_path = os.path.join(output_path, 'images')

        # Take only the last path component so images are referenced with relative paths in the .md and content_list.json files
        image_path_parent = os.path.basename(output_image_path)

        pdf_bytes = open(pdf_path, "rb").read()  # read the pdf file's binary data

        if model_json_path:
            # Load the raw json model output (a list) for a pdf that has already been analyzed
            model_json = json.loads(open(model_json_path, "r", encoding="utf-8").read())
        else:
            model_json = []

        # Run the parsing steps
        # image_writer = DiskReaderWriter(output_image_path)
        image_writer, md_writer = DiskReaderWriter(output_image_path), DiskReaderWriter(output_path)

        # Select the parse method
        # jso_useful_key = {"_pdf_type": "", "model_list": model_json}
        # pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        if parse_method == "auto":
            jso_useful_key = {"_pdf_type": "", "model_list": model_json}
            pipe = UNIPipe(pdf_bytes, jso_useful_key, image_writer)
        elif parse_method == "txt":
            pipe = TXTPipe(pdf_bytes, model_json, image_writer)
        elif parse_method == "ocr":
            pipe = OCRPipe(pdf_bytes, model_json, image_writer)
        else:
            logger.error("unknown parse method, only auto, ocr, txt allowed")
            exit(1)

        # Classify the pdf
        pipe.pipe_classify()

        # If no model data was supplied, analyze with the built-in model
        if not model_json:
            if model_config.__use_inside_model__:
                pipe.pipe_analyze()  # analyze
            else:
                logger.error("need model list input")
                exit(1)

        # Run the parsing
        pipe.pipe_parse()

        # Build the text and md results
        content_list = pipe.pipe_mk_uni_format(image_path_parent, drop_mode="none")
        md_content = pipe.pipe_mk_markdown(image_path_parent, drop_mode="none")
        # Upload the images to the image host
        image_urls = upload_image(output_image_path)
        md_content = replace_image_links(md_content, image_urls)

        if is_json_md_dump:
            json_md_dump(pipe, md_writer, pdf_name, content_list, md_content)
        return 'success'

    except Exception as e:
        logger.exception(e)
        return 'error'

def init_worker(devices, pdfs, gpu_index):
    """
    Initialize a worker process to process a chunk of PDFs with a specific GPU.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_index)
    process_pdf_chunk(pdfs, gpu_index)


def process_pdf_chunk(pdf_paths, worker_id):
    for pdf_path in tqdm(pdf_paths, desc=f"Worker {worker_id} Progress"):
        try:
            with open(pdf_path, 'rb') as file:
                pdf_reader = PyPDF2.PdfReader(file)
                print(os.path.basename(pdf_path).replace(".pdf", "").replace('_', '/'))
                status = pdf_parse_main(pdf_path, parse_method='auto', output_dir=output_dir)
        except PyPDF2.errors.PdfReadError:
            logger.error(f"{pdf_path} has been broken")
        except Exception as e:
            logger.error(f"{pdf_path} has an error: {e}")

def multiprocessing_setup(pdf_paths, num_gpus):
    num_processes_per_gpu = 2
    chunk_size = len(pdf_paths) // (num_gpus * num_processes_per_gpu)
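    # The PDFs are split into num_gpus * num_processes_per_gpu contiguous chunks of chunk_size each;
    # the very last worker takes an open-ended slice (end_idx = None) so the remainder is not dropped.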
    processes = []

    # Create processes for each GPU
    for gpu_id in range(num_gpus):
        for process_id in range(num_processes_per_gpu):
            start_idx = (gpu_id * num_processes_per_gpu + process_id) * chunk_size
            end_idx = None if (gpu_id == num_gpus - 1 and process_id == num_processes_per_gpu - 1) else start_idx + chunk_size
            chunk = pdf_paths[start_idx:end_idx]

            p = multiprocessing.Process(target=init_worker, args=([gpu_id], chunk, gpu_id))
            processes.append(p)
            p.start()

    # Ensure all processes have completed
    for p in processes:
        p.join()

if __name__ == "__main__":
    _cur_dir = os.path.dirname(os.path.abspath(__file__))
    # Change the paths here
    pdf_dir = os.path.join(_cur_dir, "black_phosphorus_wulie/黑磷文献/黑磷文献-任务1-推荐官能团")
    output_dir = os.path.join(_cur_dir, "black_phosphorus_wulie/黑磷文献-任务1-推荐官能团_pdf2md")

    os.makedirs(output_dir, exist_ok=True)
    pdf_paths = sorted(glob(os.path.join(pdf_dir, "*.pdf")))

    print("Number of PDFs:", len(pdf_paths))

    # Number of GPUs
    num_gpus = 8

    # Setup multiprocessing to handle PDFs across multiple GPUs
    # multiprocessing_setup(pdf_paths, num_gpus)
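    # The multi-GPU run above is commented out; for now only the single hard-coded pdf_path below is processed.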

    pdf_path = "/home/ubuntu/sas0/LYT/paper_dataset/black_phosphorus_wulie/黑磷文献/黑磷文献-任务1-推荐官能团/(P-O,P-O-P)Supporting_information.pdf"
    pdf_parse_main(pdf_path, parse_method='auto', output_dir=output_dir)