# datapipe/reparagraph.py

import json
import os
import re

from openai import OpenAI


# Connection settings for the OpenAI-compatible chat endpoint used by
# get_true_level. Each value can be overridden with an environment variable;
# the literals are the defaults this module originally hard-coded.
# The variable names are deliberately project-specific so that a generic
# OPENAI_API_KEY present in the environment is never sent to this endpoint.
# NOTE(security): an API key is committed in source below. It should be
# rotated and supplied only through REPARAGRAPH_API_KEY.
OPENAI_BASE_URL = (
    os.environ.get("REPARAGRAPH_BASE_URL") or "http://8.218.238.241:17935/v1"
)
OPENAI_API_KEY = (
    os.environ.get("REPARAGRAPH_API_KEY")
    or "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2"
)
MODEL_NAME = os.environ.get("REPARAGRAPH_MODEL") or "deepseek-chat"

def get_true_level(title_info: list, max_retries: int = 5):
    """Ask the chat model to assign a hierarchy level to every heading.

    The heading records are serialised to JSON, embedded in a fixed prompt and
    sent to the configured model with JSON-object output requested. The
    request is repeated until a usable reply is parsed or the attempts run
    out.

    Args:
        title_info: JSON-serialisable list of heading records. The script
            section of this file passes dicts with ``title``, ``line_num``
            and ``level`` keys.
        max_retries: Maximum number of request attempts.

    Returns:
        The list stored under the ``data`` key of the model's JSON reply, or
        the string ``"Error"`` when every attempt failed or ``max_retries``
        is not positive.
    """
    if max_retries <= 0:
        # Without this guard the loop body never runs and the function would
        # fall through returning None instead of the failure sentinel.
        return "Error"

    # ensure_ascii=False keeps non-ASCII titles readable in the prompt instead
    # of turning them into \uXXXX escape sequences.
    source_title = json.dumps(title_info, ensure_ascii=False)
    instruction = """
    有如下的JSON格式的标题信息,已知他们的标题内容和行号，请你在level字段给出正确的层级关系，层级关系用数字(1,2,3,4)表示，数字越小，层级越高。
    额外的层级关系说明：本层级关系要求存在多个1级标题而非仅一个1级标题。
    <PLACEHOLDER>
    返回结果的时候严格遵守下列示例JSON格式:
    { 'data': [
        { 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
        ...
    ]
    """
    # The prompt is identical for every attempt, so build it once.
    prompt = instruction.replace("<PLACEHOLDER>", source_title)

    # Create the OpenAI client.
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                stream=False,  # streaming disabled
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                response_format={'type': 'json_object'},
            )

            response = json.loads(completion.choices[0].message.content)
            data = response['data']
            if not isinstance(data, list):
                # A non-list payload cannot be consumed as heading records;
                # treat it as a failed attempt and retry.
                raise ValueError(
                    f"'data' is {type(data).__name__}, expected a list"
                )
            return data

        except Exception as e:
            # Best-effort retry: any request, parsing or shape error is
            # reported and the next attempt is made.
            print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")

    return "Error"


def extract_headings(file_path):
    """Collect every line of a Markdown file that starts with ``#``.

    Lines are whitespace-stripped before the check, so indented headings are
    included. Returns a list of ``(line_number, stripped_line)`` tuples with
    1-based line numbers, in file order.
    """
    with open(file_path, 'r', encoding='utf-8') as md_file:
        numbered = ((idx, raw.strip()) for idx, raw in enumerate(md_file, 1))
        return [(idx, text) for idx, text in numbered if text.startswith('#')]

def extract_references(file_path, headings):
    """Return the text of the references section of a Markdown file.

    ``headings`` is an iterable of ``(line_number, heading_text)`` pairs with
    1-based line numbers. The first heading containing ``REFERENCE``
    (case-insensitive) marks the start of the section; it runs up to, but not
    including, the next line that starts with ``#`` after stripping, or to the
    end of the file. Returns ``None`` when no such heading exists (the file is
    not opened in that case).
    """
    # Locate the first heading that mentions REFERENCE.
    matched = next(
        ((num, text) for num, text in headings if "REFERENCE" in text.upper()),
        None,
    )
    if matched is None:
        return None

    begin = matched[0] - 1  # convert to a 0-based index

    with open(file_path, 'r', encoding='utf-8') as md_file:
        all_lines = md_file.readlines()

    # Stop at the next heading line, or at end of file when there is none.
    total = len(all_lines)
    stop = next(
        (
            idx
            for idx in range(begin + 1, total)
            if all_lines[idx].strip().startswith('#')
        ),
        total,
    )

    return ''.join(all_lines[begin:stop])

# Matches the leading run of '#' heading markers (with surrounding
# whitespace) or a trailing closing run that is preceded by whitespace.
# A '#' inside the title text, e.g. "C#", is left untouched.
_HEADING_MARKERS = re.compile(r'^\s*#+\s*|\s+#+\s*$')


def update_headings(file_path, heading_data):
    """Rewrite deep headings of a Markdown file as bold text, in place.

    Args:
        file_path: Path of the Markdown file to modify.
        heading_data: Iterable of dicts with an integer ``level`` and a
            1-based ``line_num``, as returned by ``get_true_level``.

    Headings whose level reaches the threshold are replaced by
    ``**title**``. The threshold is 3 when more than one level-1 heading
    exists and 4 otherwise. Other lines are written back unchanged.

    Raises:
        TypeError: If ``heading_data`` is a string, such as the ``"Error"``
            sentinel returned by ``get_true_level`` on failure. The file is
            not touched in that case.
    """
    if isinstance(heading_data, str):
        raise TypeError(
            f"heading_data must be a list of heading dicts, got {heading_data!r}"
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Count the level-1 headings to choose the demotion threshold:
    # 3 when several level-1 headings exist, otherwise 4.
    level_one_count = sum(1 for item in heading_data if item['level'] == 1)
    demote_from = 3 if level_one_count > 1 else 4

    for heading in heading_data:
        if heading['level'] >= demote_from:
            idx = heading['line_num'] - 1
            # Strip only the heading markers; a blanket replace("#", "")
            # would also delete '#' characters that belong to the title.
            title = _HEADING_MARKERS.sub('', lines[idx]).strip()
            lines[idx] = f"**{title}**\n"

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)

if __name__ == "__main__":
    import sys

    # Sample document used when no path is given. Pass a Markdown path as the
    # first command-line argument to process a different file.
    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Extract the headings and build the records expected by get_true_level.
    headings = extract_headings(file_path)
    title_info = [{"title": heading, "line_num": line_num, "level": "unknown"}
                  for line_num, heading in headings]
    # Left disabled as in the original script: enabling these two lines calls
    # the remote model and rewrites file_path in place.
    # result = get_true_level(title_info)
    # update_headings(file_path, result)

    # Extract and print the references section.
    references = extract_references(file_path, headings)
    if references:
        print("提取的参考文献：")
        print(references)
    else:
        print("未找到参考文献部分")