Files
datapipe/reparagraph.py
2025-01-17 23:23:07 +08:00

119 lines
4.4 KiB
Python
Executable File
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import re
import sys

from openai import OpenAI
# Connection settings for the OpenAI-compatible chat endpoint.
# Each value can be overridden with a DATAPIPE_* environment variable; the
# literals are fallbacks that keep the previous behaviour when nothing is set.
# SECURITY NOTE(review): the fallback API key below is committed in plain text.
# It should be rotated and supplied only through the environment.
OPENAI_BASE_URL = os.environ.get(
    "DATAPIPE_OPENAI_BASE_URL", "http://8.218.238.241:17935/v1"
)
OPENAI_API_KEY = os.environ.get(
    "DATAPIPE_OPENAI_API_KEY",
    "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2",
)
MODEL_NAME = os.environ.get("DATAPIPE_MODEL_NAME", "deepseek-chat")
def get_true_level(title_info: list, max_retries: int = 5):
    """Ask the chat model to fill in the hierarchy level of each heading.

    Args:
        title_info: JSON-serialisable list of heading records. It is dumped
            with ``json.dumps`` and substituted into the prompt.
        max_retries: Maximum number of request attempts.

    Returns:
        The value stored under the ``data`` key of the model's JSON reply on
        the first successful attempt, or the string ``"Error"`` when every
        attempt fails or when ``max_retries`` allows no attempt at all.
    """
    source_title = json.dumps(title_info)
    instruction = """
有如下的JSON格式的标题信息,已知他们的标题内容和行号请你在level字段给出正确的层级关系层级关系用数字(1,2,3,4)表示,数字越小,层级越高。
额外的层级关系说明本层级关系要求存在多个1级标题而非仅一个1级标题。
<PLACEHOLDER>
返回结果的时候严格遵守下列示例JSON格式:
{ 'data': [
{ 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
...
]
"""
    # Without this guard the loop body never runs and the function would fall
    # through and return None instead of the documented "Error" sentinel.
    if max_retries <= 0:
        return "Error"

    # Create the OpenAI client once and reuse it for every attempt.
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    prompt = instruction.replace("<PLACEHOLDER>", source_title)

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                stream=False,  # streaming disabled: the full reply is parsed at once
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                response_format={
                    'type': 'json_object'
                },
            )
            response = json.loads(completion.choices[0].message.content)
            return response['data']
        except Exception as e:
            # Best-effort retry: request failures, invalid JSON and a missing
            # 'data' key are all reported and retried the same way.
            print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")

    # Every attempt failed.
    return "Error"
def extract_headings(file_path):
    """Return every line of a Markdown file that starts with '#'.

    Each line is stripped of surrounding whitespace before the check, so
    indented headings are included. The result is a list of
    ``(line_number, stripped_line)`` tuples with 1-based line numbers.
    """
    with open(file_path, 'r', encoding='utf-8') as md_file:
        numbered = (
            (number, raw.strip()) for number, raw in enumerate(md_file, start=1)
        )
        return [(number, text) for number, text in numbered if text.startswith('#')]
def extract_references(file_path, headings):
    """Return the text of the references section of a Markdown file.

    The section starts at the first entry of ``headings`` whose text contains
    "REFERENCE" (case-insensitive) and runs up to, but not including, the next
    line that starts with '#', or to the end of the file. Returns ``None``
    when no such heading exists.
    """
    # Locate the first heading that mentions REFERENCE.
    match = next(
        ((number, text) for number, text in headings if "REFERENCE" in text.upper()),
        None,
    )
    if not match:
        return None

    first = match[0] - 1  # heading line numbers are 1-based; list indices are 0-based

    with open(file_path, 'r', encoding='utf-8') as md_file:
        lines = md_file.readlines()

    # The section ends at the next heading line, or at end of file if none follows.
    total = len(lines)
    stop = next(
        (
            position
            for position in range(first + 1, total)
            if lines[position].strip().startswith('#')
        ),
        total,
    )

    return ''.join(lines[first:stop])
def update_headings(file_path, heading_data):
    """Rewrite deep headings of a Markdown file as bold text, in place.

    Args:
        file_path: Path of the Markdown file to read and overwrite.
        heading_data: Iterable of dicts with at least ``line_num`` (1-based
            line number) and ``level`` (numeric heading level).

    Headings whose level is at or above the threshold are replaced by
    ``**title**``. The threshold is 3 when more than one level-1 heading is
    present in ``heading_data`` and 4 otherwise. Entries whose ``line_num``
    falls outside the file, or that point at a line which is not a heading,
    are skipped so that body text is never rewritten.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Several level-1 headings: demote level 3 and deeper; otherwise level 4 and deeper.
    count_level_1 = sum(1 for item in heading_data if item['level'] == 1)
    threshold = 3 if count_level_1 > 1 else 4

    for heading in heading_data:
        if heading['level'] < threshold:
            continue
        index = heading['line_num'] - 1
        # Guard against bad line numbers (a negative index would otherwise
        # silently wrap around to the end of the file).
        if not 0 <= index < len(lines):
            continue
        text = lines[index].strip()
        if not text.startswith('#'):
            continue
        # Remove only the heading markers: the leading run of '#' and an
        # optional closing run preceded by whitespace. Any '#' that is part
        # of the title itself (e.g. "Sample #1") is preserved.
        title = re.sub(r'\s+#+$', '', text.lstrip('#')).strip()
        lines[index] = "**" + title + "**\n"

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
if __name__ == "__main__":
    # Usage: python reparagraph.py [MARKDOWN_FILE]
    # The first command-line argument selects the file to process; without it
    # the original sample document is used.
    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Collect the headings and build the records the model is asked to level.
    headings = extract_headings(file_path)
    title_info = [
        {"title": heading, "line_num": line_num, "level": "unknown"}
        for line_num, heading in headings
    ]
    # Heading re-levelling is currently disabled; uncomment to enable it:
    # result = get_true_level(title_info)
    # update_headings(file_path, result)

    # Extract and print the references section.
    references = extract_references(file_path, headings)
    if references:
        print("提取的参考文献:")
        print(references)
    else:
        print("未找到参考文献部分")