119 lines
4.4 KiB
Python
Executable File
119 lines
4.4 KiB
Python
Executable File
import json
import os
import re

from openai import OpenAI
|
||
|
||
|
||
# Connection settings used by get_true_level(). Each value can be overridden
# through an environment variable; the literals are the fallbacks, so running
# the script with no environment set behaves exactly as before.
#
# The variable names are deliberately script-specific rather than the generic
# OPENAI_* names, so a real OpenAI key already present in the environment is
# never sent to this custom endpoint by accident.
OPENAI_BASE_URL = os.environ.get(
    "TITLE_LEVEL_BASE_URL", "http://8.218.238.241:17935/v1"
)
# NOTE(review): a credential is committed in source here. It is kept only as a
# fallback for backward compatibility; it should be rotated and supplied via
# TITLE_LEVEL_API_KEY instead.
OPENAI_API_KEY = os.environ.get(
    "TITLE_LEVEL_API_KEY",
    "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2",
)
MODEL_NAME = os.environ.get("TITLE_LEVEL_MODEL", "deepseek-chat")
|
||
|
||
def get_true_level(title_info: list, max_retries: int = 5):
    """Ask the chat model to assign a hierarchy level to every heading.

    The heading records are serialised to JSON, embedded in a fixed prompt,
    and sent to the model configured by ``MODEL_NAME`` with JSON-object
    output requested. The request is retried on any failure.

    Args:
        title_info: JSON-serialisable list of heading records. The caller in
            this file passes dicts with ``title``, ``line_num`` and ``level``
            keys.
        max_retries: Maximum number of request attempts.

    Returns:
        The value stored under the ``data`` key of the model's JSON reply,
        or the string ``"Error"`` when no attempt succeeded (including when
        ``max_retries`` is zero or negative).
    """
    # ensure_ascii=False keeps non-ASCII titles as readable text in the
    # prompt instead of \uXXXX escapes.
    source_title = json.dumps(title_info, ensure_ascii=False)
    # The example object at the end of the prompt is closed with "}" so the
    # model is shown a well-formed structure.
    instruction = """
有如下的JSON格式的标题信息,已知他们的标题内容和行号,请你在level字段给出正确的层级关系,层级关系用数字(1,2,3,4)表示,数字越小,层级越高。
额外的层级关系说明:本层级关系要求存在多个1级标题而非仅一个1级标题。
<PLACEHOLDER>
返回结果的时候严格遵守下列示例JSON格式:
{ 'data': [
{ 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
...
]
}
"""
    # The prompt does not change between attempts, so build it once.
    prompt = instruction.replace("<PLACEHOLDER>", source_title)

    # Create the OpenAI client.
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                stream=False,  # Streaming disabled: the full reply is parsed at once.
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                response_format={
                    'type': 'json_object'
                },
            )

            response = json.loads(completion.choices[0].message.content)
            return response['data']
        except Exception as e:
            # Deliberately broad: transport errors, malformed JSON and a
            # missing 'data' key are all treated as a failed attempt and
            # retried. (json.JSONDecodeError is already an Exception.)
            print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")

    # Every attempt failed, or none was made.
    return "Error"
|
||
|
||
|
||
def extract_headings(file_path):
    """Collect every line of a Markdown file that starts with ``#``.

    Each line is stripped of surrounding whitespace before it is tested, and
    the stripped text is what gets returned.

    Returns:
        A list of ``(line_number, stripped_line)`` tuples in file order, with
        line numbers starting at 1.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        numbered = ((idx, raw.strip()) for idx, raw in enumerate(handle, 1))
        # Materialise inside the ``with`` so the file is still open.
        return [(idx, text) for idx, text in numbered if text.startswith('#')]
|
||
|
||
def extract_references(file_path, headings):
    """Return the text of the references section of a Markdown file.

    The section starts at the first entry of ``headings`` whose text contains
    "REFERENCE" (case-insensitive) and runs up to, but not including, the
    next line that starts with ``#`` after stripping, or to the end of the
    file.

    Args:
        file_path: Path of the Markdown file to read.
        headings: Iterable of ``(line_number, heading_text)`` pairs with
            1-based line numbers, as produced by ``extract_headings``.

    Returns:
        The section (heading line included) as one string, or ``None`` when
        no heading mentions "REFERENCE". The file is not opened in that case.
    """
    # Line number of the first heading that mentions REFERENCE.
    ref_line = next(
        (num for num, text in headings if "REFERENCE" in text.upper()),
        None,
    )
    if ref_line is None:
        return None

    start = ref_line - 1  # Convert to a 0-based index.

    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.readlines()

    # Index of the next heading after the section start, else end of file.
    total = len(content)
    end = next(
        (
            idx
            for idx in range(start + 1, total)
            if content[idx].strip().startswith('#')
        ),
        total,
    )

    return ''.join(content[start:end])
|
||
|
||
def update_headings(file_path, heading_data):
    """Rewrite deep headings of a Markdown file as bold text, in place.

    Headings whose level is at or below the cut-off depth lose their leading
    ``#`` marker and are wrapped in ``**...**``. Only the leading marker is
    removed, so a ``#`` inside the title text (e.g. "C#") is preserved.

    Args:
        file_path: Path of the Markdown file; it is read and then overwritten.
        heading_data: Iterable of dicts with a 1-based ``line_num`` and a
            numeric ``level``, as returned by ``get_true_level``.

    Raises:
        TypeError: If ``heading_data`` is a string, such as the ``"Error"``
            sentinel that ``get_true_level`` returns on failure. This is
            raised before the file is touched.
    """
    if isinstance(heading_data, str):
        raise TypeError(
            f"heading_data must be a list of heading dicts, got {heading_data!r}"
        )

    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Count the entries labelled as level 1.
    count_level_1 = sum(1 for item in heading_data if item['level'] == 1)
    # With several level-1 headings, level 3 and deeper are converted;
    # otherwise only level 4 and deeper.
    threshold = 3 if count_level_1 > 1 else 4

    for heading in heading_data:
        if heading['level'] < threshold:
            continue
        idx = heading['line_num'] - 1  # Convert to a 0-based index.
        # Strip only the leading '#' run, not every '#' in the line.
        text = lines[idx].strip().lstrip('#').strip()
        lines[idx] = "**" + text + "**\n"

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
|
||
|
||
if __name__ == "__main__":
    import sys

    # Markdown file to process. A path given as the first command-line
    # argument takes precedence; with no argument the original hard-coded
    # document is used, so existing invocations behave as before.
    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Extract the headings and build the records that get_true_level expects.
    headings = extract_headings(file_path)
    title_info = [
        {"title": heading, "line_num": line_num, "level": "unknown"}
        for line_num, heading in headings
    ]
    # The relabel-and-rewrite step is currently disabled; title_info is only
    # consumed by these two calls.
    # result = get_true_level(title_info)
    # update_headings(file_path, result)

    # Extract and print the references section.
    references = extract_references(file_path, headings)
    if references:
        print("提取的参考文献:")
        print(references)
    else:
        print("未找到参考文献部分")
|