first update
This commit is contained in:
118
reparagraph.py
Executable file
118
reparagraph.py
Executable file
@@ -0,0 +1,118 @@
|
||||
import json
import os
import re

from openai import OpenAI
|
||||
|
||||
|
||||
# Connection settings for the OpenAI-compatible chat endpoint.
# Every value can be overridden through an environment variable; the literals
# are kept only as backward-compatible fallbacks. The variable names are
# specific to this tool on purpose, so that a generic OPENAI_API_KEY present in
# the environment is never sent to the custom endpoint below.
# SECURITY(review): the fallback API key is a credential committed to source
# control -- rotate it and remove the literal once REPARAGRAPH_API_KEY is set
# wherever this script runs.
OPENAI_BASE_URL = os.environ.get(
    "REPARAGRAPH_BASE_URL", "http://8.218.238.241:17935/v1"
)
OPENAI_API_KEY = os.environ.get(
    "REPARAGRAPH_API_KEY",
    "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2",
)
MODEL_NAME = os.environ.get("REPARAGRAPH_MODEL_NAME", "deepseek-chat")
|
||||
|
||||
def get_true_level(title_info: list, max_retries: int = 5):
    """Ask the chat model to assign a hierarchy level to every heading.

    The heading records are serialised to JSON, embedded in a fixed prompt and
    sent to the endpoint configured by the module-level constants. The request
    is repeated until a usable reply arrives or ``max_retries`` is exhausted.

    Args:
        title_info: JSON-serialisable list of heading records. The caller in
            this file passes dicts with ``title``, ``line_num`` and ``level``
            keys.
        max_retries: Maximum number of requests to attempt.

    Returns:
        The list stored under the ``data`` key of the model's JSON reply, or
        the string ``"Error"`` when no attempt produced a usable reply
        (including when ``max_retries`` is zero or negative).
    """
    # Keep non-ASCII titles readable in the prompt instead of \uXXXX escapes.
    source_title = json.dumps(title_info, ensure_ascii=False)
    instruction = """
有如下的JSON格式的标题信息,已知他们的标题内容和行号,请你在level字段给出正确的层级关系,层级关系用数字(1,2,3,4)表示,数字越小,层级越高。
额外的层级关系说明:本层级关系要求存在多个1级标题而非仅一个1级标题。
<PLACEHOLDER>
返回结果的时候严格遵守下列示例JSON格式:
{ 'data': [
{ 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
...
]
"""
    prompt = instruction.replace("<PLACEHOLDER>", source_title)

    # Create the OpenAI client once and reuse it for every attempt.
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                stream=False,  # streaming disabled: the whole reply is parsed at once
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                response_format={
                    'type': 'json_object'
                },
            )

            payload = json.loads(completion.choices[0].message.content)
            levels = payload['data']
            if not isinstance(levels, list):
                # A non-list payload cannot be consumed by callers; retry.
                raise ValueError(
                    f"'data' is {type(levels).__name__}, expected a list"
                )
            return levels
        except Exception as e:
            # Deliberately broad: this is the retry boundary for transport
            # errors, malformed JSON and replies missing the expected shape.
            print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")

    # Every attempt failed (or none was made): return the sentinel value.
    return "Error"
|
||||
|
||||
|
||||
def extract_headings(file_path):
    """Collect every Markdown heading line together with its line number.

    A heading is any line whose first non-blank character is ``#``. Lines
    inside fenced code blocks (delimited by ``` or ~~~) are skipped, so that
    ``#`` comments in embedded code are not mistaken for headings.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to scan.

    Returns:
        A list of ``(line_num, text)`` tuples in file order, where
        ``line_num`` is 1-based and ``text`` is the line with surrounding
        whitespace removed.
    """
    headings = []
    open_fence = None  # three-character marker of the fence we are inside
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_num, line in enumerate(file, 1):
            text = line.strip()
            if open_fence is not None:
                # Inside a code block: only look for the matching closer.
                if text.startswith(open_fence):
                    open_fence = None
                continue
            if text.startswith(('```', '~~~')):
                open_fence = text[:3]
                continue
            if text.startswith('#'):
                headings.append((line_num, text))
    return headings
|
||||
|
||||
def extract_references(file_path, headings):
    """Return the references section of a Markdown file as one string.

    The section starts at the first heading whose text contains "reference"
    (case-insensitive) and runs up to, but not including, the next line that
    starts with ``#`` after stripping, or to the end of the file.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file.
        headings: Iterable of ``(line_num, text)`` pairs with 1-based line
            numbers, as produced by ``extract_headings``.

    Returns:
        The section text including its heading line, or ``None`` when no
        heading mentions references.
    """
    # First heading that mentions REFERENCE, if there is one.
    match = next(
        ((num, text) for num, text in headings if "REFERENCE" in text.upper()),
        None,
    )
    if not match:
        return None

    first = match[0] - 1  # convert the 1-based line number to a 0-based index

    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.readlines()

    # The section ends at the next heading-like line, or at end of file.
    stop = next(
        (
            idx
            for idx in range(first + 1, len(content))
            if re.match(r'^#', content[idx].strip())
        ),
        len(content),
    )

    return ''.join(content[first:stop])
|
||||
|
||||
def update_headings(file_path, heading_data):
    """Demote deep headings of a Markdown file to bold text, in place.

    Headings whose level reaches the threshold are rewritten from
    ``### Title`` to ``**Title**``. The threshold is 3 when ``heading_data``
    contains more than one level-1 heading and 4 otherwise.

    Only the heading markers are removed (the leading run of ``#`` and an
    optional closing run); ``#`` characters inside the title are preserved.
    Entries whose ``line_num`` falls outside the file, or whose target line is
    not a heading, are ignored rather than corrupting unrelated lines.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to rewrite.
        heading_data: Sequence of dicts with a 1-based ``line_num`` and a
            numeric ``level``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Count the level-1 headings to pick the demotion threshold.
    count_level_1 = sum(1 for item in heading_data if item['level'] == 1)
    flag = 3 if count_level_1 > 1 else 4  # 3 with several level-1 headings, else 4

    for heading in heading_data:
        if heading['level'] < flag:
            continue
        index = heading['line_num'] - 1
        if not 0 <= index < len(lines):
            continue  # line number does not exist in this file
        text = lines[index].strip()
        if not text.startswith('#'):
            continue  # target line is not a heading; leave it untouched
        body = text.lstrip('#').strip()
        body = re.sub(r'\s+#+$', '', body)  # drop an optional closing "##" run
        lines[index] = "**" + body + "**\n"

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Sample document used when no path is given; pass a Markdown file path as
    # the first command-line argument to process a different file.
    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Extract the headings and build the records sent to the model.
    headings = extract_headings(file_path)
    title_info = [{"title": heading, "line_num": line_num, "level": "unknown"}
                  for line_num, heading in headings]
    # Model-based re-levelling of the headings is currently disabled:
    # result = get_true_level(title_info)
    # update_headings(file_path, result)

    # Extract and print the references section.
    references = extract_references(file_path, headings)
    if references:
        print("提取的参考文献:")
        print(references)
    else:
        print("未找到参考文献部分")
|
||||
Reference in New Issue
Block a user