first update

This commit is contained in:
2025-01-17 23:23:07 +08:00
parent 6bb4fb5a00
commit 46cfd0296a
4 changed files with 375 additions and 0 deletions

118
reparagraph.py Executable file
View File

@@ -0,0 +1,118 @@
import json
import os
import re

from openai import OpenAI
# Connection settings for the OpenAI-compatible chat endpoint.
#
# Each value can be overridden with a script-specific environment variable.
# The names are deliberately NOT the generic OPENAI_* ones, so that a real
# OpenAI key already exported in the user's shell is never sent to this
# third-party endpoint by accident.
OPENAI_BASE_URL = os.environ.get(
    "REPARAGRAPH_BASE_URL", "http://8.218.238.241:17935/v1"
)
# SECURITY(review): the fallback below is a credential committed to source
# control. It is kept only so existing runs behave the same; rotate the key
# and supply it through REPARAGRAPH_API_KEY instead, then drop the default.
OPENAI_API_KEY = os.environ.get(
    "REPARAGRAPH_API_KEY",
    "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2",
)
MODEL_NAME = os.environ.get("REPARAGRAPH_MODEL", "deepseek-chat")
def get_true_level(title_info: list, max_retries: int = 5):
    """Ask the chat model to assign a hierarchy level to each heading.

    The heading records are serialised to JSON, embedded in a fixed prompt
    and sent to the configured chat-completions endpoint in JSON-object mode.

    Args:
        title_info: JSON-serialisable list of heading records. The script in
            this file passes dicts with ``title``, ``line_num`` and ``level``
            keys.
        max_retries: Maximum number of request attempts.

    Returns:
        The value stored under the ``data`` key of the model's JSON reply,
        or the string ``"Error"`` when no attempt succeeds (including the
        case where ``max_retries`` is zero or negative).
    """
    # ensure_ascii=False keeps non-ASCII titles as real characters in the
    # prompt instead of \uXXXX escapes.
    source_title = json.dumps(title_info, ensure_ascii=False)
    instruction = """
有如下的JSON格式的标题信息,已知他们的标题内容和行号请你在level字段给出正确的层级关系层级关系用数字(1,2,3,4)表示,数字越小,层级越高。
额外的层级关系说明本层级关系要求存在多个1级标题而非仅一个1级标题。
<PLACEHOLDER>
返回结果的时候严格遵守下列示例JSON格式:
{ 'data': [
{ 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
...
]
"""
    prompt = instruction.replace("<PLACEHOLDER>", source_title)

    # Create the OpenAI client once and reuse it for every attempt.
    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)

    for attempt in range(max_retries):
        try:
            completion = client.chat.completions.create(
                model=MODEL_NAME,
                stream=False,  # streaming disabled: the whole reply is parsed at once
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt},
                ],
                response_format={
                    'type': 'json_object'
                },
            )
            response = json.loads(completion.choices[0].message.content)
            return response['data']
        except Exception as e:
            # Deliberately broad: transport errors, malformed JSON
            # (json.JSONDecodeError is an Exception subclass) and a missing
            # 'data' key are all treated as "retry".
            print(f"尝试 {attempt + 1}/{max_retries} 失败: {e}")

    # Every attempt failed, or none was made; callers get the sentinel string.
    return "Error"
def extract_headings(file_path):
    """Collect every line of a Markdown file that begins with ``#``.

    Each line is stripped of surrounding whitespace before the check, so
    indented ``#`` lines are included as well.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to scan.

    Returns:
        A list of ``(line_number, stripped_line)`` tuples in file order,
        with line numbers starting at 1.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        numbered = ((number, raw.strip()) for number, raw in enumerate(handle, 1))
        return [(number, text) for number, text in numbered if text.startswith('#')]
def extract_references(file_path, headings):
    """Return the text of the references section of a Markdown file.

    The section starts at the first heading whose text contains
    ``REFERENCE`` (case-insensitive) and runs up to, but not including, the
    next line starting with ``#``, or to the end of the file.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file.
        headings: Iterable of ``(line_number, heading_text)`` pairs with
            1-based line numbers, as produced by ``extract_headings``.

    Returns:
        The section (heading line included) as one string, or ``None`` when
        no heading mentions ``REFERENCE``.
    """
    # Locate the first heading that mentions REFERENCE.
    heading_line = next(
        (number for number, text in headings if "REFERENCE" in text.upper()),
        None,
    )
    if heading_line is None:
        return None

    begin = heading_line - 1  # convert to a 0-based index

    with open(file_path, 'r', encoding='utf-8') as handle:
        content = handle.readlines()

    # The section ends at the next heading line, or at end of file.
    total = len(content)
    finish = next(
        (
            index
            for index in range(begin + 1, total)
            if content[index].strip().startswith('#')
        ),
        total,
    )

    return ''.join(content[begin:finish])
def update_headings(file_path, heading_data):
    """Demote deep headings of a Markdown file to bold text, in place.

    Every entry whose ``level`` reaches the demotion threshold has its line
    rewritten as ``**title**``. Only the heading markers are removed: the
    leading ``#`` run and an optional closing `` #`` run. A ``#`` inside the
    title text (for example "C# notes") is preserved.

    Args:
        file_path: Path of the UTF-8 encoded Markdown file to rewrite.
        heading_data: Iterable of dicts with a 1-based ``line_num`` and a
            numeric ``level``.

    Entries whose ``line_num`` falls outside the file are skipped, because
    the data may come from a language model and cannot be trusted to match
    the file.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Count the level-1 entries. With more than one of them, levels 3 and
    # deeper are demoted; otherwise only levels 4 and deeper.
    count_level_1 = sum(1 for item in heading_data if item['level'] == 1)
    flag = 3 if count_level_1 > 1 else 4

    for heading in heading_data:
        if heading['level'] < flag:
            continue
        index = heading['line_num'] - 1
        # Guard against bad line numbers; a negative index would otherwise
        # silently rewrite a line counted from the end of the file.
        if not 0 <= index < len(lines):
            continue
        title = lines[index].strip().lstrip('#')
        # Drop an optional closing marker such as the trailing " ##".
        title = re.sub(r'\s+#+$', '', title).strip()
        lines[index] = f"**{title}**\n"

    with open(file_path, 'w', encoding='utf-8') as file:
        file.writelines(lines)
if __name__ == "__main__":
    import sys

    # The Markdown file to process is taken from the first command-line
    # argument; without one, the path from the original script is used.
    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path

    # Collect the headings and build the request payload for the model.
    headings = extract_headings(file_path)
    title_info = [{"title": heading, "line_num": line_num, "level": "unknown"}
                  for line_num, heading in headings]

    # The model-driven heading rewrite is currently disabled. If re-enabled,
    # note that get_true_level returns the string "Error" on failure, so the
    # result must be checked before it is passed to update_headings.
    # result = get_true_level(title_info)
    # update_headings(file_path, result)

    # Extract and print the references section.
    references = extract_references(file_path, headings)
    if references:
        print("提取的参考文献:")
        print(references)
    else:
        print("未找到参考文献部分")