import re import json from openai import OpenAI OPENAI_BASE_URL = "http://8.218.238.241:17935/v1" OPENAI_API_KEY = "sk-urFGAQRThR6pysea0aC93bD27fA34bA69811A9254aAaD8B2" MODEL_NAME = "deepseek-chat" def get_true_level(title_info: list, max_retries: int = 5): source_title = json.dumps(title_info) instruction = """ 有如下的JSON格式的标题信息,已知他们的标题内容和行号,请你在level字段给出正确的层级关系,层级关系用数字(1,2,3,4)表示,数字越小,层级越高。 额外的层级关系说明:本层级关系要求存在多个1级标题而非仅一个1级标题。 返回结果的时候严格遵守下列示例JSON格式: { 'data': [ { 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1}, ... ] """ # 创建 OpenAI 客户端 client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL) for attempt in range(max_retries): try: completion = client.chat.completions.create( model=MODEL_NAME, stream=False, # 关闭流模式 messages=[ {"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": instruction.replace("", source_title)} ], response_format={ 'type': 'json_object' } ) response = completion.choices[0].message.content response = json.loads(response) return response['data'] except (json.JSONDecodeError, Exception) as e: print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}") if attempt == max_retries - 1: return "Error" def extract_headings(file_path): """提取markdown文件中所有以#开头的行及其行号""" headings = [] with open(file_path, 'r', encoding='utf-8') as file: for line_num, line in enumerate(file, 1): if re.match(r'^#', line.strip()): headings.append((line_num, line.strip())) return headings def extract_references(file_path, headings): """提取参考文献部分""" # 在标题中查找REFERENCE ref_heading = None for line_num, heading in headings: if "REFERENCE" in heading.upper(): ref_heading = (line_num, heading) break if not ref_heading: return None ref_start = ref_heading[0] - 1 # 转换为0-based索引 # 查找下一个标题或文件结尾 with open(file_path, 'r', encoding='utf-8') as file: lines = file.readlines() ref_end = len(lines) for i in range(ref_start + 1, len(lines)): if re.match(r'^#', lines[i].strip()): ref_end = i break # 提取参考文献内容 references = lines[ref_start:ref_end] return ''.join(references) def update_headings(file_path, heading_data): """根据提供的标题数据更新Markdown文件中的标题""" with open(file_path, 'r', encoding='utf-8') as file: lines = file.readlines() # 统计heading_data中level==1的数量 count_level_1 = sum(1 for item in heading_data if item['level'] == 1) flag = 3 if count_level_1 > 1 else 4 # 存在多个一级标题是为2否则为3 for heading in heading_data: line_num = heading['line_num'] - 1 if heading['level'] >= flag: lines[line_num] = "**" + lines[line_num].replace("#", "").strip() + "**\n" with open(file_path, 'w', encoding='utf-8') as file: file.writelines(lines) if __name__ == "__main__": file_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md" # 提取并更新标题 headings = extract_headings(file_path) title_info = [{"title": heading, "line_num": line_num, "level": "unknown"} for line_num, heading in headings] # result = get_true_level(title_info) # update_headings(file_path, result) # 提取参考文献 references = extract_references(file_path, headings) if references: print("提取的参考文献:") print(references) else: print("未找到参考文献部分")