first update

2025-01-17 23:23:07 +08:00
parent 6bb4fb5a00
commit 46cfd0296a
4 changed files with 375 additions and 0 deletions
--- a/reparagraph.py
+++ b/reparagraph.py
@@ -0,0 +1,118 @@
+import json
+import os
+import re
+
+from openai import OpenAI
+
+
+# Connection settings for the OpenAI-compatible chat endpoint used by
+# get_true_level. Each value can be overridden through an environment
+# variable with the same name as the constant.
+OPENAI_BASE_URL = os.environ.get("OPENAI_BASE_URL", "http://8.218.238.241:17935/v1")
+# The API key is a secret and must not be committed: it is read from the
+# environment only and is None when the variable is unset.
+# NOTE(review): the key that used to be hard-coded here remains in the
+# repository history and should be rotated.
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+MODEL_NAME = os.environ.get("MODEL_NAME", "deepseek-chat")
+
+def get_true_level(title_info: list, max_retries: int = 5):
+    """Ask the chat model to assign a hierarchy level to every heading.
+
+    Args:
+        title_info: JSON-serialisable list of heading records. The caller in
+            this file passes dicts with ``title``, ``line_num`` and ``level``
+            keys.
+        max_retries: Maximum number of request attempts.
+
+    Returns:
+        The ``data`` list from the model's JSON reply, or the string
+        ``"Error"`` when no attempt succeeds (string sentinel kept for
+        backward compatibility with existing callers).
+    """
+    # ensure_ascii=False keeps non-ASCII titles readable in the prompt instead
+    # of turning them into \uXXXX escape sequences.
+    source_title = json.dumps(title_info, ensure_ascii=False)
+    # NOTE(review): the example inside the prompt is not strictly valid JSON
+    # (single quotes, no closing brace). The text is left untouched because
+    # changing it changes what the model is asked; confirm before editing.
+    instruction = """
+    有如下的JSON格式的标题信息,已知他们的标题内容和行号，请你在level字段给出正确的层级关系，层级关系用数字(1,2,3,4)表示，数字越小，层级越高。
+    额外的层级关系说明：本层级关系要求存在多个1级标题而非仅一个1级标题。
+    <PLACEHOLDER>
+    返回结果的时候严格遵守下列示例JSON格式:
+    { 'data': [
+        { 'title': '# A hierarchically porous MOF confined CsPbBr3 quantum dots: Fluorescence switching probe for detecting Cu (II) and melamine in food samples', 'line_num': 1, 'level': 1},
+        ...
+    ]
+    """
+    # The prompt does not change between attempts, so build it once.
+    prompt = instruction.replace("<PLACEHOLDER>", source_title)
+    # Create the OpenAI client
+    client = OpenAI(api_key=OPENAI_API_KEY, base_url=OPENAI_BASE_URL)
+    for attempt in range(max_retries):
+        try:
+            completion = client.chat.completions.create(
+                model=MODEL_NAME,
+                stream=False,  # streaming off: the whole reply is parsed at once
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant."},
+                    {"role": "user", "content": prompt},
+                ],
+                response_format={
+                    'type': 'json_object'
+                },
+            )
+
+            response = json.loads(completion.choices[0].message.content)
+            data = response['data']
+            # Callers index into the result as a list of dicts; treat any
+            # other shape as a failed attempt so that it is retried.
+            if not isinstance(data, list):
+                raise ValueError(f"'data' is {type(data).__name__}, expected list")
+            return data
+
+        except Exception as e:
+            # Deliberate best-effort retry: request errors, invalid JSON and a
+            # missing 'data' key are all reported and the call is repeated.
+            print(f"尝试 {attempt + 1}/{max_retries} 失败: {str(e)}")
+
+    # Reached when every attempt failed, or when max_retries <= 0.
+    return "Error"
+
+
+def extract_headings(file_path):
+    """Collect every line of a Markdown file whose stripped text begins with ``#``.
+
+    Returns a list of ``(line_num, text)`` tuples, where ``line_num`` is
+    1-based and ``text`` is the line with surrounding whitespace removed.
+    """
+    with open(file_path, 'r', encoding='utf-8') as md_file:
+        numbered = ((idx, raw.strip()) for idx, raw in enumerate(md_file, 1))
+        return [(idx, text) for idx, text in numbered if text.startswith('#')]
+
+def extract_references(file_path, headings):
+    """Return the text of the references section, or ``None`` if there is none.
+
+    ``headings`` is a sequence of ``(line_num, text)`` pairs with 1-based line
+    numbers. The section starts at the first heading whose upper-cased text
+    contains ``REFERENCE`` and runs up to, but not including, the next line
+    that starts with ``#`` (or to the end of the file).
+    """
+    # Locate the first heading that mentions REFERENCE
+    ref_line = next(
+        (num for num, text in headings if "REFERENCE" in text.upper()),
+        None,
+    )
+    if ref_line is None:
+        return None
+
+    start = ref_line - 1  # convert to a 0-based index
+
+    with open(file_path, 'r', encoding='utf-8') as md_file:
+        content = md_file.readlines()
+
+    # The section ends at the next heading, or at the end of the file
+    total = len(content)
+    stop = next(
+        (pos for pos in range(start + 1, total) if content[pos].strip().startswith('#')),
+        total,
+    )
+
+    return ''.join(content[start:stop])
+
+def update_headings(file_path, heading_data):
+    """Demote deep headings of a Markdown file to bold text, in place.
+
+    Args:
+        file_path: Path of the Markdown file; it is read and rewritten as
+            UTF-8.
+        heading_data: Collection of dicts, each carrying a 1-based
+            ``line_num`` and a numeric ``level``.
+
+    Entries whose level reaches the threshold are rewritten from
+    ``### Title`` to ``**Title**``. Entries that point outside the file or at
+    a line that is not a heading are ignored.
+    """
+    with open(file_path, 'r', encoding='utf-8') as file:
+        lines = file.readlines()
+
+    # Count the entries whose level is 1
+    count_level_1 = sum(1 for item in heading_data if item['level'] == 1)
+    # Threshold: 3 when there are several level-1 headings, otherwise 4.
+    flag = 3 if count_level_1 > 1 else 4
+
+    for heading in heading_data:
+        if heading['level'] < flag:
+            continue
+        index = heading['line_num'] - 1
+        # Skip line numbers outside the file. Without this check a value of
+        # 0 or below would wrap around and rewrite a line near the end.
+        if not 0 <= index < len(lines):
+            continue
+        text = lines[index].strip()
+        # Only touch real heading lines, matching extract_headings' rule.
+        if not text.startswith('#'):
+            continue
+        # Remove the leading heading markers only; a '#' inside the title
+        # text (e.g. "Sample #1") is part of the content and is kept.
+        lines[index] = "**" + text.lstrip('#').strip() + "**\n"
+
+    with open(file_path, 'w', encoding='utf-8') as file:
+        file.writelines(lines)
+
+if __name__ == "__main__":
+    import sys
+
+    # Markdown document to process. The first command-line argument, when
+    # given, replaces the built-in default path.
+    default_path = "/root/data50T/LYT/matagent/A hierarchically porous MOF confined CsPbBr3 quantum dots- Fluorescence switching probe for detecting Cu (II) and melamine in food samples.md"
+    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
+
+    # Collect the headings and build the records that get_true_level expects
+    headings = extract_headings(file_path)
+    title_info = [{"title": heading, "line_num": line_num, "level": "unknown"}
+                 for line_num, heading in headings]
+    # Heading re-levelling is currently disabled; title_info is kept ready
+    # for it.
+    # result = get_true_level(title_info)
+    # update_headings(file_path, result)
+
+    # Extract and print the references section
+    references = extract_references(file_path, headings)
+    if references:
+        print("提取的参考文献：")
+        print(references)
+    else:
+        print("未找到参考文献部分")