Merge branch 'main' of https://git.siat-mic.com/fsy/MatBench

2025-05-28 15:57:05 +08:00
parent 9abd8fc1c5 472c183c5b
commit 4959227855
7 changed files with 646233 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,2 @@
+MatBench/layer3/articles_fsy
+*.zip
--- a/layer3/IsMaterialSci.py
+++ b/layer3/IsMaterialSci.py
@@ -0,0 +1,99 @@
+# 筛除综述类论文
+import json
+import time
+import threading
+import queue
+from concurrent.futures import ThreadPoolExecutor
+from openai import OpenAI
+
+result_lock = threading.Lock()
+api_semaphore = threading.Semaphore(5)  
+material_items = []  
+error_items = [] 
+
+client = OpenAI(
+    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
+    base_url="https://vip.apiyi.com/v1"
+)
+
+def load_qa_data(file_path):
+    with open(file_path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+    return data
+
+def classify_qa_type(abstract,title):
+    prompt = f"""
+            This is a categorization task. Please analyze the title and abstract of the article entered to determine if it is a review paper or report. Strictly return the number 1 if the title and abstract meet the requirements for a review paper or report, or 0 if they do not meet the requirements for a review paper or report. do not provide any other explanation or output, just return the number 1 or 0.
+            
+            Article Title:
+            {title}
+            
+            Abstract:
+            {abstract}
+            """
+    
+    with api_semaphore:
+        try:
+            response = client.chat.completions.create(
+                model="deepseek-chat",
+                messages=[
+                    {"role": "system", "content": "You are a helpful assistant"},
+                    {"role": "user", "content": prompt}
+                ],
+                stream=False
+            )
+            result = response.choices[0].message.content.strip().lower()
+            print(result)
+            return result
+        except Exception as e:
+            print(f"API调用错误: {e}")
+            return "2" 
+
+def process_item(item, index, total):
+    print(f"处理第 {index+1}/{total} 条数据...")
+    abstract = item["Abstract"]
+    # choices = item["choices"]["text"]
+    # choices = item["distractor3"] +','+ item["distractor2"] + ',' + item["distractor1"] +','+item["correct_answer"]
+    title = item["Article Title"]
+    label = classify_qa_type(abstract,title)
+    
+    with result_lock:
+        if "0" in label:
+            material_items.append(item)
+        elif "2" in label:
+            item["error"] = "yes"
+            error_items.append(item)
+        
+def save_processed_data(data, output_file):
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(data, f, ensure_ascii=False, indent=2)
+
+def main():
+    input_file = "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json"
+    output_file = "/home/ubuntu/50T/fsy/top_cited_paper_mat.json"
+    error_file = "/home/ubuntu/50T/fsy/paper-error.json"
+    
+    data = load_qa_data(input_file)
+    total = len(data)
+    
+    with ThreadPoolExecutor(max_workers=10) as executor:
+        futures = []
+        for i, item in enumerate(data):
+            future = executor.submit(process_item, item, i, total)
+            futures.append(future)
+            
+            if (i+1) % 10 == 0:
+                time.sleep(1)
+    
+        for future in futures:
+            future.result()
+    
+    save_processed_data(material_items, output_file)
+    print(f"处理完成，材料科学相关条目已保存到 {output_file}")
+    
+    if error_items:
+        save_processed_data(error_items, error_file)
+        print(f"处理出错的条目已保存到 {error_file}")
+
+if __name__ == "__main__":
+    main()
--- a/layer3/paper-mat.json
+++ b/layer3/paper-mat.json
--- a/layer3/paper-mat.py
+++ b/layer3/paper-mat.py
@@ -0,0 +1,106 @@
+# Publication year range: 2015 to 2024
+# Paper allocation per year: {2015: 80, 2016: 80, 2017: 80, 2018: 80, 2019: 80, 2020: 80, 2021: 80, 2022: 80, 2023: 80, 2024: 80}
+# Selected 736 papers in total
+# Saved selected papers to /home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json
+# Selected papers per year:
+# 2015: 80
+# 2016: 80
+# 2017: 80
+# 2018: 80
+# 2019: 80
+# 2020: 80
+# 2021: 80
+# 2022: 80
+# 2023: 80
+# 2024: 16
+# 挑选高引论文
+import json
+import os
+from collections import defaultdict
+import pandas as pd
+from tqdm import tqdm
+
+def process_json_file(input_file, output_file):
+    print(f"Reading JSON file: {input_file}")
+    
+    # 读取JSON文件
+    with open(input_file, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+        print(f"Loaded {len(data)} records from JSON array")
+    
+    # 统计发表年份范围
+    min_year = float('inf')
+    max_year = float('-inf')
+    
+    # 5-2024年每年的论文，按引用量排序
+    yearly_papers = defaultdict(list)
+    
+    for paper in tqdm(data, desc="Processing papers"):
+        pub_year = paper.get("Publication Year")
+        if pub_year is None:
+            continue
+        
+        try:
+            year = int(pub_year)
+            min_year = min(min_year, year)
+            max_year = max(max_year, year)
+            
+            # 只关注2015-2024年间的论文
+            if 2015 <= year <= 2024:
+                # 获取引用量
+                citations = paper.get("Times Cited, All Databases")
+                if citations is None:
+                    citations = paper.get("Times Cited, WoS Core")
+            
+                yearly_papers[year].append((citations, paper))
+
+        except (ValueError, TypeError):
+            # 如果年份无法转换为整数，跳过
+            pass
+    print(f"Publication year range: {min_year} to {max_year}")
+    
+    # 计算每年应该选择的论文数量
+    total_papers = 800
+    years = list(range(2015, 2025))  # 2015 到 2024
+    papers_per_year = total_papers // len(years)
+    remainder = total_papers % len(years)
+    
+    # 分配每年的论文数量
+    allocation = {year: papers_per_year for year in years}
+    for year in years[:remainder]:
+        allocation[year] += 1
+    
+    print(f"Paper allocation per year: {allocation}")
+    
+    # 选择每年引用量最高的论文
+    selected_papers = []
+    for year in years:
+        # 按引用量排序
+        yearly_papers[year].sort(key=lambda x: x[0], reverse=True)
+        # 选择指定数量的论文
+        top_papers = yearly_papers[year][:allocation[year]]
+        selected_papers.extend([paper for _, paper in top_papers])
+    
+    print(f"Selected {len(selected_papers)} papers in total")
+    
+    # 保存到新的JSON文件
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(selected_papers, f, ensure_ascii=False, indent=2)
+    
+    print(f"Saved selected papers to {output_file}")
+    
+    # 输出每年选择的论文数量统计
+    selected_count = {year: 0 for year in years}
+    for paper in selected_papers:
+        year = int(paper["Publication Year"])
+        selected_count[year] += 1
+    
+    print("Selected papers per year:")
+    for year in years:
+        print(f"{year}: {selected_count[year]}")
+
+if __name__ == "__main__":
+    input_file = "/home/ubuntu/50T/fsy/paper-mat.json"  # 替换为你的输入文件名
+    output_file = "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json"
+    
+    process_json_file(input_file, output_file)
--- a/layer3/paper.xls
+++ b/layer3/paper.xls
--- a/layer3/top_cited_paper_mat_nooverview.json
+++ b/layer3/top_cited_paper_mat_nooverview.json
--- a/layer3/top_cited_papers_2015_2024.json
+++ b/layer3/top_cited_papers_2015_2024.json