Merge branch 'main' of https://git.siat-mic.com/fsy/MatBench
This commit is contained in:
2
.gitignore
vendored
Normal file
2
.gitignore
vendored
Normal file
@@ -0,0 +1,2 @@
|
||||
MatBench/layer3/articles_fsy
|
||||
*.zip
|
||||
99
layer3/IsMaterialSci.py
Normal file
99
layer3/IsMaterialSci.py
Normal file
@@ -0,0 +1,99 @@
|
||||
# Filter out review-type papers
|
||||
import json
|
||||
import time
|
||||
import threading
|
||||
import queue
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
from openai import OpenAI
|
||||
|
||||
import os

# Serializes appends to the shared result lists from worker threads.
result_lock = threading.Lock()
# Caps the number of simultaneous chat-completion requests at 5.
api_semaphore = threading.Semaphore(5)
# Records the classifier labelled "0" (not a review paper or report).
material_items = []
# Records whose classification failed; written to the error file by main().
error_items = []

# SECURITY: the API key must not live in source control. It is read from the
# OPENAI_API_KEY environment variable instead; when the variable is unset the
# OpenAI client raises its own descriptive error at construction time.
# NOTE(review): the key that was previously committed here should be treated
# as compromised and rotated.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://vip.apiyi.com/v1",
)
|
||||
|
||||
def load_qa_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode="r", encoding="utf-8") as handle:
        return json.load(handle)
|
||||
|
||||
def classify_qa_type(abstract, title):
    """Ask the chat model whether an article is a review paper or report.

    The title and abstract are interpolated into a fixed prompt and sent to
    the ``deepseek-chat`` model through the module-level ``client``. At most
    as many requests run concurrently as ``api_semaphore`` allows.

    Args:
        abstract: Abstract text inserted into the prompt.
        title: Article title inserted into the prompt.

    Returns:
        ``"1"`` if the model answered that the article is a review paper or
        report, ``"0"`` if it answered that it is not, and ``"2"`` if the
        request raised or the reply did not contain an unambiguous 0 or 1.
    """
    prompt = f"""
This is a categorization task. Please analyze the title and abstract of the article entered to determine if it is a review paper or report. Strictly return the number 1 if the title and abstract meet the requirements for a review paper or report, or 0 if they do not meet the requirements for a review paper or report. do not provide any other explanation or output, just return the number 1 or 0.

Article Title:
{title}

Abstract:
{abstract}
"""

    try:
        # Hold a semaphore slot only while the request is in flight.
        with api_semaphore:
            response = client.chat.completions.create(
                model="deepseek-chat",
                messages=[
                    {"role": "system", "content": "You are a helpful assistant"},
                    {"role": "user", "content": prompt},
                ],
                stream=False,
            )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort boundary: any failure of the remote call is reported
        # through the "2" sentinel so the caller can record the item.
        print(f"API调用错误: {e}")
        return "2"

    # The message content may be missing; treat that as an unparseable reply.
    result = (content or "").strip().lower()
    print(result)

    # Split the reply into runs of digits so that "10" or "0 or 1" are not
    # mistaken for a clean answer, while "Answer: 0" or "0." still count.
    digit_runs = "".join(
        ch if ch in "0123456789" else " " for ch in result
    ).split()
    answers = set(digit_runs)
    if answers == {"0"}:
        return "0"
    if answers == {"1"}:
        return "1"
    return "2"
|
||||
|
||||
def process_item(item, index, total):
    """Classify one record and file it into the shared result lists.

    Args:
        item: Record dict; its "Abstract" and "Article Title" values are sent
            to ``classify_qa_type``.
        index: Zero-based position of the record, used for the progress line.
        total: Total number of records, used for the progress line.

    Side effects:
        Appends ``item`` to ``material_items`` when the label is "0". A label
        of "1" (review paper or report) drops the record. Any other label, or
        a record with neither abstract nor title, gets ``item["error"] = "yes"``
        and is appended to ``error_items`` so it is never lost silently.
    """
    print(f"处理第 {index+1}/{total} 条数据...")

    # Missing fields must not raise: an exception here would surface through
    # the future in main() and abort the whole run.
    abstract = item.get("Abstract") or ""
    title = item.get("Article Title") or ""

    if abstract or title:
        label = str(classify_qa_type(abstract, title)).strip()
    else:
        # Nothing to classify; route the record to the error list.
        label = "2"

    with result_lock:
        # Compare exactly instead of by substring, so replies such as "10"
        # or "0 or 1" are not accepted as a clean "0".
        if label == "0":
            material_items.append(item)
        elif label != "1":
            item["error"] = "yes"
            error_items.append(item)
|
||||
|
||||
def save_processed_data(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON, indented by two spaces.

    Non-ASCII characters are written as-is rather than escaped.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
|
||||
|
||||
def main(
    input_file="/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json",
    output_file="/home/ubuntu/50T/fsy/top_cited_paper_mat.json",
    error_file="/home/ubuntu/50T/fsy/paper-error.json",
):
    """Classify every record of ``input_file`` and save the kept/failed ones.

    Records are processed by ``process_item`` on a pool of 10 worker threads.
    Submission pauses for one second after every 10 records to throttle the
    request rate.

    Args:
        input_file: JSON file containing the records to classify.
        output_file: Destination for the records collected in
            ``material_items``.
        error_file: Destination for the records collected in ``error_items``;
            only written when there is at least one such record.
    """
    data = load_qa_data(input_file)
    total = len(data)

    with ThreadPoolExecutor(max_workers=10) as executor:
        submitted = []
        for i, item in enumerate(data):
            submitted.append((i, item, executor.submit(process_item, item, i, total)))

            if (i + 1) % 10 == 0:
                time.sleep(1)

        for i, item, future in submitted:
            try:
                future.result()
            except Exception as e:
                # A single failing record must not abort the run before the
                # results gathered so far are written out.
                print(f"第 {i+1} 条数据处理失败: {e}")
                with result_lock:
                    if isinstance(item, dict):
                        item["error"] = "yes"
                    error_items.append(item)

    save_processed_data(material_items, output_file)
    print(f"处理完成,材料科学相关条目已保存到 {output_file}")

    if error_items:
        save_processed_data(error_items, error_file)
        print(f"处理出错的条目已保存到 {error_file}")


if __name__ == "__main__":
    main()
|
||||
547824
layer3/paper-mat.json
Normal file
547824
layer3/paper-mat.json
Normal file
File diff suppressed because it is too large
Load Diff
106
layer3/paper-mat.py
Normal file
106
layer3/paper-mat.py
Normal file
@@ -0,0 +1,106 @@
|
||||
# Publication year range: 2015 to 2024
|
||||
# Paper allocation per year: {2015: 80, 2016: 80, 2017: 80, 2018: 80, 2019: 80, 2020: 80, 2021: 80, 2022: 80, 2023: 80, 2024: 80}
|
||||
# Selected 736 papers in total
|
||||
# Saved selected papers to /home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json
|
||||
# Selected papers per year:
|
||||
# 2015: 80
|
||||
# 2016: 80
|
||||
# 2017: 80
|
||||
# 2018: 80
|
||||
# 2019: 80
|
||||
# 2020: 80
|
||||
# 2021: 80
|
||||
# 2022: 80
|
||||
# 2023: 80
|
||||
# 2024: 16
|
||||
# Select the most highly cited papers
|
||||
import json
|
||||
import os
|
||||
from collections import defaultdict
|
||||
import pandas as pd
|
||||
from tqdm import tqdm
|
||||
|
||||
def _citation_count(paper):
    """Return the paper's citation count as an int, or 0 when unavailable.

    "Times Cited, All Databases" is preferred; "Times Cited, WoS Core" is the
    fallback when the first field is missing or not numeric.
    """
    for key in ("Times Cited, All Databases", "Times Cited, WoS Core"):
        value = paper.get(key)
        if value is None:
            continue
        try:
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            continue
    return 0


def process_json_file(input_file, output_file, total_papers=800,
                      start_year=2015, end_year=2024):
    """Pick the most-cited papers per publication year and save them as JSON.

    Args:
        input_file: JSON file holding an array of paper records. Each record
            is read for "Publication Year" and the citation-count fields.
        output_file: Destination JSON file for the selected records.
        total_papers: Overall quota, split evenly across the years; any
            remainder goes one-by-one to the earliest years.
        start_year: First publication year considered (inclusive).
        end_year: Last publication year considered (inclusive).

    Raises:
        ValueError: If ``end_year`` is smaller than ``start_year``.

    Note:
        A year with fewer papers than its quota contributes what it has; the
        unused quota is not redistributed, so fewer than ``total_papers``
        records may be written.
    """
    years = list(range(start_year, end_year + 1))
    if not years:
        raise ValueError("end_year must not be smaller than start_year")

    print(f"Reading JSON file: {input_file}")

    # Read the JSON file.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} records from JSON array")

    # Track the range of publication years seen across all records.
    min_year = float('inf')
    max_year = float('-inf')

    # Papers of each year in the requested range, paired with their
    # citation count for sorting.
    yearly_papers = defaultdict(list)

    for paper in tqdm(data, desc="Processing papers"):
        pub_year = paper.get("Publication Year")
        if pub_year is None:
            continue

        try:
            year = int(pub_year)
        except (ValueError, TypeError):
            # Skip records whose year cannot be converted to an integer.
            continue

        min_year = min(min_year, year)
        max_year = max(max_year, year)

        # Only papers published within the requested range are candidates.
        if start_year <= year <= end_year:
            # Coerce the count to int so sorting never compares None or
            # strings with numbers.
            yearly_papers[year].append((_citation_count(paper), paper))

    if min_year <= max_year:
        print(f"Publication year range: {min_year} to {max_year}")
    else:
        print("Publication year range: n/a (no record has a usable year)")

    # Work out how many papers to take from each year.
    papers_per_year, remainder = divmod(total_papers, len(years))
    allocation = {year: papers_per_year for year in years}
    for year in years[:remainder]:
        allocation[year] += 1

    print(f"Paper allocation per year: {allocation}")

    # Take the most-cited papers of each year.
    selected_papers = []
    selected_count = {}
    for year in years:
        # Stable sort: papers with equal counts keep their input order.
        yearly_papers[year].sort(key=lambda entry: entry[0], reverse=True)
        top_papers = yearly_papers[year][:allocation[year]]
        selected_papers.extend(paper for _, paper in top_papers)
        selected_count[year] = len(top_papers)

    print(f"Selected {len(selected_papers)} papers in total")

    # Save the selection to the output JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_papers, f, ensure_ascii=False, indent=2)

    print(f"Saved selected papers to {output_file}")

    # Report how many papers were selected for each year.
    print("Selected papers per year:")
    for year in years:
        print(f"{year}: {selected_count[year]}")
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the full paper list and write the per-year
    # top-cited selection.
    input_file, output_file = (
        "/home/ubuntu/50T/fsy/paper-mat.json",  # replace with your input file name
        "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json",
    )
    process_json_file(input_file, output_file)
|
||||
BIN
layer3/paper.xls
Normal file
BIN
layer3/paper.xls
Normal file
Binary file not shown.
43736
layer3/top_cited_paper_mat_nooverview.json
Normal file
43736
layer3/top_cited_paper_mat_nooverview.json
Normal file
File diff suppressed because it is too large
Load Diff
54466
layer3/top_cited_papers_2015_2024.json
Normal file
54466
layer3/top_cited_papers_2015_2024.json
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user