Files
MatBench/layer3/paper-mat.py
2025-05-28 15:33:18 +08:00

106 lines
3.4 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# Sample output from a previous run of this script:
# Publication year range: 2015 to 2024
# Paper allocation per year: {2015: 80, 2016: 80, 2017: 80, 2018: 80, 2019: 80, 2020: 80, 2021: 80, 2022: 80, 2023: 80, 2024: 80}
# Selected 736 papers in total
# Saved selected papers to /home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json
# Selected papers per year:
# 2015: 80
# 2016: 80
# 2017: 80
# 2018: 80
# 2019: 80
# 2020: 80
# 2021: 80
# 2022: 80
# 2023: 80
# 2024: 16
# Select the most highly cited papers
import json
import os
from collections import defaultdict
import pandas as pd
from tqdm import tqdm
def _citation_count(paper):
    """Return the citation count of *paper* as an int, or 0 when unavailable.

    "Times Cited, All Databases" is preferred; "Times Cited, WoS Core" is the
    fallback. Values may be ints, floats, or numeric strings (optionally with
    thousands separators). Missing or unparseable values count as 0 so that
    every paper gets a comparable sort key.
    """
    for key in ("Times Cited, All Databases", "Times Cited, WoS Core"):
        raw = paper.get(key)
        if raw is None:
            continue
        try:
            return int(float(str(raw).replace(",", "").strip()))
        except (ValueError, OverflowError):
            # Not a finite number (e.g. "", "n/a", "nan"); try the next field.
            continue
    return 0


def process_json_file(input_file, output_file, *, total_papers=800,
                      start_year=2015, end_year=2024):
    """Select the most-cited papers per publication year and save them as JSON.

    Reads a JSON array of paper records from *input_file*, groups the records
    whose "Publication Year" lies in [start_year, end_year], ranks each year's
    papers by citation count (highest first), keeps the top papers of every
    year according to an even split of *total_papers*, and writes the
    selection to *output_file*.

    Args:
        input_file: Path of the JSON file holding an array of paper dicts.
        output_file: Path the selected papers are written to (UTF-8 JSON).
        total_papers: Overall selection budget, split evenly across the
            years; any remainder goes to the earliest years.
        start_year: First publication year to consider (inclusive).
        end_year: Last publication year to consider (inclusive).

    Raises:
        ValueError: If *end_year* is earlier than *start_year*.

    Note:
        A year with fewer papers than its quota contributes all it has; the
        unused quota is not redistributed, so fewer than *total_papers*
        papers may be selected.
    """
    if end_year < start_year:
        raise ValueError("end_year must not be earlier than start_year")

    # The progress bar is purely cosmetic: fall back to plain iteration when
    # tqdm is not installed.
    try:
        from tqdm import tqdm as progress
    except ImportError:
        def progress(iterable, **_kwargs):
            return iterable

    print(f"Reading JSON file: {input_file}")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} records from JSON array")

    # Track the overall publication-year range seen in the input.
    min_year = float('inf')
    max_year = float('-inf')
    # year -> list of (citation count, paper) for papers inside the window.
    yearly_papers = defaultdict(list)
    for paper in progress(data, desc="Processing papers"):
        pub_year = paper.get("Publication Year")
        if pub_year is None:
            continue
        # Keep the try body minimal: only the conversion may legitimately
        # fail, and such records are skipped.
        try:
            year = int(pub_year)
        except (ValueError, TypeError):
            continue
        min_year = min(min_year, year)
        max_year = max(max_year, year)
        if start_year <= year <= end_year:
            yearly_papers[year].append((_citation_count(paper), paper))

    if min_year <= max_year:
        print(f"Publication year range: {min_year} to {max_year}")
    else:
        print("Publication year range: no valid publication years found")

    # Split the budget evenly; the first `remainder` years get one extra.
    years = list(range(start_year, end_year + 1))
    papers_per_year, remainder = divmod(total_papers, len(years))
    allocation = {year: papers_per_year for year in years}
    for year in years[:remainder]:
        allocation[year] += 1
    print(f"Paper allocation per year: {allocation}")

    # Pick the most-cited papers of each year. The sort is stable, so papers
    # with equal counts keep their input order.
    selected_papers = []
    selected_count = {}
    for year in years:
        ranked = sorted(yearly_papers[year], key=lambda item: item[0],
                        reverse=True)
        top_papers = ranked[:allocation[year]]
        selected_papers.extend(paper for _, paper in top_papers)
        selected_count[year] = len(top_papers)
    print(f"Selected {len(selected_papers)} papers in total")

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_papers, f, ensure_ascii=False, indent=2)
    print(f"Saved selected papers to {output_file}")

    # Report how many papers were kept for each year.
    print("Selected papers per year:")
    for year in years:
        print(f"{year}: {selected_count[year]}")
if __name__ == "__main__":
    # Script entry point: the first path is the input JSON (replace it with
    # your own input file), the second is where the selection is written.
    process_json_file(
        "/home/ubuntu/50T/fsy/paper-mat.json",
        "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json",
    )