# Example console output from a previous run of this script:
#
# Publication year range: 2015 to 2024
# Paper allocation per year: {2015: 80, 2016: 80, 2017: 80, 2018: 80, 2019: 80, 2020: 80, 2021: 80, 2022: 80, 2023: 80, 2024: 80}
# Selected 736 papers in total
# Saved selected papers to /home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json
# Selected papers per year:
# 2015: 80
# 2016: 80
# 2017: 80
# 2018: 80
# 2019: 80
# 2020: 80
# 2021: 80
# 2022: 80
# 2023: 80
# 2024: 16
#
# Select the most highly cited papers.
import json
|
||
import os
|
||
from collections import defaultdict
|
||
import pandas as pd
|
||
from tqdm import tqdm
|
||
|
||
def _citation_count(paper):
    """Return the citation count of *paper* as an int, or 0 if unavailable.

    "Times Cited, All Databases" is preferred; "Times Cited, WoS Core" is the
    fallback. Values that are missing or cannot be read as a number count as
    0 so that every record gets a comparable sort key.
    """
    for key in ("Times Cited, All Databases", "Times Cited, WoS Core"):
        raw = paper.get(key)
        if raw is None:
            continue
        try:
            # Go through float() so numeric strings such as "12" or "12.0"
            # and JSON floats are all accepted.
            return int(float(raw))
        except (TypeError, ValueError, OverflowError):
            continue
    return 0


def _allocate_quota(years, total_papers):
    """Split *total_papers* evenly over *years*; the first years get the remainder."""
    base, remainder = divmod(total_papers, len(years))
    return {
        year: base + (1 if index < remainder else 0)
        for index, year in enumerate(years)
    }


def process_json_file(input_file, output_file, *, total_papers=800,
                      start_year=2015, end_year=2024):
    """Select the most-cited papers per publication year and save them as JSON.

    The input file must contain a JSON array of paper records (dicts). Records
    are grouped by their "Publication Year"; within each year in
    ``start_year..end_year`` (inclusive) the records with the highest citation
    counts are kept, up to that year's quota. The quota is ``total_papers``
    divided evenly over the years, with the earliest years receiving one extra
    paper each when the division leaves a remainder.

    A year that has fewer records than its quota contributes only what it has;
    the shortfall is NOT redistributed to other years, so the output may
    contain fewer than ``total_papers`` records.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write the selection to.
        total_papers: Target number of papers across all years.
        start_year: First publication year to include.
        end_year: Last publication year to include.

    Raises:
        ValueError: If ``end_year < start_year`` or ``total_papers`` is negative.
    """
    if end_year < start_year:
        raise ValueError("end_year must not be earlier than start_year")
    if total_papers < 0:
        raise ValueError("total_papers must not be negative")

    print(f"Reading JSON file: {input_file}")

    # Load the whole JSON array into memory.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)
    print(f"Loaded {len(data)} records from JSON array")

    # Track the overall publication-year range across ALL parsable records,
    # not only those inside the selection window.
    min_year = None
    max_year = None

    # year -> list of (citation_count, paper) for years inside the window.
    yearly_papers = defaultdict(list)

    for paper in tqdm(data, desc="Processing papers"):
        if not isinstance(paper, dict):
            # Not a paper record; nothing to read from it.
            continue

        pub_year = paper.get("Publication Year")
        if pub_year is None:
            continue

        # Only the year conversion is guarded, so unrelated errors elsewhere
        # in the loop are not silently swallowed.
        try:
            year = int(pub_year)
        except (ValueError, TypeError, OverflowError):
            # Year cannot be converted to an integer: skip the record.
            continue

        min_year = year if min_year is None else min(min_year, year)
        max_year = year if max_year is None else max(max_year, year)

        if start_year <= year <= end_year:
            yearly_papers[year].append((_citation_count(paper), paper))

    if min_year is None:
        print("Publication year range: no valid publication years found")
    else:
        print(f"Publication year range: {min_year} to {max_year}")

    # Work out how many papers each year may contribute.
    years = list(range(start_year, end_year + 1))
    allocation = _allocate_quota(years, total_papers)
    print(f"Paper allocation per year: {allocation}")

    # Pick the most-cited papers of each year. The sort is stable, so papers
    # with equal citation counts keep their input order.
    selected_papers = []
    selected_count = {}
    for year in years:
        ranked = sorted(
            yearly_papers.get(year, ()),
            key=lambda entry: entry[0],
            reverse=True,
        )
        top_papers = [paper for _, paper in ranked[:allocation[year]]]
        selected_papers.extend(top_papers)
        selected_count[year] = len(top_papers)

    print(f"Selected {len(selected_papers)} papers in total")

    # Write the selection to the output JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_papers, f, ensure_ascii=False, indent=2)

    print(f"Saved selected papers to {output_file}")

    # Report how many papers were actually taken from each year.
    print("Selected papers per year:")
    for year in years:
        print(f"{year}: {selected_count[year]}")
|
||
|
||
if __name__ == "__main__":
    # Script entry point: read the full paper dump and write the per-year
    # top-cited selection next to it.
    input_file = "/home/ubuntu/50T/fsy/paper-mat.json"  # replace with your input file name
    output_file = "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json"

    process_json_file(input_file, output_file)