"""Select the most-cited papers per publication year from a JSON export.

Sample output of a previous run.  A year that holds fewer candidates than its
quota contributes only what it has, so the total can fall short of the target
(here 2024 supplied 16 of its 80 slots):

    Publication year range: 2015 to 2024
    Paper allocation per year: {2015: 80, 2016: 80, 2017: 80, 2018: 80, 2019: 80, 2020: 80, 2021: 80, 2022: 80, 2023: 80, 2024: 80}
    Selected 736 papers in total
    Saved selected papers to /home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json
    Selected papers per year:
    2015: 80
    2016: 80
    2017: 80
    2018: 80
    2019: 80
    2020: 80
    2021: 80
    2022: 80
    2023: 80
    2024: 16
"""

import json
import os
from collections import defaultdict

import pandas as pd

try:
    from tqdm import tqdm
except ImportError:
    # The progress bar is purely cosmetic; fall back to plain iteration.
    def tqdm(iterable, **_kwargs):
        return iterable


# Citation fields in order of preference; the second is used only when the
# first is absent or unusable.
_CITATION_FIELDS = ("Times Cited, All Databases", "Times Cited, WoS Core")


def _citation_count(paper):
    """Return the paper's citation count as an int, or 0 when unavailable."""
    for field in _CITATION_FIELDS:
        raw = paper.get(field)
        if raw is None:
            continue
        try:
            # Go through str/float so that 12, 12.0, "12" and "1,234" all
            # parse; NaN and infinities raise and fall through.
            return int(float(str(raw).replace(",", "")))
        except (ValueError, OverflowError):
            continue
    return 0


def process_json_file(input_file, output_file, *, total_papers=800,
                      start_year=2015, end_year=2024):
    """Write the top-cited papers of each year in a range to a JSON file.

    The input must be a JSON array of paper records (dicts).  Records whose
    "Publication Year" is missing or cannot be converted to an int are
    skipped.  ``total_papers`` is split evenly across the years from
    ``start_year`` to ``end_year`` (inclusive); any remainder goes one each
    to the earliest years.  Within a year, papers are ranked by citation
    count in descending order, with ties kept in input order.

    Args:
        input_file: Path of the JSON file to read.
        output_file: Path of the JSON file to write the selection to.
        total_papers: Target number of papers across all years.
        start_year: First publication year to include.
        end_year: Last publication year to include.

    Returns:
        The list of selected paper records, ordered by year and then by
        descending citation count.

    Raises:
        ValueError: If ``start_year`` is after ``end_year`` or
            ``total_papers`` is negative.
    """
    if start_year > end_year:
        raise ValueError("start_year must not be greater than end_year")
    if total_papers < 0:
        raise ValueError("total_papers must be non-negative")

    print(f"Reading JSON file: {input_file}")

    # Read the JSON file.
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    print(f"Loaded {len(data)} records from JSON array")

    # Track the range of publication years seen across ALL records.
    min_year = float('inf')
    max_year = float('-inf')

    # Papers of each year inside the requested range, as (citations, paper).
    yearly_papers = defaultdict(list)

    for paper in tqdm(data, desc="Processing papers"):
        pub_year = paper.get("Publication Year")
        if pub_year is None:
            continue

        try:
            year = int(pub_year)
        except (ValueError, TypeError, OverflowError):
            # The year cannot be converted to an integer; skip the record.
            continue

        min_year = min(min_year, year)
        max_year = max(max_year, year)

        # Only papers published inside the requested range are candidates.
        if start_year <= year <= end_year:
            # Normalising to int keeps the later sort numeric and prevents a
            # TypeError when some records carry no citation data at all.
            yearly_papers[year].append((_citation_count(paper), paper))

    print(f"Publication year range: {min_year} to {max_year}")

    # Work out how many papers each year should contribute.
    years = list(range(start_year, end_year + 1))
    papers_per_year, remainder = divmod(total_papers, len(years))

    allocation = {year: papers_per_year for year in years}
    for year in years[:remainder]:
        allocation[year] += 1

    print(f"Paper allocation per year: {allocation}")

    # Pick the most-cited papers of each year.
    selected_papers = []
    selected_count = {}
    for year in years:
        ranked = sorted(yearly_papers.get(year, []),
                        key=lambda entry: entry[0], reverse=True)
        top_papers = ranked[:allocation[year]]
        selected_papers.extend(paper for _, paper in top_papers)
        selected_count[year] = len(top_papers)

    print(f"Selected {len(selected_papers)} papers in total")

    # Save the selection to the new JSON file.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(selected_papers, f, ensure_ascii=False, indent=2)

    print(f"Saved selected papers to {output_file}")

    # Report how many papers were actually selected per year.
    print("Selected papers per year:")
    for year in years:
        print(f"{year}: {selected_count[year]}")

    return selected_papers


if __name__ == "__main__":
    input_file = "/home/ubuntu/50T/fsy/paper-mat.json"  # replace with your input file name
    output_file = "/home/ubuntu/50T/fsy/top_cited_papers_2015_2024.json"
    process_json_file(input_file, output_file)