从原始JSON中清理出下载成功的文献,并以WOS JSON为基础清洗了small json作为后续的基础

This commit is contained in:
lzy
2025-06-12 20:42:16 +08:00
parent 65248c1e04
commit 38dce859dd
6 changed files with 53974 additions and 0 deletions

44538
layer3/data/raw_data.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,58 @@
import os
import json
import re
def get_sub_folder_name_set(path='/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'):
    """Return the names of the sub-folders located directly under *path*.

    Only directories are reported; plain files that happen to sit next to
    the article folders are ignored, so every returned name can safely be
    treated as a folder by the caller.

    Args:
        path: Directory to scan. Defaults to the project's ``mds`` folder.

    Returns:
        A set with one folder name (not a full path) per sub-directory.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # scandir exposes the entry type without a separate stat call per name.
    with os.scandir(path) as entries:
        return {entry.name for entry in entries if entry.is_dir()}
def read_markdown_file(file_path):
    """Read the file at *file_path* as UTF-8 text and return all of it."""
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()
def get_downloaded_json(
    sub_folder_name_set,
    abs_path='/home/ubuntu/50T/LYT/MatBench/layer3/articles',
    json_path='/home/ubuntu/50T/LYT/MatBench/layer3/data/top_cited_paper_mat_nooverview.json',
):
    """Build the list of records whose article folder is present.

    The JSON file at *json_path* is loaded and iterated; each record's DOI
    (with ``/`` replaced by ``_``) is looked up in *sub_folder_name_set*.
    For every match a new dict is produced holding a fixed selection of the
    record's fields plus the text of ``<abs_path>/mds/<doi>/<doi>.md``.

    Args:
        sub_folder_name_set: Folder names to match the normalised DOIs
            against.
        abs_path: Directory that contains the ``mds`` folder.
        json_path: Source JSON file; presumably a list of Web of Science
            export records (confirm against the data file).

    Returns:
        A list of dicts, in source order, one per matched record.

    Raises:
        KeyError: If a matched record lacks one of the copied fields.
        FileNotFoundError: If *json_path* or a matched record's Markdown
            file does not exist.
    """
    # Fields copied verbatim from the source record, in output order.
    copied_fields = (
        'Article Title',
        'Authors',
        'Source Title',
        'Abstract',
        'Times Cited, WoS Core',
        'Times Cited, All Databases',
        'Publication Year',
        'Research Areas',
        'UT (Unique WOS ID)',
    )

    # Explicit encoding: the default is locale-dependent, and the other
    # file helpers in this script already read and write UTF-8.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    downloaded_json = []
    for item in data:
        raw_doi = item.get('DOI')
        # A record without a usable DOI string (missing, null, NaN, empty)
        # cannot correspond to any folder, so skip it instead of crashing.
        if not isinstance(raw_doi, str) or not raw_doi:
            continue

        # Folder names use '_' where the DOI has '/'.
        doi = raw_doi.replace('/', '_')
        if doi not in sub_folder_name_set:
            continue

        new_item = {
            'id': doi,
            'DOI': raw_doi,
            'DOI Link': item['DOI Link'],
            'Relative Dir Path': f'mds/{doi}',
        }
        new_item.update((field, item[field]) for field in copied_fields)
        new_item['Markdown'] = read_markdown_file(
            os.path.join(abs_path, 'mds', doi, f'{doi}.md')
        )
        downloaded_json.append(new_item)
    return downloaded_json
def write_json_to_file(data, file_path):
    """Serialise *data* as JSON into *file_path*, replacing any existing file.

    The output is UTF-8 text, indented by four spaces, with non-ASCII
    characters written as-is rather than as ``\\u`` escapes.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False, indent=4)
# Script body: runs on execution (and on import, since there is no main guard).
# Step 1: collect the names found under the mds articles directory and report how many.
sub_folder_name_set = get_sub_folder_name_set()
print(len(sub_folder_name_set))
# Step 2: keep the source records whose normalised DOI matches one of those names,
# attach each article's Markdown text, and report how many matched.
downloaded_json = get_downloaded_json(sub_folder_name_set)
print(len(downloaded_json))
# Step 3: write the matched records out as the "small" raw dataset.
write_json_to_file(downloaded_json, '/home/ubuntu/50T/LYT/MatBench/layer3/data/raw_small_data.json')