从原始JSON中清理出下载成功的文献,并以WOS JSON为基础清洗了small json作为后续的基础
This commit is contained in:
44538
layer3/data/raw_data.json
Normal file
44538
layer3/data/raw_data.json
Normal file
File diff suppressed because one or more lines are too long
9378
layer3/data/raw_small_data.json
Normal file
9378
layer3/data/raw_small_data.json
Normal file
File diff suppressed because one or more lines are too long
58
layer3/src/getdownloadedjson.py
Normal file
58
layer3/src/getdownloadedjson.py
Normal file
@@ -0,0 +1,58 @@
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
# 读/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds下所有子文件夹的文件夹名
|
||||
def get_sub_folder_name_set(path='/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'):
    """Return the names of the directories located directly under *path*.

    Args:
        path: Directory to scan. Defaults to the markdown articles directory
            used by this script.

    Returns:
        A set with the name of every immediate sub-directory of *path*.
        Plain files are ignored, so a stray file can never be mistaken for
        an article folder.

    Raises:
        FileNotFoundError: If *path* does not exist.
        NotADirectoryError: If *path* is not a directory.
    """
    # scandir exposes the entry type directly; the context manager closes
    # the underlying directory handle as soon as the set is built.
    with os.scandir(path) as entries:
        return {entry.name for entry in entries if entry.is_dir()}
|
||||
|
||||
def read_markdown_file(file_path):
    """Read the file at *file_path* as UTF-8 text and return its whole content.

    Args:
        file_path: Path of the file to read.

    Returns:
        The complete file content as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as md_handle:
        return md_handle.read()
|
||||
|
||||
def get_downloaded_json(sub_folder_name_set, abs_path='/home/ubuntu/50T/LYT/MatBench/layer3/articles',
                        json_path='/home/ubuntu/50T/LYT/MatBench/layer3/data/top_cited_paper_mat_nooverview.json'):
    """Build the list of records whose article folder has been downloaded.

    Every record of the source JSON is matched against *sub_folder_name_set*
    through its DOI with ``/`` replaced by ``_``. Matching records are
    reduced to a fixed set of fields and extended with the markdown text
    read from ``<abs_path>/mds/<id>/<id>.md``.

    Args:
        sub_folder_name_set: Collection of folder names (DOIs with ``/``
            replaced by ``_``) that are considered downloaded.
        abs_path: Directory that contains the ``mds`` folder.
        json_path: Path of the source JSON file; expected to hold a list of
            dict records carrying a ``'DOI'`` key and the copied fields.

    Returns:
        A list of dicts, one per matched record, in source order.

    Raises:
        KeyError: If a matched record lacks one of the copied fields.
        OSError: If the source JSON or a markdown file cannot be read.
    """
    # Fields copied verbatim from the source record, in output order.
    copied_fields = (
        'Article Title',
        'Authors',
        'Source Title',
        'Abstract',
        'Times Cited, WoS Core',
        'Times Cited, All Databases',
        'Publication Year',
        'Research Areas',
        'UT (Unique WOS ID)',
    )

    # Explicit UTF-8 so decoding does not depend on the machine's locale,
    # consistent with the other file accesses in this script.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    downloaded_json = []
    for item in data:
        raw_doi = item.get('DOI')
        # A record without a usable DOI cannot correspond to any folder, so
        # skip it instead of aborting the whole run.
        if not isinstance(raw_doi, str):
            continue

        doi = raw_doi.replace('/', '_')
        if doi not in sub_folder_name_set:
            continue

        new_item = {
            'id': doi,
            'DOI': raw_doi,
            'DOI Link': item['DOI Link'],
            'Relative Dir Path': f'mds/{doi}',
        }
        for field in copied_fields:
            new_item[field] = item[field]
        new_item['Markdown'] = read_markdown_file(os.path.join(abs_path, 'mds', doi, f'{doi}.md'))
        downloaded_json.append(new_item)

    return downloaded_json
|
||||
|
||||
def write_json_to_file(data, file_path):
    """Serialize *data* as JSON into the file at *file_path*.

    The file is written as UTF-8 with a 4-space indent, and non-ASCII
    characters are kept as-is rather than escaped.

    Args:
        data: JSON-serializable object to store.
        file_path: Destination path; an existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_handle:
        json.dump(data, out_handle, ensure_ascii=False, indent=4)
|
||||
|
||||
# Collect the names of the article folders present on disk and report how
# many there are.
sub_folder_name_set = get_sub_folder_name_set()
print(len(sub_folder_name_set))

# Keep only the source records that have a matching folder and report how
# many were matched.
downloaded_json = get_downloaded_json(sub_folder_name_set)
print(len(downloaded_json))

# Persist the matched records.
write_json_to_file(
    downloaded_json,
    '/home/ubuntu/50T/LYT/MatBench/layer3/data/raw_small_data.json',
)
|
||||
Reference in New Issue
Block a user