5
.gitignore
vendored
5
.gitignore
vendored
@@ -1,4 +1,7 @@
|
||||
MatBench/layer3/articles_fsy
|
||||
*.zip
|
||||
*.temp
|
||||
*.pyc
|
||||
*.pyc
|
||||
*.pdf
|
||||
*.md
|
||||
layer3/articles/*
|
||||
@@ -518,7 +518,7 @@ def main():
|
||||
# 难度选择比例配置
|
||||
SELECTION_RATIOS = {
|
||||
"hard_early_stop": 1.0, # 困难题选择10%
|
||||
"easy_all_correct": 0.0, # 简单题选择3.5%
|
||||
"easy_all_correct": 0.0, # 简单题选择3.5%
|
||||
"mixed": 0.0, # 混合题选择0%
|
||||
"unknown": 0.0 # 未知难度不选择
|
||||
}
|
||||
|
||||
12286
layer3/data/downloaded_failed_papers.json
Normal file
12286
layer3/data/downloaded_failed_papers.json
Normal file
File diff suppressed because it is too large
Load Diff
44538
layer3/data/raw_data.json
Normal file
44538
layer3/data/raw_data.json
Normal file
File diff suppressed because one or more lines are too long
9378
layer3/data/raw_small_data.json
Normal file
9378
layer3/data/raw_small_data.json
Normal file
File diff suppressed because one or more lines are too long
547824
layer3/paper-mat.json
547824
layer3/paper-mat.json
File diff suppressed because it is too large
Load Diff
58
layer3/src/getdownloadedjson.py
Normal file
58
layer3/src/getdownloadedjson.py
Normal file
@@ -0,0 +1,58 @@
|
||||
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
|
||||
# Collect the names of all sub-folders found under the markdown output
# directory (by default /home/ubuntu/50T/LYT/MatBench/layer3/articles/mds).
def get_sub_folder_name_set(path='/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'):
    """Return the names of the directories located directly under *path*.

    Args:
        path: Directory to scan. Defaults to the location that used to be
            hard-coded, so existing zero-argument calls behave as before.

    Returns:
        A set of directory names (names only, not full paths). Plain files
        sitting next to the sub-folders are ignored, because only directories
        can hold a per-article markdown file.

    Raises:
        OSError: Propagated from ``os.listdir`` when *path* does not exist or
            is not a directory.
    """
    return {
        entry
        for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    }
|
||||
|
||||
def read_markdown_file(file_path):
    """Return the entire text of the file at *file_path*, decoded as UTF-8."""
    with open(file_path, mode='r', encoding='utf-8') as md_file:
        return md_file.read()
|
||||
|
||||
def get_downloaded_json(sub_folder_name_set, abs_path='/home/ubuntu/50T/LYT/MatBench/layer3/articles',
                        json_path='/home/ubuntu/50T/LYT/MatBench/layer3/data/top_cited_paper_mat_nooverview.json'):
    """Build the records of papers whose markdown folder has been downloaded.

    Every record of the metadata JSON is matched against
    *sub_folder_name_set* by its DOI with ``/`` replaced by ``_``. Matching
    records are copied into a new dict and extended with the markdown text
    read from ``<abs_path>/mds/<doi>/<doi>.md``.

    Args:
        sub_folder_name_set: Collection of folder names (sanitized DOIs) that
            exist on disk; only membership tests are performed on it.
        abs_path: Root directory that contains the ``mds`` folder.
        json_path: Metadata JSON file to read. Defaults to the location that
            used to be hard-coded, so existing calls behave as before.

    Returns:
        A list of dicts with the keys ``id``, ``DOI``, ``DOI Link``,
        ``Relative Dir Path``, the copied bibliographic fields, and
        ``Markdown``, in that insertion order.

    Raises:
        KeyError: If a matching record lacks one of the copied fields.
        OSError: If the metadata file or a markdown file cannot be opened.
    """
    # Bibliographic fields copied verbatim after 'Relative Dir Path';
    # the order here is the order they appear in the output records.
    copied_fields = (
        'Article Title',
        'Authors',
        'Source Title',
        'Abstract',
        'Times Cited, WoS Core',
        'Times Cited, All Databases',
        'Publication Year',
        'Research Areas',
        'UT (Unique WOS ID)',
    )

    # Explicit encoding: the records contain non-ASCII text and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    downloaded_json = []
    for item in data:
        raw_doi = item.get('DOI')
        if not raw_doi:
            # A record without a DOI cannot correspond to any folder.
            continue

        # Folder names use '_' because '/' is not allowed in a file name.
        doi = raw_doi.replace('/', '_')
        if doi not in sub_folder_name_set:
            continue

        new_item = {
            'id': doi,
            'DOI': raw_doi,
            'DOI Link': item['DOI Link'],
            'Relative Dir Path': f'mds/{doi}',
        }
        for field in copied_fields:
            new_item[field] = item[field]
        new_item['Markdown'] = read_markdown_file(os.path.join(abs_path, 'mds', doi, f'{doi}.md'))
        downloaded_json.append(new_item)
    return downloaded_json
|
||||
|
||||
def write_json_to_file(data, file_path):
    """Write *data* to *file_path* as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are kept
    as-is instead of being escaped. An existing file is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, ensure_ascii=False, indent=4)
|
||||
|
||||
# Script body: find the folder names on disk, build the matching records,
# and dump them to raw_small_data.json. The two prints report how many
# folder names were found and how many records were produced.
sub_folder_name_set = get_sub_folder_name_set()
print(len(sub_folder_name_set))

downloaded_json = get_downloaded_json(sub_folder_name_set)
print(len(downloaded_json))

write_json_to_file(
    downloaded_json,
    '/home/ubuntu/50T/LYT/MatBench/layer3/data/raw_small_data.json',
)
|
||||
56
layer3/src/rename.py
Normal file
56
layer3/src/rename.py
Normal file
@@ -0,0 +1,56 @@
|
||||
import os
|
||||
import json
|
||||
import glob
|
||||
|
||||
# Load the paper metadata whose DOIs provide the new file/directory names.
# NOTE(review): the repository listing shows this file under layer3/data/;
# confirm that the path below is still the right location.
with open('/home/ubuntu/50T/LYT/MatBench/layer3/downloaded_failed_papers.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Ordered list of unique, filesystem-safe DOI names.
# The duplicate check must compare sanitized names: `dois` stores DOIs with
# '/' replaced by '_', so testing the raw DOI against it never matched and
# duplicates were silently appended.
# NOTE(review): later steps pair these names with numbered files by position,
# so dropping a real duplicate shifts every following index — confirm the
# numbered files were produced from the de-duplicated order.
dois = []
seen_dois = set()  # O(1) membership test instead of scanning the list
for item in data:
    if 'DOI' not in item:
        continue
    doi = item['DOI']
    safe_doi = doi.replace('/', '_')  # '/' is not allowed in a file name
    if safe_doi in seen_dois:
        print(f"Duplicate DOI found: {doi}")
        continue
    seen_dois.add(safe_doi)
    dois.append(safe_doi)
|
||||
|
||||
# Collect the PDF files named 1.pdf through 170.pdf (inclusive) from
# /home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs, in numeric order.
# Numbers with no matching file are simply skipped by glob.
pdf_files = []
for i in range(1, 171):  # range() excludes its stop value; 171 keeps 170.pdf
    pattern = f'/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs/{i}.pdf'
    pdf_files.extend(glob.glob(pattern))
|
||||
|
||||
|
||||
# assert len(pdf_files) == len(dois), f"Number of PDF files ({len(pdf_files)}) does not match number of DOIs ({len(dois)})"
|
||||
# # 对每个PDF文件进行重命名,从dois列表中获取对应的DOI作为新文件名
|
||||
# for i, pdf_file in enumerate(pdf_files):
|
||||
# if i < len(dois):
|
||||
# new_name = dois[i] + '.pdf'
|
||||
# new_path = os.path.join('/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs', new_name)
|
||||
# os.rename(pdf_file, new_path)
|
||||
# print(f'Renamed {pdf_file} to {new_path}')
|
||||
# else:
|
||||
# print(f'No DOI available for file: {pdf_file}')
|
||||
|
||||
# Rename the numbered markdown directories 16 .. 109 under
# /home/ubuntu/50T/LYT/MatBench/layer3/articles/mds to their DOI-based names.
# NOTE(review): the original comment mentions the range [1-170], but only
# 16..109 are processed here — presumably a partial batch; confirm.
# NOTE(review): directory number i is paired with dois[i] (0-based), whereas
# the commented-out PDF logic pairs 1.pdf with dois[0]. Confirm this is not
# an off-by-one before running.
md_root = '/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'
md_dirs = [f'{md_root}/{number}' for number in range(16, 110)]
print(len(md_dirs))
print()

dois = dois[16:110]  # keep only the DOIs that correspond to the directories above

# Explicit raise instead of `assert` so the check still runs under `python -O`.
if len(md_dirs) != len(dois):
    raise AssertionError(
        f"Number of MD directories ({len(md_dirs)}) does not match number of DOIs ({len(dois)})"
    )

# Rename each directory to the DOI at the same position in the list.
for md_dir, new_name in zip(md_dirs, dois):
    new_path = os.path.join(md_root, new_name)
    if os.path.exists(new_path):
        # Already renamed (e.g. on a re-run) or a name clash: leave both alone.
        print(f'Directory already exists: {new_path}')
        continue
    if not os.path.exists(md_dir):
        # Without this check os.rename raises and aborts the rest of the batch.
        print(f'Source directory not found, skipping: {md_dir}')
        continue
    os.rename(md_dir, new_path)
    print(f'Renamed {md_dir} to {new_path}')
|
||||
File diff suppressed because it is too large
Load Diff
Binary file not shown.
Reference in New Issue
Block a user