Merge pull request 'lyt1' (#1) from lyt1 into main

Reviewed-on: #1
This commit is contained in:
2025-06-12 20:46:58 +08:00
12 changed files with 66321 additions and 602292 deletions

5
.gitignore vendored
View File

@@ -1,4 +1,7 @@
MatBench/layer3/articles_fsy
*.zip
*.temp
*.pyc
*.pyc
*.pdf
*.md
layer3/articles/*

View File

@@ -518,7 +518,7 @@ def main():
# 难度选择比例配置
SELECTION_RATIOS = {
"hard_early_stop": 1.0, # 困难题选择10%
"easy_all_correct": 0.0, # 简单题选择3.5%
"easy_all_correct": 0.0, # 简单题选择3.5%
"mixed": 0.0, # 混合题选择0%
"unknown": 0.0 # 未知难度不选择
}

File diff suppressed because it is too large Load Diff

44538
layer3/data/raw_data.json Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,58 @@
import os
import json
import re
def get_sub_folder_name_set(path='/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'):
    """Return the names of the sub-directories located directly under *path*.

    Args:
        path: Directory to scan. Defaults to the project's ``articles/mds``
            directory, so existing zero-argument calls keep working.

    Returns:
        A set with the name (not the full path) of every immediate
        sub-directory of *path*. Plain files are ignored.

    Raises:
        FileNotFoundError: If *path* does not exist.
    """
    # Only directories qualify: the caller later opens
    # ``<name>/<name>.md`` inside each returned entry, which cannot work
    # for a stray regular file.
    return {
        name
        for name in os.listdir(path)
        if os.path.isdir(os.path.join(path, name))
    }
def read_markdown_file(file_path):
    """Return the entire text of the file at *file_path*, decoded as UTF-8."""
    with open(file_path, encoding='utf-8') as md_file:
        return md_file.read()
def get_downloaded_json(
    sub_folder_name_set,
    abs_path='/home/ubuntu/50T/LYT/MatBench/layer3/articles',
    json_path='/home/ubuntu/50T/LYT/MatBench/layer3/data/top_cited_paper_mat_nooverview.json',
):
    """Build the records of every paper whose markdown folder is present.

    Each record in the JSON list at *json_path* is matched by its DOI (with
    ``/`` replaced by ``_``) against *sub_folder_name_set*. Matching records
    are copied into a new dict together with the article's markdown text.

    Args:
        sub_folder_name_set: Folder names (sanitised DOIs) that exist under
            ``<abs_path>/mds``.
        abs_path: Directory that contains the ``mds`` folder.
        json_path: Path of the JSON file holding the list of paper metadata
            records. Defaults to the path that used to be hard-coded here.

    Returns:
        A list of dicts, in source order, one per matched paper.

    Raises:
        KeyError: If a matched record lacks one of the copied metadata fields.
        FileNotFoundError: If *json_path* or a matched paper's
            ``mds/<doi>/<doi>.md`` file does not exist.
    """
    # Metadata fields copied verbatim after the identifying fields; the order
    # fixes the key order of the emitted JSON objects.
    copied_fields = (
        'Article Title',
        'Authors',
        'Source Title',
        'Abstract',
        'Times Cited, WoS Core',
        'Times Cited, All Databases',
        'Publication Year',
        'Research Areas',
        'UT (Unique WOS ID)',
    )

    # Explicit encoding: titles/abstracts may contain non-ASCII text and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    downloaded_json = []
    for item in data:
        raw_doi = item.get('DOI')
        if not isinstance(raw_doi, str):
            # A record without a usable DOI cannot correspond to any folder.
            continue
        # '/' is not allowed in a folder name, so folders use '_' instead.
        doi = raw_doi.replace('/', '_')
        if doi not in sub_folder_name_set:
            continue

        new_item = {
            'id': doi,
            'DOI': raw_doi,
            'DOI Link': item['DOI Link'],
            'Relative Dir Path': f'mds/{doi}',
        }
        for field in copied_fields:
            new_item[field] = item[field]
        new_item['Markdown'] = read_markdown_file(
            os.path.join(abs_path, 'mds', doi, f'{doi}.md')
        )
        downloaded_json.append(new_item)
    return downloaded_json
def write_json_to_file(data, file_path):
    """Write *data* to *file_path* as UTF-8 JSON.

    The output is indented by four spaces and non-ASCII characters are
    written as-is rather than as ``\\uXXXX`` escapes. An existing file at
    *file_path* is overwritten.
    """
    with open(file_path, mode='w', encoding='utf-8') as out_file:
        json.dump(data, out_file, ensure_ascii=False, indent=4)
# Script body: runs on execution (and on import, since there is no guard).
# Step 1: collect the folder names found under articles/mds and report how many.
sub_folder_name_set = get_sub_folder_name_set()
print(len(sub_folder_name_set))

# Step 2: build the records for the papers matching those folders and report
# how many were matched.
downloaded_json = get_downloaded_json(sub_folder_name_set)
print(len(downloaded_json))

# Step 3: persist the matched records.
write_json_to_file(
    downloaded_json,
    '/home/ubuntu/50T/LYT/MatBench/layer3/data/raw_small_data.json',
)

56
layer3/src/rename.py Normal file
View File

@@ -0,0 +1,56 @@
import os
import json
import glob
# Load the list of paper records; each record is expected to be a dict that
# may carry a 'DOI' key.
with open('/home/ubuntu/50T/LYT/MatBench/layer3/downloaded_failed_papers.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Collect filesystem-safe DOI names in record order, skipping repeats.
dois = []
for item in data:
    if 'DOI' in item:
        doi = item['DOI']
        safe_doi = doi.replace('/', '_')  # replace slashes to avoid file-name problems
        # Compare the sanitised form: the list stores sanitised names, so
        # testing the raw DOI could never match a DOI that contains '/'.
        # NOTE(review): dropping a real duplicate shifts the position of all
        # later entries; confirm the numbered folders follow the
        # de-duplicated order before running.
        if safe_doi not in dois:
            dois.append(safe_doi)
        else:
            print(f"Duplicate DOI found: {doi}")

# Collect the existing PDFs named 1.pdf .. 169.pdf in articles/pdfs.
# NOTE(review): the original comment described the range as [1-170], but
# range(1, 170) stops at 169 — confirm whether 170.pdf should be included.
# The list is only consumed by the disabled PDF rename step below.
pdf_files = []
for i in range(1, 170):
    pattern = f'/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs/{i}.pdf'
    pdf_files.extend(glob.glob(pattern))

# Disabled step, kept for reference: rename each PDF to its DOI.
# assert len(pdf_files) == len(dois), f"Number of PDF files ({len(pdf_files)}) does not match number of DOIs ({len(dois)})"
# for i, pdf_file in enumerate(pdf_files):
#     if i < len(dois):
#         new_name = dois[i] + '.pdf'
#         new_path = os.path.join('/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs', new_name)
#         os.rename(pdf_file, new_path)
#         print(f'Renamed {pdf_file} to {new_path}')
#     else:
#         print(f'No DOI available for file: {pdf_file}')

# Paths of the numbered markdown folders 16 .. 109 under articles/mds.
md_dirs = [
    f'/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds/{i}'
    for i in range(16, 110)
]
print(len(md_dirs))
print()

# Take the DOIs at list positions 16 .. 109 so that both lists have the same
# length and are paired positionally.
# NOTE(review): this pairs folder N with dois[N]; if the folders are numbered
# from 1 rather than 0 the slice would need to be [15:109] — confirm.
dois = dois[16:110]

# Raise explicitly instead of using ``assert``: assertions are stripped under
# ``python -O`` and the renames below must never run on mismatched lists.
if len(md_dirs) != len(dois):
    raise ValueError(
        f"Number of MD directories ({len(md_dirs)}) does not match number of DOIs ({len(dois)})"
    )

# Rename every numbered folder to its DOI-based name.
for md_dir, new_name in zip(md_dirs, dois):
    new_path = os.path.join('/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds', new_name)
    if os.path.exists(new_path):
        # Never overwrite an existing target (e.g. when the script is re-run).
        print(f'Directory already exists: {new_path}')
    elif not os.path.isdir(md_dir):
        # A missing source would make os.rename raise and abort the whole run.
        print(f'Source directory not found, skipping: {md_dir}')
    else:
        os.rename(md_dir, new_path)
        print(f'Renamed {md_dir} to {new_path}')

File diff suppressed because it is too large Load Diff

Binary file not shown.