Merge pull request 'lyt1' (#1) from lyt1 into main

Reviewed-on: #1
2025-06-12 20:46:58 +08:00
parent 39dc1e9f06 38dce859dd
commit 18737fe2f4
12 changed files with 66321 additions and 602292 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,7 @@
 MatBench/layer3/articles_fsy
 *.zip
 *.temp
-*.pyc
+*.pyc
+*.pdf
+*.md
+layer3/articles/*
--- a/layer2/PGEE/code/stepz_final_format_convert.py
+++ b/layer2/PGEE/code/stepz_final_format_convert.py
@@ -518,7 +518,7 @@ def main():
    # 难度选择比例配置
    SELECTION_RATIOS = {
        "hard_early_stop": 1.0,     # 困难题选择10%
-        "easy_all_correct": 0.0,  # 简单题选择3.5%
+        "easy_all_correct": 0.0,    # 简单题选择3.5%
        "mixed": 0.0,               # 混合题选择0%
        "unknown": 0.0              # 未知难度不选择
    }
--- a/layer3/data/downloaded_failed_papers.json
+++ b/layer3/data/downloaded_failed_papers.json
--- a/layer3/data/paper.xls
+++ b/layer3/data/paper.xls
--- a/layer3/data/raw_data.json
+++ b/layer3/data/raw_data.json
--- a/layer3/data/raw_small_data.json
+++ b/layer3/data/raw_small_data.json
--- a/layer3/data/top_cited_paper_mat_nooverview.json
+++ b/layer3/data/top_cited_paper_mat_nooverview.json
--- a/layer3/paper-mat.json
+++ b/layer3/paper-mat.json
--- a/layer3/src/getdownloadedjson.py
+++ b/layer3/src/getdownloadedjson.py
@@ -0,0 +1,58 @@
+
+import os
+import json
+import re
+
+def get_sub_folder_name_set(path='/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds'):
+    """Return the names of the sub-directories directly under ``path``.
+
+    Only directories are reported. Plain files that happen to sit in
+    ``path`` are ignored, so every returned name really is a folder.
+
+    Args:
+        path: Directory to scan. Defaults to the project's ``articles/mds``
+            folder, so existing zero-argument calls keep working.
+
+    Returns:
+        A set of sub-directory names (names only, not full paths).
+
+    Raises:
+        FileNotFoundError: If ``path`` does not exist.
+    """
+    # scandir yields entries that already know whether they are directories,
+    # and the context manager releases the directory handle promptly.
+    with os.scandir(path) as entries:
+        return {entry.name for entry in entries if entry.is_dir()}
+
+def read_markdown_file(file_path):
+    """Return the full text of the file at ``file_path``, decoded as UTF-8."""
+    with open(file_path, 'r', encoding='utf-8') as md_file:
+        return md_file.read()
+
+def get_downloaded_json(sub_folder_name_set, abs_path='/home/ubuntu/50T/LYT/MatBench/layer3/articles'):
+    """Build one record per paper whose markdown folder is available.
+
+    Loads the paper metadata list from
+    ``layer3/data/top_cited_paper_mat_nooverview.json`` and keeps the entries
+    whose DOI (with ``/`` replaced by ``_``) is in ``sub_folder_name_set``.
+
+    Args:
+        sub_folder_name_set: Collection of folder names to match against;
+            membership is tested with ``in``.
+        abs_path: Root of the articles tree. The markdown of a matched paper
+            is read from ``<abs_path>/mds/<id>/<id>.md``.
+
+    Returns:
+        A list of dicts in metadata-file order. Each dict holds ``id``,
+        ``DOI``, ``DOI Link``, ``Relative Dir Path``, the copied
+        bibliographic fields, and ``Markdown`` (the file's text).
+
+    Raises:
+        KeyError: If any record lacks ``DOI``, or a matched record lacks one
+            of the copied fields.
+        FileNotFoundError: If the metadata file or a matched paper's
+            markdown file is missing.
+    """
+    # Fields copied unchanged from the source record, in output order.
+    copied_fields = (
+        'Article Title',
+        'Authors',
+        'Source Title',
+        'Abstract',
+        'Times Cited, WoS Core',
+        'Times Cited, All Databases',
+        'Publication Year',
+        'Research Areas',
+        'UT (Unique WOS ID)',
+    )
+    path = '/home/ubuntu/50T/LYT/MatBench/layer3/data/top_cited_paper_mat_nooverview.json'
+    # Decode explicitly as UTF-8 instead of relying on the platform default,
+    # matching how the rest of this module reads and writes text.
+    with open(path, 'r', encoding='utf-8') as f:
+        data = json.load(f)
+
+    downloaded_json = []
+    for item in data:
+        # Folder names use '_' where the DOI has '/'.
+        doi = item['DOI'].replace('/', '_')
+        if doi not in sub_folder_name_set:
+            continue
+        new_item = {
+            'id': doi,
+            'DOI': item['DOI'],
+            'DOI Link': item['DOI Link'],
+            'Relative Dir Path': f'mds/{doi}',
+        }
+        new_item.update((key, item[key]) for key in copied_fields)
+        new_item['Markdown'] = read_markdown_file(os.path.join(abs_path, 'mds', doi, f'{doi}.md'))
+        downloaded_json.append(new_item)
+    return downloaded_json
+
+def write_json_to_file(data, file_path):
+    """Serialize ``data`` as JSON into ``file_path``, replacing any existing file.
+
+    The output is UTF-8 text indented by four spaces; non-ASCII characters
+    are written as-is rather than as ``\\uXXXX`` escapes.
+    """
+    with open(file_path, 'w', encoding='utf-8') as out_file:
+        json.dump(data, out_file, ensure_ascii=False, indent=4)
+
+# Script driver. These statements sit at module level, so they run whenever
+# the module is executed or imported.
+# Step 1: collect the names returned by get_sub_folder_name_set() and print how many there are.
+sub_folder_name_set = get_sub_folder_name_set()
+print(len(sub_folder_name_set))
+# Step 2: build the records for the papers matching those names and print how many matched.
+downloaded_json = get_downloaded_json(sub_folder_name_set)
+print(len(downloaded_json))
+# Step 3: write the matched records to raw_small_data.json.
+write_json_to_file(downloaded_json, '/home/ubuntu/50T/LYT/MatBench/layer3/data/raw_small_data.json')
--- a/layer3/src/rename.py
+++ b/layer3/src/rename.py
@@ -0,0 +1,56 @@
+import os
+import json
+import glob
+
+# Load the paper records; a record may or may not carry a 'DOI' key.
+# NOTE(review): this commit also touches layer3/data/downloaded_failed_papers.json —
+# confirm that the path below (without "data/") is the intended input.
+with open('/home/ubuntu/50T/LYT/MatBench/layer3/downloaded_failed_papers.json', 'r', encoding='utf-8') as f:
+    data = json.load(f)
+
+# Collect every distinct DOI once, in file order, in its file-name-safe form.
+# Duplicates are tracked by the RAW DOI: the list itself stores the converted
+# form ('/' -> '_'), so testing the raw value against the list never matched
+# and duplicates were silently appended.
+# NOTE(review): positions in `dois` are paired with numbered folders further
+# down; skipping a duplicate shifts every later position by one.
+seen_dois = set()
+dois = []
+for item in data:
+    if 'DOI' not in item:
+        continue
+    doi = item['DOI']
+    if doi in seen_dois:
+        print(f"Duplicate DOI found: {doi}")
+        continue
+    seen_dois.add(doi)
+    dois.append(doi.replace('/', '_'))  # replace slashes, which are not valid in file names
+
+# Collect the PDFs in articles/pdfs that are named <n>.pdf for n = 1..169.
+# range() excludes its upper bound, so 170.pdf is not looked up.
+# Numbers without a matching file contribute nothing to the list.
+pdf_files = [
+    found
+    for number in range(1, 170)
+    for found in glob.glob(f'/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs/{number}.pdf')
+]
+
+
+# assert len(pdf_files) == len(dois), f"Number of PDF files ({len(pdf_files)}) does not match number of DOIs ({len(dois)})"
+# # Rename each PDF file, using the DOI at the same index in the dois list as the new file name
+# for i, pdf_file in enumerate(pdf_files):
+#     if i < len(dois):
+#         new_name = dois[i] + '.pdf'
+#         new_path = os.path.join('/home/ubuntu/50T/LYT/MatBench/layer3/articles/pdfs', new_name)
+#         os.rename(pdf_file, new_path)
+#         print(f'Renamed {pdf_file} to {new_path}')
+#     else:
+#         print(f'No DOI available for file: {pdf_file}')
+
+# Build the paths of the numbered markdown folders articles/mds/16 .. articles/mds/109.
+# The paths are only constructed here; nothing checks that they exist.
+md_dirs = [f'/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds/{number}' for number in range(16, 110)]
+print(len(md_dirs))
+print()
+# Keep the DOIs at positions 16..109 so the list lines up one-to-one with md_dirs.
+# NOTE(review): folder <n> is paired with dois[n] (0-based) — confirm the folder
+# numbering is not 1-based, otherwise every pairing is off by one.
+dois = dois[16:110]
+assert len(md_dirs) == len(dois), f"Number of MD directories ({len(md_dirs)}) does not match number of DOIs ({len(dois)})"
+
+# Rename each numbered markdown folder to the DOI found at the same position in dois.
+for position, source_dir in enumerate(md_dirs):
+    # More folders than DOIs: report the folder and leave it untouched.
+    if position >= len(dois):
+        print(f'No DOI available for directory: {source_dir}')
+        continue
+    target_dir = os.path.join('/home/ubuntu/50T/LYT/MatBench/layer3/articles/mds', dois[position])
+    # Never overwrite a folder that already carries the DOI name.
+    if os.path.exists(target_dir):
+        print(f'Directory already exists: {target_dir}')
+        continue
+    os.rename(source_dir, target_dir)
+    print(f'Renamed {source_dir} to {target_dir}')
--- a/layer3/top_cited_papers_2015_2024.json
+++ b/layer3/top_cited_papers_2015_2024.json
--- a/results/20250602_1706/summary.xlsx
+++ b/results/20250602_1706/summary.xlsx