Compare commits
2 Commits
lyt
...
aade9e11cb
| Author | SHA1 | Date | |
|---|---|---|---|
| aade9e11cb | |||
| 0f781f5679 |
126
layer1/ALL-merge/classify.py
Normal file
126
layer1/ALL-merge/classify.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
# NOTE(security): the original file committed a live-looking API key here.
# That key must be treated as leaked and revoked; read it from the
# environment instead so secrets never land in version control.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Closed set of subject labels the classifier is allowed to return.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Read a UTF-8 JSON file and return the parsed object.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        Whatever ``json.load`` produces (typically a list of question dicts).
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        return json.load(fh)
|
||||
|
||||
def classify_question(idx, total_len, question, options):
    """Classify one question into a subject category via the chat API.

    Args:
        idx: Zero-based index of the question (progress output only).
        total_len: Total number of questions (progress output only).
            Renamed from ``len``, which shadowed the builtin; all calls in
            this file are positional, so the rename is caller-compatible.
        question: The question text.
        options: The formatted answer-options string.

    Returns:
        One of CATEGORIES on success, or 'Error' after max_retries failures.
    """
    # NOTE(review): the closing tag uses a backslash ([\CATEGORY]); unusual,
    # but string_extraction() matches the same backslash form, so the two
    # must only ever be changed together.
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [\CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[\CATEGORY]
- [CATEGORY]Chemistry[\CATEGORY]
- [CATEGORY]Biological[\CATEGORY]
- [CATEGORY]Unknown[\CATEGORY]
Question: {question}\n
Options: {options}\n

"""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # Retry loop: transient API failures and malformed replies both retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            # Model replied with something outside CATEGORIES; retry at once.
            print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
            continue  # deliberately skips the sleep below
        except Exception as e:
            print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
def string_extraction(idx, total_len, classification):
    """Extract the category name from a [CATEGORY]...[\CATEGORY] tagged reply.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total question count (progress output only). Renamed from
            ``len``, which shadowed the builtin.
        classification: Raw model reply text.

    Returns:
        The tagged category name, or 'Unknown' when no tag pair is found.
    """
    # The closing tag deliberately matches a literal backslash form
    # ([\CATEGORY]) because that is what classify_question() asks for.
    pattern = r'\[CATEGORY\](.*?)\[\\CATEGORY\]'
    match = re.search(pattern, classification)
    # Bug fix: the original called match.group(1) inside the progress print
    # before the None check, raising AttributeError on an untagged reply.
    extracted = match.group(1) if match else 'Unknown'
    print(f"{idx + 1}/{total_len}: {extracted}")
    return extracted
|
||||
|
||||
def main():
    """Classify every question in the merged dataset and save the results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'  # input JSON path
    data = load_data(file_path)
    data_length = len(data)
    # Accumulates each item plus its 'subject_category' field.
    results = []

    # Process each question sequentially.
    for i, item in enumerate(tqdm(data, desc="Classifying questions")):
        question = item.get('question', '')
        text = item['choices']['text']
        label = item['choices']['label']
        formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

        classification = classify_question(i, data_length, question, formatted_choices)
        # Attach the classification without mutating the source item.
        item_with_classification = item.copy()
        item_with_classification['subject_category'] = classification
        results.append(item_with_classification)

        # Every 100 questions, report the running category distribution.
        if (i + 1) % 100 == 0:
            categories = {'Physics': 0, 'Chemistry': 0, 'Biological': 0, 'Unknown': 0}
            # Renamed the loop variable: the original reused `item`, shadowing
            # the outer loop variable.
            for classified in results:
                # Bug fix: classify_question can return 'Error', which is not
                # a key of `categories`; count unexpected labels as 'Unknown'
                # instead of raising KeyError.
                cat = classified.get('subject_category', 'Unknown')
                categories[cat if cat in categories else 'Unknown'] += 1

            print(f"Processed {i+1} questions. Current distribution:")
            for category, count in categories.items():
                print(f"{category}: {count}")

        # Crude API rate limiting between requests.
        time.sleep(0.5)

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the results.
    df = pd.DataFrame(results)
    category_counts = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts)

    # Bug fix: the original message claimed 'classified_questions.json'
    # while the data was actually written to merged_classified.json.
    print("\nTask completed. Results saved to 'merged_classified.json'")


if __name__ == "__main__":
    main()
|
||||
3214
layer1/ALL-merge/classify_muti.log
Normal file
3214
layer1/ALL-merge/classify_muti.log
Normal file
File diff suppressed because it is too large
Load Diff
157
layer1/ALL-merge/classify_muti.py
Normal file
157
layer1/ALL-merge/classify_muti.py
Normal file
@@ -0,0 +1,157 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
import concurrent.futures
|
||||
import threading
|
||||
|
||||
# NOTE(security): a hard-coded API key was committed here originally; treat
# that key as leaked and revoke it. Read the key from the environment.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Closed set of subject labels the classifier is allowed to return.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']

# Thread-local storage for OpenAI clients (one client per worker thread).
local = threading.local()

# Locks for thread-safe counter updates and serialized printing.
write_lock = threading.Lock()
progress_lock = threading.Lock()
# Shared progress state, guarded by write_lock.
processed_count = 0
category_counts = {'Physics': 0, 'Chemistry': 0, 'Biological': 0, 'Unknown': 0}
|
||||
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Load and return the contents of a UTF-8 JSON file."""
    with open(file_path, encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed
|
||||
|
||||
def get_client():
    """Return the OpenAI client owned by the calling thread.

    A client is created lazily the first time each thread calls this, so
    worker threads never share a client instance.
    """
    try:
        return local.client
    except AttributeError:
        local.client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        return local.client
|
||||
|
||||
def classify_question(idx, total_len, question, options):
    """Classify one question via the chat API; safe to call from pool workers.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total number of questions (progress output only).
        question: Question text.
        options: Formatted answer-options string.

    Returns:
        A member of CATEGORIES on success, or 'Error' after all retries fail.
    """
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [/CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[/CATEGORY]
- [CATEGORY]Chemistry[/CATEGORY]
- [CATEGORY]Biological[/CATEGORY]
- [CATEGORY]Unknown[/CATEGORY]
Question: {question}\n
Options: {options}\n
"""

    # One OpenAI client per thread (see get_client).
    client = get_client()
    # Retry loop: API errors and out-of-vocabulary replies both trigger retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            else:
                # Model replied with something outside CATEGORIES; retry.
                with progress_lock:  # serialize printing across threads
                    print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
                continue  # NOTE: deliberately skips the sleep below
        except Exception as e:
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
|
||||
def string_extraction(idx, total_len, classification):
    """Pull the category name out of a [CATEGORY]...[/CATEGORY] tagged reply.

    Falls back to 'Unknown' when the reply carries no tag pair. Prints a
    progress line under progress_lock so threads do not interleave output.
    """
    found = re.search(r'\[CATEGORY\](.*?)\[\/CATEGORY\]', classification)
    if found:
        extracted = found.group(1)
    else:
        extracted = 'Unknown'

    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")

    return extracted
|
||||
|
||||
def process_item(args):
    """Worker body: classify one dataset item and return it with its label.

    Args:
        args: Tuple of (idx, total_len, item) as built in main().

    Returns:
        A shallow copy of *item* with a 'subject_category' field added.
    """
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the result without mutating the shared input item.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update shared counters under the lock.
    global processed_count
    with write_lock:
        processed_count += 1
        # Bug fix: classify_question can return 'Error', which is not a key
        # of category_counts; the original raised KeyError here.
        category_counts[classification] = category_counts.get(classification, 0) + 1

        # Print a running distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Light rate limiting; thread scheduling already spaces requests out.
    time.sleep(0.1)

    return item_with_classification
|
||||
|
||||
def main():
    """Load the dataset, classify all questions in parallel, save results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'
    data = load_data(file_path)
    data_length = len(data)

    results = []

    # One (index, total, item) tuple per question for process_item.
    args_list = [(i, data_length, item) for i, item in enumerate(data)]

    # Thread count; tune to the API rate limit and server capacity.
    num_threads = 10

    print(f"Starting classification with {num_threads} threads...")

    # Fan out over a thread pool; executor.map preserves input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # tqdm shows progress. NOTE: executor.map yields results, not future
        # objects, so `futures` actually holds the classified items.
        futures = list(tqdm(executor.map(process_item, args_list), total=data_length, desc="Classifying questions"))
        results = futures

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the final distribution.
    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)

    print("\nTask completed. Results saved to '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json'")


if __name__ == "__main__":
    main()
|
||||
2289
layer2/PGEE/code/classify_muti.log
Normal file
2289
layer2/PGEE/code/classify_muti.log
Normal file
File diff suppressed because it is too large
Load Diff
169
layer2/PGEE/code/classify_muti.py
Normal file
169
layer2/PGEE/code/classify_muti.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
import concurrent.futures
|
||||
import threading
|
||||
|
||||
# NOTE(security): a hard-coded API key was committed here originally; treat
# it as leaked and revoke it. The key is now read from the environment.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Chapter labels for materials-science fundamentals, plus 'Unknown'.
CATEGORIES = ['Atomic Structure and Interatomic Bonding', 'The Structure of Solids', 'Imperfections in Solids', 'Mechanical Properties of Metals', 'Dislocations and Strengthening Mechanisms', 'Failure', 'Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties', 'Applications and Processing of Materials', 'Corrosion and Degradation of Materials', 'Functional Properties of Materials', 'Unknown']

# Thread-local storage for OpenAI clients (one client per worker thread).
local = threading.local()

# Locks for thread-safe counter updates and serialized printing.
write_lock = threading.Lock()
progress_lock = threading.Lock()
# Shared progress state, guarded by write_lock.
processed_count = 0
# Derive the tally from CATEGORIES so the two can never drift apart
# (the original repeated the full key list by hand).
category_counts = {name: 0 for name in CATEGORIES}
|
||||
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Deserialize and return the JSON document at *file_path* (UTF-8)."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
|
||||
|
||||
def get_client():
    """Return the calling thread's OpenAI client, building it on demand.

    Each worker thread gets its own client instance via thread-local
    storage, so clients are never shared between threads.
    """
    client = getattr(local, 'client', None)
    if client is None:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        local.client = client
    return client
|
||||
|
||||
def classify_question(idx, total_len, question, answer):
    """Classify one materials-science question into a chapter category.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total number of questions (progress output only).
        question: Question text.
        answer: Text inserted after 'ANSWER:' in the prompt.
            NOTE(review): the caller (process_item) actually passes the
            formatted answer *options*, not the correct answer -- confirm
            whether that is intended.

    Returns:
        A member of CATEGORIES on success, or 'Error' after all retries fail.
    """
    prompt = f"""
Given a question and its answer from the field of Materials Science fundamentals, identify which chapter or category of Materials Science the question belongs to. Choose from the following 10 categories:

-- Atomic Structure and Interatomic Bonding
-- The Structure of Solids
-- Imperfections in Solids
-- Mechanical Properties of Metals
-- Dislocations and Strengthening Mechanisms
-- Failure
-- Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties
-- Applications and Processing of Materials
-- Corrosion and Degradation of Materials
-- Functional Properties of Materials

QUESTIONS:{question}\n
ANSWER:{answer}\n

Provide your response by enclosing the category number and name within [CATEGORY] and [/CATEGORY] tags. For example: [CATEGORY]Atomic Structure and Interatomic Bonding[/CATEGORY]

Analyze both the question and answer carefully to determine the most appropriate category based on the question and options.
"""

    # One OpenAI client per thread (see get_client).
    client = get_client()
    # Retry loop: API errors and out-of-vocabulary replies both retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            else:
                # Model replied with something outside CATEGORIES; retry.
                with progress_lock:  # serialize printing across threads
                    print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
                continue  # NOTE: deliberately skips the sleep below
        except Exception as e:
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
|
||||
def string_extraction(idx, total_len, classification):
    """Extract the chapter name from a [CATEGORY]...[/CATEGORY] reply.

    Returns 'Unknown' when the reply has no tag pair. Progress printing is
    serialized under progress_lock.
    """
    tag_re = re.compile(r'\[CATEGORY\](.*?)\[\/CATEGORY\]')
    hit = tag_re.search(classification)
    extracted = hit.group(1) if hit else 'Unknown'

    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")

    return extracted
|
||||
|
||||
def process_item(args):
    """Worker body: classify one item and return it with its chapter label.

    Args:
        args: Tuple of (idx, total_len, item) as built in main().

    Returns:
        A shallow copy of *item* with a 'subject_category' field added.
    """
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    # answer = item.get('correct_option','')
    # NOTE(review): the formatted options string below is passed as the
    # 'answer' argument of classify_question; the real answer above is
    # commented out -- confirm this is intended.
    formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the result without mutating the shared input item.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update shared counters under the lock.
    global processed_count
    with write_lock:
        processed_count += 1
        # Bug fix: classify_question can return 'Error', which is not a key
        # of category_counts; the original raised KeyError here.
        category_counts[classification] = category_counts.get(classification, 0) + 1

        # Print a running distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Light rate limiting; thread scheduling already spaces requests out.
    time.sleep(0.1)

    return item_with_classification
|
||||
|
||||
def main():
    """Load the dataset, classify all questions in parallel, save results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json'
    data = load_data(file_path)
    data_length = len(data)

    results = []

    # One (index, total, item) tuple per question for process_item.
    args_list = [(i, data_length, item) for i, item in enumerate(data)]

    # Thread count; tune to the API rate limit and server capacity.
    num_threads = 10

    print(f"Starting classification with {num_threads} threads...")

    # Fan out over a thread pool; executor.map preserves input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # tqdm shows progress. NOTE: executor.map yields results, not future
        # objects, so `futures` actually holds the classified items.
        futures = list(tqdm(executor.map(process_item, args_list), total=data_length, desc="Classifying questions"))
        results = futures

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the final distribution.
    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)

    print("\nTask completed. Results saved to '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_classified.json'")


if __name__ == "__main__":
    main()
|
||||
40222
layer2/PGEE/code/stepz_classified.json
Normal file
40222
layer2/PGEE/code/stepz_classified.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -513,12 +513,12 @@ def main():
|
||||
"""主函数"""
|
||||
# 文件路径配置
|
||||
INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions_with_sampling.json"
|
||||
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered_only_hard.json"
|
||||
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
|
||||
|
||||
# 难度选择比例配置
|
||||
SELECTION_RATIOS = {
|
||||
"hard_early_stop": 1.0, # 困难题选择10%
|
||||
"easy_all_correct": 0.0, # 简单题选择3.5%
|
||||
"easy_all_correct": 0.35, # 简单题选择3.5%
|
||||
"mixed": 0.0, # 混合题选择0%
|
||||
"unknown": 0.0 # 未知难度不选择
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user