Compare commits

..

2 Commits

Author SHA1 Message Date
lzy
1786688911 全部的题目 2025-06-03 11:19:36 +08:00
lzy
e4c2cfde34 分离出全部的难题 2025-06-03 10:43:44 +08:00
9 changed files with 85563 additions and 46179 deletions

View File

@@ -1,126 +0,0 @@
import json
from openai import OpenAI
import time
import os
from tqdm import tqdm
import pandas as pd
import re
# SECURITY: the API key used to be hardcoded here.  It is now read from the
# environment so that no credential lives in version control; any key that
# was previously committed should be treated as leaked and rotated.
# When OPENAI_API_KEY is unset this is None (the OpenAI client is expected
# to reject a missing key at construction time rather than send a bad one).
API_KEY = os.environ.get("OPENAI_API_KEY")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Labels accepted from the model; any other label triggers a retry.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']
def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)
def classify_question(idx, len, question, options):
    """Ask the chat model which subject a multiple-choice question belongs to.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        len: Total number of questions; only used for progress output.  The
            name shadows the builtin but is kept for caller compatibility.
        question: Question text inserted into the prompt.
        options: Pre-formatted answer options inserted into the prompt.

    Returns:
        A member of ``CATEGORIES`` on success, or ``'Error'`` when every
        attempt either raised or produced a label outside ``CATEGORIES``.
    """
    # The backslashes are doubled so the prompt still contains the literal
    # backslash-style closing tag that string_extraction() searches for,
    # without relying on the invalid "\C" escape sequence.
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [\\CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[\\CATEGORY]
- [CATEGORY]Chemistry[\\CATEGORY]
- [CATEGORY]Biological[\\CATEGORY]
- [CATEGORY]Unknown[\\CATEGORY]
Question: {question}\n
Options: {options}\n
"""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # Retry loop: an invalid label is retried immediately, an exception is
    # retried after a short pause.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
        except Exception as e:
            # Deliberately broad: any failure of a single request must not
            # abort the whole classification run.
            print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # retries exhausted
            # Wait before the next attempt.
            time.sleep(2)
    return 'Error'
def string_extraction(idx, len, classification):
    r"""Extract the label between ``[CATEGORY]`` and ``[\CATEGORY]`` tags.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        len: Total number of questions; only used for progress output.
        classification: Raw model reply to search.

    Returns:
        The text between the first pair of tags with surrounding whitespace
        removed, or ``'Unknown'`` when the reply contains no such pair.
    """
    pattern = r'\[CATEGORY\](.*?)\[\\CATEGORY\]'
    match = re.search(pattern, classification)
    # Resolve the fallback before printing: ``match`` is None when the reply
    # carries no tags, and calling ``match.group`` would raise.
    extracted = match.group(1).strip() if match else 'Unknown'
    print(f"{idx + 1}/{len}: {extracted}")
    return extracted
def main():
    """Classify every question of the merged dataset and save the result.

    Loads the input JSON, asks ``classify_question`` for a label per item,
    stores it under ``'subject_category'`` on a shallow copy of the item,
    prints the running distribution every 100 items, writes all annotated
    items to the output JSON and finally prints the overall distribution.
    """
    input_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'
    output_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json'

    data = load_data(input_path)
    data_length = len(data)
    results = []
    # Running tally, seeded with the known labels so they print even at zero.
    # Updated per item instead of recounting all results every 100 items.
    distribution = dict.fromkeys(CATEGORIES, 0)

    for i, item in enumerate(tqdm(data, desc="Classifying questions")):
        question = item.get('question', '')
        text = item['choices']['text']
        label = item['choices']['label']
        formatted_choices = " ".join(f"({lbl}) {txt}" for lbl, txt in zip(label, text))
        classification = classify_question(i, data_length, question, formatted_choices)

        item_with_classification = item.copy()
        item_with_classification['subject_category'] = classification
        results.append(item_with_classification)

        # classify_question may return 'Error', which is not a seeded key.
        distribution[classification] = distribution.get(classification, 0) + 1

        # Report the intermediate distribution every 100 questions.
        if (i + 1) % 100 == 0:
            print(f"Processed {i+1} questions. Current distribution:")
            for category, count in distribution.items():
                print(f"{category}: {count}")

        # Crude API rate limiting.
        time.sleep(0.5)

    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    df = pd.DataFrame(results)
    category_counts = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts)
    # Report the file that was actually written.
    print(f"\nTask completed. Results saved to '{output_path}'")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,157 +0,0 @@
import json
from openai import OpenAI
import time
import os
from tqdm import tqdm
import pandas as pd
import re
import concurrent.futures
import threading
# SECURITY: the API key used to be hardcoded here.  It is now read from the
# environment so that no credential lives in version control; any key that
# was previously committed should be treated as leaked and rotated.
# When OPENAI_API_KEY is unset this is None (the OpenAI client is expected
# to reject a missing key at construction time rather than send a bad one).
API_KEY = os.environ.get("OPENAI_API_KEY")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Labels accepted from the model; any other label triggers a retry.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']
# Thread-local storage for OpenAI clients
local = threading.local()
# Locks: write_lock guards the shared counters below, progress_lock
# serialises console output.
write_lock = threading.Lock()
progress_lock = threading.Lock()
processed_count = 0
# One zero-initialised counter per accepted label, in CATEGORIES order.
category_counts = dict.fromkeys(CATEGORIES, 0)
def load_data(file_path):
    """Return the parsed content of the UTF-8 JSON file at ``file_path``."""
    with open(file_path, 'r', encoding='utf-8') as source:
        raw_text = source.read()
    return json.loads(raw_text)
def get_client():
    """Return the OpenAI client stored on the thread-local ``local`` object.

    The client is created on first access from a given thread and reused by
    later calls on that thread.
    """
    try:
        return local.client
    except AttributeError:
        # First call on this thread: no client has been stored yet.
        local.client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        return local.client
def classify_question(idx, total_len, question, options):
    """Ask the chat model which subject a multiple-choice question belongs to.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        total_len: Total number of questions; only used for progress output.
        question: Question text inserted into the prompt.
        options: Pre-formatted answer options inserted into the prompt.

    Returns:
        A member of ``CATEGORIES`` on success, or ``'Error'`` when every
        attempt either raised or produced a label outside ``CATEGORIES``.
    """
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [/CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[/CATEGORY]
- [CATEGORY]Chemistry[/CATEGORY]
- [CATEGORY]Biological[/CATEGORY]
- [CATEGORY]Unknown[/CATEGORY]
Question: {question}\n
Options: {options}\n
"""
    chat_messages = [
        {"role": "system", "content": "You are a helpful educational assistant."},
        {"role": "user", "content": prompt},
    ]
    api = get_client()

    # Retry loop: an invalid label is retried immediately, an exception is
    # retried after a two second pause.
    max_retries = 3
    final_attempt = max_retries - 1
    attempt = 0
    while attempt < max_retries:
        try:
            reply = api.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=chat_messages,
                temperature=0.3,
                stream=False,
            )
            reply_text = reply.choices[0].message.content.strip()
            candidate = string_extraction(idx, total_len, reply_text)
            if candidate in CATEGORIES:
                return candidate
            with progress_lock:
                print(f"Invalid category '{candidate}' returned. Retrying. {attempt + 1}/{max_retries}")
        except Exception as e:
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == final_attempt:
                return 'Error'  # retries exhausted
            time.sleep(2)
        attempt += 1
    return 'Error'
def string_extraction(idx, total_len, classification):
    """Extract the label between ``[CATEGORY]`` and ``[/CATEGORY]`` tags.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        total_len: Total number of questions; only used for progress output.
        classification: Raw model reply to search.

    Returns:
        The text between the first pair of tags with surrounding whitespace
        removed, or ``'Unknown'`` when the reply contains no such pair.
    """
    pattern = r'\[CATEGORY\](.*?)\[\/CATEGORY\]'
    match = re.search(pattern, classification)
    # Strip padding inside the tags so a reply such as
    # "[CATEGORY] Physics [/CATEGORY]" still validates against CATEGORIES
    # instead of being rejected and retried.
    extracted = match.group(1).strip() if match else 'Unknown'
    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")
    return extracted
def process_item(args):
    """Classify one dataset item and update the shared progress counters.

    Args:
        args: Tuple ``(idx, total_len, item)`` where ``item`` is a dict with a
            ``'choices'`` entry holding parallel ``'text'`` and ``'label'``
            sequences and, optionally, a ``'question'`` string.

    Returns:
        A shallow copy of ``item`` with the label stored under
        ``'subject_category'``.
    """
    global processed_count
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    formatted_choices = " ".join(f"({lbl}) {txt}" for lbl, txt in zip(label, text))
    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the classification result.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update the shared counters.
    with write_lock:
        processed_count += 1
        # classify_question may return 'Error', which is not a pre-seeded key;
        # a plain ``+= 1`` would raise KeyError and abort the whole run.
        category_counts[classification] = category_counts.get(classification, 0) + 1
        # Print the intermediate distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Short pause as crude per-thread API rate limiting.
    time.sleep(0.1)
    return item_with_classification
def main():
    """Classify the merged dataset with a thread pool and save the result.

    Loads the input JSON, maps ``process_item`` over all items on ten worker
    threads (result order follows input order), writes the annotated items
    to the output JSON and prints the final label distribution.
    """
    source_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'
    target_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json'

    data = load_data(source_path)
    data_length = len(data)

    # One (index, total, item) tuple per work unit.
    args_list = [(position, data_length, entry) for position, entry in enumerate(data)]

    # Thread count; tune to the API limits and the host.
    num_threads = 10
    print(f"Starting classification with {num_threads} threads...")

    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        mapped = executor.map(process_item, args_list)
        # tqdm wraps the lazy result iterator to display progress.
        progress = tqdm(mapped, total=data_length, desc="Classifying questions")
        results = list(progress)

    with open(target_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)
    print(f"\nTask completed. Results saved to '{target_path}'")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

View File

@@ -1,169 +0,0 @@
import json
from openai import OpenAI
import time
import os
from tqdm import tqdm
import pandas as pd
import re
import concurrent.futures
import threading
# SECURITY: the API key used to be hardcoded here.  It is now read from the
# environment so that no credential lives in version control; any key that
# was previously committed should be treated as leaked and rotated.
# When OPENAI_API_KEY is unset this is None (the OpenAI client is expected
# to reject a missing key at construction time rather than send a bad one).
API_KEY = os.environ.get("OPENAI_API_KEY")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Labels accepted from the model; any other label triggers a retry.
CATEGORIES = [
    'Atomic Structure and Interatomic Bonding',
    'The Structure of Solids',
    'Imperfections in Solids',
    'Mechanical Properties of Metals',
    'Dislocations and Strengthening Mechanisms',
    'Failure',
    'Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties',
    'Applications and Processing of Materials',
    'Corrosion and Degradation of Materials',
    'Functional Properties of Materials',
    'Unknown',
]
# Thread-local storage for OpenAI clients
local = threading.local()
# Locks: write_lock guards the shared counters below, progress_lock
# serialises console output.
write_lock = threading.Lock()
progress_lock = threading.Lock()
processed_count = 0
# One zero-initialised counter per accepted label, in CATEGORIES order.
# Derived from CATEGORIES so the two can no longer drift apart.
category_counts = dict.fromkeys(CATEGORIES, 0)
def load_data(file_path):
    """Open ``file_path`` as UTF-8 text, parse it as JSON and return the result."""
    stream = open(file_path, 'r', encoding='utf-8')
    try:
        return json.load(stream)
    finally:
        stream.close()
def get_client():
    """Return the OpenAI client kept on the thread-local ``local`` object.

    A client is built the first time a thread asks for one and is handed
    back unchanged on subsequent calls from that thread.
    """
    cached = getattr(local, 'client', None)
    if cached is None and not hasattr(local, 'client'):
        cached = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        local.client = cached
    return cached
def classify_question(idx, total_len, question, answer):
    """Ask the chat model which Materials Science chapter a question belongs to.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        total_len: Total number of questions; only used for progress output.
        question: Question text inserted into the prompt.
        answer: Text inserted after ``ANSWER:`` in the prompt.  The caller in
            this file passes the formatted answer options here.

    Returns:
        A member of ``CATEGORIES`` on success, or ``'Error'`` when every
        attempt either raised or produced a label outside ``CATEGORIES``.
    """
    # The reply is validated against the bare names in CATEGORIES, so the
    # prompt must ask for the name only; asking for "number and name" makes
    # a compliant reply fail validation.
    prompt = f"""
Given a question and its answer from the field of Materials Science fundamentals, identify which chapter or category of Materials Science the question belongs to. Choose from the following 10 categories:
-- Atomic Structure and Interatomic Bonding
-- The Structure of Solids
-- Imperfections in Solids
-- Mechanical Properties of Metals
-- Dislocations and Strengthening Mechanisms
-- Failure
-- Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties
-- Applications and Processing of Materials
-- Corrosion and Degradation of Materials
-- Functional Properties of Materials
QUESTIONS:{question}\n
ANSWER:{answer}\n
Provide your response by enclosing the exact category name, without any number or extra text, within [CATEGORY] and [/CATEGORY] tags. For example: [CATEGORY]Atomic Structure and Interatomic Bonding[/CATEGORY]
Analyze both the question and answer carefully to determine the most appropriate category based on the question and answer.
"""
    client = get_client()
    # Retry loop: an invalid label is retried immediately, an exception is
    # retried after a short pause.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            with progress_lock:
                print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
        except Exception as e:
            # Deliberately broad: any failure of a single request must not
            # abort the whole classification run.
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # retries exhausted
            # Wait before the next attempt.
            time.sleep(2)
    return 'Error'
def string_extraction(idx, total_len, classification):
    """Extract the label between ``[CATEGORY]`` and ``[/CATEGORY]`` tags.

    Args:
        idx: Zero-based position of the question; only used for progress output.
        total_len: Total number of questions; only used for progress output.
        classification: Raw model reply to search.

    Returns:
        The text between the first pair of tags with surrounding whitespace
        removed, or ``'Unknown'`` when the reply contains no such pair.
    """
    pattern = r'\[CATEGORY\](.*?)\[\/CATEGORY\]'
    match = re.search(pattern, classification)
    # Strip padding inside the tags so a reply such as
    # "[CATEGORY] Failure [/CATEGORY]" still validates against CATEGORIES
    # instead of being rejected and retried.
    extracted = match.group(1).strip() if match else 'Unknown'
    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")
    return extracted
def process_item(args):
    """Classify one dataset item and update the shared progress counters.

    Args:
        args: Tuple ``(idx, total_len, item)`` where ``item`` is a dict with a
            ``'choices'`` entry holding parallel ``'text'`` and ``'label'``
            sequences and, optionally, a ``'question'`` string.

    Returns:
        A shallow copy of ``item`` with the label stored under
        ``'subject_category'``.
    """
    global processed_count
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    formatted_choices = " ".join(f"({lbl}) {txt}" for lbl, txt in zip(label, text))
    # The formatted options are passed as the ``answer`` argument.
    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the classification result.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update the shared counters.
    with write_lock:
        processed_count += 1
        # classify_question may return 'Error', which is not a pre-seeded key;
        # a plain ``+= 1`` would raise KeyError and abort the whole run.
        category_counts[classification] = category_counts.get(classification, 0) + 1
        # Print the intermediate distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Short pause as crude per-thread API rate limiting.
    time.sleep(0.1)
    return item_with_classification
def main():
    """Classify the filtered choice questions with a thread pool and save them.

    Loads the input JSON, maps ``process_item`` over all items on ten worker
    threads (result order follows input order), writes the annotated items
    to the output JSON and prints the final label distribution.
    """
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json'
    output_file = '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_classified.json'

    data = load_data(file_path)
    data_length = len(data)

    # Build one (index, total, item) work unit per question.
    args_list = []
    for position, entry in enumerate(data):
        args_list.append((position, data_length, entry))

    # Thread count; tune to the API limits and the host.
    num_threads = 10
    print(f"Starting classification with {num_threads} threads...")

    results = []
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # tqdm wraps the lazy result iterator to display progress.
        for annotated in tqdm(executor.map(process_item, args_list),
                              total=data_length, desc="Classifying questions"):
            results.append(annotated)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)
    print(f"\nTask completed. Results saved to '{output_file}'")


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -513,12 +513,12 @@ def main():
"""主函数"""
# 文件路径配置
INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions_with_sampling.json"
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered_only_hard.json"
# 难度选择比例配置
SELECTION_RATIOS = {
"hard_early_stop": 1.0, # 困难题选择10%
"easy_all_correct": 0.35, # 简单题选择3.5%
"easy_all_correct": 0.0, # 简单题选择3.5%
"mixed": 0.0, # 混合题选择0%
"unknown": 0.0 # 未知难度不选择
}