Compare commits
2 Commits
lyt
...
aade9e11cb
| Author | SHA1 | Date | |
|---|---|---|---|
| aade9e11cb | |||
| 0f781f5679 |
126
layer1/ALL-merge/classify.py
Normal file
126
layer1/ALL-merge/classify.py
Normal file
@@ -0,0 +1,126 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
|
||||
# NOTE(security): the original file committed a live-looking API key here.
# That key must be treated as leaked and revoked; read it from the
# environment instead so secrets never land in version control.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Closed set of subject labels the classifier is allowed to return.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Read a UTF-8 JSON file and return the parsed object.

    Args:
        file_path: Path to a UTF-8 encoded JSON file.

    Returns:
        Whatever ``json.load`` produces (typically a list of question dicts).
    """
    with open(file_path, 'r', encoding='utf-8') as fh:
        return json.load(fh)
|
||||
|
||||
def classify_question(idx, total_len, question, options):
    """Classify one question into a subject category via the chat API.

    Args:
        idx: Zero-based index of the question (progress output only).
        total_len: Total number of questions (progress output only).
            Renamed from ``len``, which shadowed the builtin; all calls in
            this file are positional, so the rename is caller-compatible.
        question: The question text.
        options: The formatted answer-options string.

    Returns:
        One of CATEGORIES on success, or 'Error' after max_retries failures.
    """
    # NOTE(review): the closing tag uses a backslash ([\CATEGORY]); unusual,
    # but string_extraction() matches the same backslash form, so the two
    # must only ever be changed together.
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [\CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[\CATEGORY]
- [CATEGORY]Chemistry[\CATEGORY]
- [CATEGORY]Biological[\CATEGORY]
- [CATEGORY]Unknown[\CATEGORY]
Question: {question}\n
Options: {options}\n

"""
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # Retry loop: transient API failures and malformed replies both retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            # Model replied with something outside CATEGORIES; retry at once.
            print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
            continue  # deliberately skips the sleep below
        except Exception as e:
            print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
def string_extraction(idx, total_len, classification):
    """Extract the category name from a [CATEGORY]...[\CATEGORY] tagged reply.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total question count (progress output only). Renamed from
            ``len``, which shadowed the builtin.
        classification: Raw model reply text.

    Returns:
        The tagged category name, or 'Unknown' when no tag pair is found.
    """
    # The closing tag deliberately matches a literal backslash form
    # ([\CATEGORY]) because that is what classify_question() asks for.
    pattern = r'\[CATEGORY\](.*?)\[\\CATEGORY\]'
    match = re.search(pattern, classification)
    # Bug fix: the original called match.group(1) inside the progress print
    # before the None check, raising AttributeError on an untagged reply.
    extracted = match.group(1) if match else 'Unknown'
    print(f"{idx + 1}/{total_len}: {extracted}")
    return extracted
|
||||
|
||||
def main():
    """Classify every question in the merged dataset and save the results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'  # input JSON path
    data = load_data(file_path)
    data_length = len(data)
    # Accumulates each item plus its 'subject_category' field.
    results = []

    # Process each question sequentially.
    for i, item in enumerate(tqdm(data, desc="Classifying questions")):
        question = item.get('question', '')
        text = item['choices']['text']
        label = item['choices']['label']
        formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

        classification = classify_question(i, data_length, question, formatted_choices)
        # Attach the classification without mutating the source item.
        item_with_classification = item.copy()
        item_with_classification['subject_category'] = classification
        results.append(item_with_classification)

        # Every 100 questions, report the running category distribution.
        if (i + 1) % 100 == 0:
            categories = {'Physics': 0, 'Chemistry': 0, 'Biological': 0, 'Unknown': 0}
            # Renamed the loop variable: the original reused `item`, shadowing
            # the outer loop variable.
            for classified in results:
                # Bug fix: classify_question can return 'Error', which is not
                # a key of `categories`; count unexpected labels as 'Unknown'
                # instead of raising KeyError.
                cat = classified.get('subject_category', 'Unknown')
                categories[cat if cat in categories else 'Unknown'] += 1

            print(f"Processed {i+1} questions. Current distribution:")
            for category, count in categories.items():
                print(f"{category}: {count}")

        # Crude API rate limiting between requests.
        time.sleep(0.5)

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the results.
    df = pd.DataFrame(results)
    category_counts = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts)

    # Bug fix: the original message claimed 'classified_questions.json'
    # while the data was actually written to merged_classified.json.
    print("\nTask completed. Results saved to 'merged_classified.json'")


if __name__ == "__main__":
    main()
|
||||
3214
layer1/ALL-merge/classify_muti.log
Normal file
3214
layer1/ALL-merge/classify_muti.log
Normal file
File diff suppressed because it is too large
Load Diff
157
layer1/ALL-merge/classify_muti.py
Normal file
157
layer1/ALL-merge/classify_muti.py
Normal file
@@ -0,0 +1,157 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
import concurrent.futures
|
||||
import threading
|
||||
|
||||
# NOTE(security): a hard-coded API key was committed here originally; treat
# that key as leaked and revoke it. Read the key from the environment.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Closed set of subject labels the classifier is allowed to return.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']

# Thread-local storage for OpenAI clients (one client per worker thread).
local = threading.local()

# Locks for thread-safe counter updates and serialized printing.
write_lock = threading.Lock()
progress_lock = threading.Lock()
# Shared progress state, guarded by write_lock.
processed_count = 0
category_counts = {'Physics': 0, 'Chemistry': 0, 'Biological': 0, 'Unknown': 0}
|
||||
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Load and return the contents of a UTF-8 JSON file."""
    with open(file_path, encoding='utf-8') as source:
        parsed = json.load(source)
    return parsed
|
||||
|
||||
def get_client():
    """Return the OpenAI client owned by the calling thread.

    A client is created lazily the first time each thread calls this, so
    worker threads never share a client instance.
    """
    try:
        return local.client
    except AttributeError:
        local.client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        return local.client
|
||||
|
||||
def classify_question(idx, total_len, question, options):
    """Classify one question via the chat API; safe to call from pool workers.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total number of questions (progress output only).
        question: Question text.
        options: Formatted answer-options string.

    Returns:
        A member of CATEGORIES on success, or 'Error' after all retries fail.
    """
    prompt = f"""
Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
Please format your response by wrapping the category name with the tags [CATEGORY] and [/CATEGORY]. For example, your response should look like one of these:\n
- [CATEGORY]Physics[/CATEGORY]
- [CATEGORY]Chemistry[/CATEGORY]
- [CATEGORY]Biological[/CATEGORY]
- [CATEGORY]Unknown[/CATEGORY]
Question: {question}\n
Options: {options}\n
"""

    # One OpenAI client per thread (see get_client).
    client = get_client()
    # Retry loop: API errors and out-of-vocabulary replies both trigger retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            else:
                # Model replied with something outside CATEGORIES; retry.
                with progress_lock:  # serialize printing across threads
                    print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
                continue  # NOTE: deliberately skips the sleep below
        except Exception as e:
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
|
||||
def string_extraction(idx, total_len, classification):
    """Pull the category name out of a [CATEGORY]...[/CATEGORY] tagged reply.

    Falls back to 'Unknown' when the reply carries no tag pair. Prints a
    progress line under progress_lock so threads do not interleave output.
    """
    found = re.search(r'\[CATEGORY\](.*?)\[\/CATEGORY\]', classification)
    if found:
        extracted = found.group(1)
    else:
        extracted = 'Unknown'

    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")

    return extracted
|
||||
|
||||
def process_item(args):
    """Worker body: classify one dataset item and return it with its label.

    Args:
        args: Tuple of (idx, total_len, item) as built in main().

    Returns:
        A shallow copy of *item* with a 'subject_category' field added.
    """
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the result without mutating the shared input item.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update shared counters under the lock.
    global processed_count
    with write_lock:
        processed_count += 1
        # Bug fix: classify_question can return 'Error', which is not a key
        # of category_counts; the original raised KeyError here.
        category_counts[classification] = category_counts.get(classification, 0) + 1

        # Print a running distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Light rate limiting; thread scheduling already spaces requests out.
    time.sleep(0.1)

    return item_with_classification
|
||||
|
||||
def main():
    """Load the dataset, classify all questions in parallel, save results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json'
    data = load_data(file_path)
    data_length = len(data)

    results = []

    # One (index, total, item) tuple per question for process_item.
    args_list = [(i, data_length, item) for i, item in enumerate(data)]

    # Thread count; tune to the API rate limit and server capacity.
    num_threads = 10

    print(f"Starting classification with {num_threads} threads...")

    # Fan out over a thread pool; executor.map preserves input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # tqdm shows progress. NOTE: executor.map yields results, not future
        # objects, so `futures` actually holds the classified items.
        futures = list(tqdm(executor.map(process_item, args_list), total=data_length, desc="Classifying questions"))
        results = futures

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the final distribution.
    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)

    print("\nTask completed. Results saved to '/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json'")


if __name__ == "__main__":
    main()
|
||||
2289
layer2/PGEE/code/classify_muti.log
Normal file
2289
layer2/PGEE/code/classify_muti.log
Normal file
File diff suppressed because it is too large
Load Diff
169
layer2/PGEE/code/classify_muti.py
Normal file
169
layer2/PGEE/code/classify_muti.py
Normal file
@@ -0,0 +1,169 @@
|
||||
import json
|
||||
from openai import OpenAI
|
||||
import time
|
||||
import os
|
||||
from tqdm import tqdm
|
||||
import pandas as pd
|
||||
import re
|
||||
import concurrent.futures
|
||||
import threading
|
||||
|
||||
# NOTE(security): a hard-coded API key was committed here originally; treat
# it as leaked and revoke it. The key is now read from the environment.
API_KEY = os.environ.get("APIYI_API_KEY", "")
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Chapter labels for materials-science fundamentals, plus 'Unknown'.
CATEGORIES = ['Atomic Structure and Interatomic Bonding', 'The Structure of Solids', 'Imperfections in Solids', 'Mechanical Properties of Metals', 'Dislocations and Strengthening Mechanisms', 'Failure', 'Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties', 'Applications and Processing of Materials', 'Corrosion and Degradation of Materials', 'Functional Properties of Materials', 'Unknown']

# Thread-local storage for OpenAI clients (one client per worker thread).
local = threading.local()

# Locks for thread-safe counter updates and serialized printing.
write_lock = threading.Lock()
progress_lock = threading.Lock()
# Shared progress state, guarded by write_lock.
processed_count = 0
# Derive the tally from CATEGORIES so the two can never drift apart
# (the original repeated the full key list by hand).
category_counts = {name: 0 for name in CATEGORIES}
|
||||
|
||||
# 加载JSON数据
|
||||
def load_data(file_path):
    """Deserialize and return the JSON document at *file_path* (UTF-8)."""
    with open(file_path, mode='r', encoding='utf-8') as json_file:
        return json.load(json_file)
|
||||
|
||||
def get_client():
    """Return the calling thread's OpenAI client, building it on demand.

    Each worker thread gets its own client instance via thread-local
    storage, so clients are never shared between threads.
    """
    client = getattr(local, 'client', None)
    if client is None:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        local.client = client
    return client
|
||||
|
||||
def classify_question(idx, total_len, question, answer):
    """Classify one materials-science question into a chapter category.

    Args:
        idx: Zero-based question index (progress output only).
        total_len: Total number of questions (progress output only).
        question: Question text.
        answer: Text inserted after 'ANSWER:' in the prompt.
            NOTE(review): the caller (process_item) actually passes the
            formatted answer *options*, not the correct answer -- confirm
            whether that is intended.

    Returns:
        A member of CATEGORIES on success, or 'Error' after all retries fail.
    """
    prompt = f"""
Given a question and its answer from the field of Materials Science fundamentals, identify which chapter or category of Materials Science the question belongs to. Choose from the following 10 categories:

-- Atomic Structure and Interatomic Bonding
-- The Structure of Solids
-- Imperfections in Solids
-- Mechanical Properties of Metals
-- Dislocations and Strengthening Mechanisms
-- Failure
-- Phase Transformations: Development of Microstructure and Alteration of Mechanical Properties
-- Applications and Processing of Materials
-- Corrosion and Degradation of Materials
-- Functional Properties of Materials

QUESTIONS:{question}\n
ANSWER:{answer}\n

Provide your response by enclosing the category number and name within [CATEGORY] and [/CATEGORY] tags. For example: [CATEGORY]Atomic Structure and Interatomic Bonding[/CATEGORY]

Analyze both the question and answer carefully to determine the most appropriate category based on the question and options.
"""

    # One OpenAI client per thread (see get_client).
    client = get_client()
    # Retry loop: API errors and out-of-vocabulary replies both retry.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3,  # low temperature for near-deterministic labels
                stream=False,
            )
            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, total_len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category
            else:
                # Model replied with something outside CATEGORIES; retry.
                with progress_lock:  # serialize printing across threads
                    print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
                continue  # NOTE: deliberately skips the sleep below
        except Exception as e:
            with progress_lock:
                print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # max retries reached, give up
        # Wait before the next retry (reached only on API errors).
        time.sleep(2)

    return 'Error'
|
||||
|
||||
def string_extraction(idx, total_len, classification):
    """Extract the chapter name from a [CATEGORY]...[/CATEGORY] reply.

    Returns 'Unknown' when the reply has no tag pair. Progress printing is
    serialized under progress_lock.
    """
    tag_re = re.compile(r'\[CATEGORY\](.*?)\[\/CATEGORY\]')
    hit = tag_re.search(classification)
    extracted = hit.group(1) if hit else 'Unknown'

    with progress_lock:
        print(f"{idx + 1}/{total_len}: {extracted}")

    return extracted
|
||||
|
||||
def process_item(args):
    """Worker body: classify one item and return it with its chapter label.

    Args:
        args: Tuple of (idx, total_len, item) as built in main().

    Returns:
        A shallow copy of *item* with a 'subject_category' field added.
    """
    idx, total_len, item = args
    question = item.get('question', '')
    text = item['choices']['text']
    label = item['choices']['label']
    # answer = item.get('correct_option','')
    # NOTE(review): the formatted options string below is passed as the
    # 'answer' argument of classify_question; the real answer above is
    # commented out -- confirm this is intended.
    formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])

    classification = classify_question(idx, total_len, question, formatted_choices)

    # Attach the result without mutating the shared input item.
    item_with_classification = item.copy()
    item_with_classification['subject_category'] = classification

    # Update shared counters under the lock.
    global processed_count
    with write_lock:
        processed_count += 1
        # Bug fix: classify_question can return 'Error', which is not a key
        # of category_counts; the original raised KeyError here.
        category_counts[classification] = category_counts.get(classification, 0) + 1

        # Print a running distribution every 100 processed questions.
        if processed_count % 100 == 0:
            print(f"\nProcessed {processed_count} questions. Current distribution:")
            for category, count in category_counts.items():
                print(f"{category}: {count}")

    # Light rate limiting; thread scheduling already spaces requests out.
    time.sleep(0.1)

    return item_with_classification
|
||||
|
||||
def main():
    """Load the dataset, classify all questions in parallel, save results."""
    # Load the data.
    file_path = '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json'
    data = load_data(file_path)
    data_length = len(data)

    results = []

    # One (index, total, item) tuple per question for process_item.
    args_list = [(i, data_length, item) for i, item in enumerate(data)]

    # Thread count; tune to the API rate limit and server capacity.
    num_threads = 10

    print(f"Starting classification with {num_threads} threads...")

    # Fan out over a thread pool; executor.map preserves input order.
    with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
        # tqdm shows progress. NOTE: executor.map yields results, not future
        # objects, so `futures` actually holds the classified items.
        futures = list(tqdm(executor.map(process_item, args_list), total=data_length, desc="Classifying questions"))
        results = futures

    # Save the final results.
    with open('/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_classified.json', 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Analyze the final distribution.
    df = pd.DataFrame(results)
    category_counts_final = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts_final)

    print("\nTask completed. Results saved to '/home/ubuntu/50T/fsy/MatBench/layer2/PGEE/code/stepz_classified.json'")


if __name__ == "__main__":
    main()
|
||||
40222
layer2/PGEE/code/stepz_classified.json
Normal file
40222
layer2/PGEE/code/stepz_classified.json
Normal file
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -513,12 +513,12 @@ def main():
|
||||
"""主函数"""
|
||||
# 文件路径配置
|
||||
INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions_with_sampling.json"
|
||||
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered_only_hard.json"
|
||||
OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
|
||||
|
||||
# 难度选择比例配置
|
||||
SELECTION_RATIOS = {
|
||||
"hard_early_stop": 1.0, # 困难题选择10%
|
||||
"easy_all_correct": 0.0, # 简单题选择3.5%
|
||||
"easy_all_correct": 0.35, # 简单题选择3.5%
|
||||
"mixed": 0.0, # 混合题选择0%
|
||||
"unknown": 0.0 # 未知难度不选择
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user