# MatBench/layer1/ALL-merge/classify.py

import json
from openai import OpenAI
import time
import os
from tqdm import tqdm
import pandas as pd
import re

# Credentials and endpoint for the OpenAI-compatible service used by
# classify_question. Both can be overridden through the environment so the
# script does not have to be edited to switch accounts or endpoints.
# SECURITY: the fallback key below is committed in plain text. It is kept only
# so existing runs keep working; it should be rotated and removed from source.
API_KEY = os.environ.get(
    "MATBENCH_API_KEY",
    "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
)
BASE_URL = os.environ.get("MATBENCH_BASE_URL", "https://vip.apiyi.com/v1")
MODEL_DEEPSEEK_V3 = "deepseek-chat"
# Labels accepted as a valid classification result.
CATEGORIES = ['Physics', 'Chemistry', 'Biological', 'Unknown']
# Load the JSON dataset.
def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, encoding='utf-8') as handle:
        return json.load(handle)

def classify_question(idx, len, question, options):
    """Ask the chat model to assign one subject category to a question.

    Args:
        idx: Zero-based position of the question; forwarded to
            ``string_extraction`` for its progress output.
        len: Total number of questions, also forwarded for progress output.
            The name shadows the builtin but is kept for compatibility.
        question: Question text inserted into the prompt.
        options: Pre-formatted answer choices inserted into the prompt.

    Returns:
        A member of ``CATEGORIES`` when the model's reply yields one, otherwise
        the string ``'Error'`` after ``max_retries`` unsuccessful attempts.
    """
    # The closing tag is written as "\\CATEGORY" so the backslash is a valid
    # escape; the text sent to the model is unchanged ("[\CATEGORY]").
    # NOTE(review): the prompt says "three categories" but lists four.
    prompt = f"""
    Please classify the given question into one of these three categories: 'Physics', 'Chemistry', 'Biological' or 'Unknown'.\n
    Please format your response by wrapping the category name with the tags [CATEGORY] and [\\CATEGORY]. For example, your response should look like one of these:\n
        - [CATEGORY]Physics[\\CATEGORY]
        - [CATEGORY]Chemistry[\\CATEGORY]
        - [CATEGORY]Biological[\\CATEGORY]
        - [CATEGORY]Unknown[\\CATEGORY]
    Question: {question}\n
    Options: {options}\n

    """
    client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
    # Retry loop: an invalid category is retried immediately, an exception is
    # retried after a short pause.
    max_retries = 3
    for attempt in range(max_retries):
        try:
            response = client.chat.completions.create(
                model=MODEL_DEEPSEEK_V3,
                messages=[
                    {"role": "system", "content": "You are a helpful educational assistant."},
                    {"role": "user", "content": prompt},
                ],
                temperature=0.3,
                stream=False,
            )

            classification = response.choices[0].message.content.strip()
            extracted_category = string_extraction(idx, len, classification)
            if extracted_category in CATEGORIES:
                return extracted_category

            print(f"Invalid category '{extracted_category}' returned. Retrying. {attempt + 1}/{max_retries}")
            continue

        except Exception as e:
            # Broad catch is deliberate: any API or parsing failure is retried.
            print(f"Error on attempt {attempt + 1}/{max_retries}: {e}")
            if attempt == max_retries - 1:
                return 'Error'  # Give up once the last attempt has failed.

            # Wait before the next attempt.
            time.sleep(2)

    # Reached when every attempt produced a category outside CATEGORIES.
    return 'Error'
def string_extraction(idx, len, classification):
    """Extract the category wrapped in ``[CATEGORY]...[\\CATEGORY]`` tags.

    Whitespace inside the tags is ignored, and a forward-slash closing tag
    (``[/CATEGORY]``) is accepted as well as the backslash form.

    Args:
        idx: Zero-based position of the question, used for progress output.
        len: Total number of questions, used for progress output. The name
            shadows the builtin but is kept for compatibility.
        classification: Raw text to search for the tags.

    Returns:
        The text between the tags, or ``'Unknown'`` when no tag pair is found.
    """
    pattern = r'\[CATEGORY\]\s*(.*?)\s*\[[\\/]CATEGORY\]'
    match = re.search(pattern, classification)
    # Resolve the fallback before printing: calling match.group() on a failed
    # search would raise AttributeError and make the fallback unreachable.
    category = match.group(1) if match else 'Unknown'
    print(f"{idx + 1}/{len}: {category}")
    return category

def main(input_path='/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged.json',
         output_path='/home/ubuntu/50T/fsy/MatBench/layer1/ALL-merge/merged_classified.json'):
    """Classify every question in a JSON dataset and save the annotated copy.

    Each item is expected to provide a ``question`` string and a ``choices``
    mapping with parallel ``label`` and ``text`` sequences. A shallow copy of
    every item, extended with a ``subject_category`` key, is written to
    ``output_path``.

    Args:
        input_path: JSON file holding the list of question items.
        output_path: Destination file for the classified items.
    """
    # Load the dataset.
    data = load_data(input_path)
    data_length = len(data)
    results = []

    # Classify the questions one by one.
    for i, item in enumerate(tqdm(data, desc="Classifying questions")):
        question = item.get('question', '')
        text = item['choices']['text']
        label = item['choices']['label']
        formatted_choices = " ".join(f"({lbl}) {txt}" for lbl, txt in zip(label, text))

        classification = classify_question(i, data_length, question, formatted_choices)
        # Attach the result to a copy so the loaded item is left untouched.
        item_with_classification = item.copy()
        item_with_classification['subject_category'] = classification
        results.append(item_with_classification)

        # Report the running distribution every 100 questions.
        if (i + 1) % 100 == 0:
            distribution = {'Physics': 0, 'Chemistry': 0, 'Biological': 0, 'Unknown': 0}
            for classified in results:
                key = classified.get('subject_category', 'Unknown')
                # .get() tolerates labels outside the four subjects (e.g. the
                # 'Error' result of a failed classification) instead of
                # raising KeyError and aborting the whole run.
                distribution[key] = distribution.get(key, 0) + 1

            print(f"Processed {i+1} questions. Current distribution:")
            for category, count in distribution.items():
                print(f"{category}: {count}")

        # Pause between requests to stay under the API rate limit.
        time.sleep(0.5)

    # Save the final results.
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=4)

    # Summarise the final distribution.
    df = pd.DataFrame(results)
    category_counts = df['subject_category'].value_counts()
    print("\nFinal distribution of questions by category:")
    print(category_counts)

    # Report the file that was actually written.
    print(f"\nTask completed. Results saved to '{output_path}'")

# Run the classification pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()