# MatBench/layer2/rubbish/single_select.py

import json
import os
import re
import time

from openai import OpenAI
# Shared chat-completions client used by split_complex_question().
# Credentials come from the environment so that no secret is stored in the
# repository: set OPENAI_API_KEY, and optionally OPENAI_BASE_URL to override
# the default endpoint below.
# NOTE(review): the key that was previously committed in this file should be
# treated as leaked and revoked.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.apiyi.com/v1"),
)

def load_qa_data(file_path):
    """Read a UTF-8 encoded JSON file and return its decoded content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object the JSON document decodes to.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)

def _build_split_prompt(question, answer):
    """Return the decomposition prompt with the question/answer pair filled in."""
    # Backslashes in the LaTeX examples are doubled on purpose: inside a
    # non-raw f-string a bare "\t" is a TAB, so "\times" would reach the model
    # as TAB + "imes".
    return f"""
                Follow these instructions strictly to perform question decomposition:
                Input requirements:
                - Question text: {question}
                - Answer text: {answer}
                Output rules:
                1. Single issue determination criteria:
                - Question contains only one clear technical inquiry point
                - Answer content cannot be divided into independent parts
                → Return: "It's a single issue."
                2. Compound question decomposition criteria (must satisfy all):
                a) Question contains multiple technically independent sub-questions
                b) Answer contains independent solution paragraphs corresponding to sub-questions
                c) Each sub-question's answer does not depend on context from other sub-questions
                3. Decomposition format standards:
                [
                {{
                    "question": "[Complete sub-question 1] (including necessary shared parameters)",
                    "answer": "[Corresponding complete answer]"
                }},
                {{
                    "question": "[Complete sub-question 2] (including necessary shared parameters)",
                    "answer": "[Corresponding complete answer]"
                }},
                ......
                ]
                Key control points:
                1. Context integrity:
                - Each sub-question must include shared parameters from the original question
                2. Answer integrity:
                - Preserve final calculation results
                - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)

                3. Format prohibitions:
                - No explanatory text additions
                - No modifications to original technical terminology
                - Return data must not use Markdown and Latex formats (like \\times, \\mathrm)
                - Use scientific notation for data representation
                """


def _is_valid_split(parsed):
    """Return True if ``parsed`` is a non-empty list of question/answer dicts."""
    return (
        isinstance(parsed, list)
        and bool(parsed)
        and all(
            isinstance(entry, dict) and "question" in entry and "answer" in entry
            for entry in parsed
        )
    )


def split_complex_question(question, answer, model="deepseek-chat"):
    """Ask the chat model to split a compound question into sub-questions.

    Args:
        question: Original question text.
        answer: Original answer text.
        model: Name of the chat model to query.

    Returns:
        ``1`` when the model replies that the item is a single issue.
        Otherwise a non-empty list of ``{"question": ..., "answer": ...}``
        dicts. When the API call fails, or the reply is not valid JSON of
        that shape, the list holds just the original question/answer pair.
    """
    fallback = [{"question": question, "answer": answer}]

    # Best effort: any failure while talking to the API keeps the original item.
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": _build_split_prompt(question, answer)},
            ],
            stream=False,
        )
        result = response.choices[0].message.content.strip()
    except Exception as e:
        print(f"API调用错误: {e}")
        return fallback

    print(result)
    if "It's a single issue." in result:
        return 1

    # Parsing problems are reported separately from API failures.
    try:
        parsed = json.loads(process_response(result))
    except (ValueError, TypeError) as e:
        print(f"JSON解析错误: {e}")
        return fallback

    # An empty list or entries without both keys would drop or crash the
    # item downstream, so fall back to the untouched original instead.
    if not _is_valid_split(parsed):
        print("JSON解析错误: 返回内容不是有效的问题列表")
        return fallback
    return parsed

# Matches the first Markdown code fence, with or without a "json" language tag.
_CODE_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)```", re.DOTALL | re.IGNORECASE)


def process_response(response_text):
    """Strip a Markdown code fence from a model reply, if one is present.

    Handles fences tagged ``json`` in any letter case, untagged fences, and
    replies that put extra text before or after the fence.

    Args:
        response_text: Raw text returned by the model.

    Returns:
        The stripped content of the first fenced block, or ``response_text``
        unchanged when it contains no fence.
    """
    fenced = _CODE_FENCE_RE.search(response_text)
    if fenced is None:
        return response_text
    return fenced.group(1).strip()

def _is_well_formed_split(candidate):
    """Return True for a non-empty list of dicts carrying both Q/A keys."""
    return (
        isinstance(candidate, list)
        and bool(candidate)
        and all(
            isinstance(entry, dict) and "question" in entry and "answer" in entry
            for entry in candidate
        )
    )


def process_dataset(data, *, batch_size=10, pause_seconds=2):
    """Split every item of ``data`` and renumber the resulting Q/A pairs.

    Each item is passed to ``split_complex_question``. A well-formed list
    result is expanded into one output record per sub-question. Any other
    result (including an empty or malformed list) keeps the original pair,
    so no input item is dropped and one bad reply cannot abort the run.

    Args:
        data: Sequence of dicts that each have "question" and "answer" keys.
        batch_size: Pause after every ``batch_size`` items; values <= 0
            disable pausing.
        pause_seconds: Length of each pause in seconds.

    Returns:
        List of ``{"idx", "question", "answer"}`` dicts, with ``idx``
        counting up from 1 across the whole output.

    Raises:
        KeyError: If an input item lacks "question" or "answer".
    """
    processed_data = []
    total = len(data)

    for position, item in enumerate(data, start=1):
        print(f"处理第 {position}/{total} 条数据...")
        question = item["question"]
        answer = item["answer"]
        split_data = split_complex_question(question, answer)

        if _is_well_formed_split(split_data):
            pairs = split_data
        else:
            # Single-issue marker or unexpected shape: keep the original pair.
            pairs = [{"question": question, "answer": answer}]

        for pair in pairs:
            processed_data.append({
                "idx": len(processed_data) + 1,
                "question": pair["question"],
                "answer": pair["answer"],
            })

        # Throttle between batches; no pause is needed after the last item.
        if batch_size > 0 and position % batch_size == 0 and position < total:
            time.sleep(pause_seconds)

    return processed_data

def save_processed_data(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than as escape sequences.

    Args:
        data: JSON-serialisable object to store.
        output_file: Destination path; an existing file is overwritten.
    """
    dump_options = {"ensure_ascii": False, "indent": 2}
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, **dump_options)

def main(
    input_file="/home/ubuntu/50T/fsy/benchmark/only_answer.json",
    output_file="single_select.json",
):
    """Run the pipeline: load the Q/A file, split each item, save the result.

    Args:
        input_file: Path of the source JSON file; defaults to the location
            this script was originally written for.
        output_file: Path the processed JSON is written to.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成，结果已保存到 {output_file}")


if __name__ == "__main__":
    import sys

    # Optional positional overrides: [input_file [output_file]].
    # With no arguments the defaults above are used.
    main(*sys.argv[1:3])