# MatBench/layer2/PGEE/code/step1_single_question.py

"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import json
import os
import re
import time

from openai import OpenAI

# Shared chat-completions client used by split_complex_question().
#
# Credentials are read from the environment instead of being hard-coded, so
# the secret never lives in version control:
#   OPENAI_API_KEY  - required; the OpenAI client raises at construction time
#                     if no key is available.
#   OPENAI_BASE_URL - optional; defaults to the proxy endpoint this script has
#                     always used.
# NOTE(review): an API key was previously committed in this file; it should be
# treated as leaked and revoked.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url=os.environ.get("OPENAI_BASE_URL", "https://vip.apiyi.com/v1"),
)

def load_qa_data(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the decoded object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)

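# Split compound questions into single questions.
'''
To keep the calculation process of calculation questions, use the prompt line:
    - Fully preserve the step-by-step calculation process along with the final results
To keep only the results of calculation questions, use the prompt line:
    - Preserve final calculation results
'''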
def split_complex_question(question, answer):
    """Ask the chat model to split a compound Q&A pair into independent pairs.

    Args:
        question: Original question text; interpolated into the prompt.
        answer: Original answer text; interpolated into the prompt.

    Returns:
        ``1`` when the model's reply contains "It's a single issue.".
        Otherwise a non-empty list of dicts that each contain at least the
        keys ``"question"`` and ``"answer"``. When the API call fails, or the
        reply cannot be parsed into such a list, a one-element list holding
        the original pair is returned so the caller never loses or crashes
        on an item.
    """
    # Used whenever the model's reply is unusable.
    fallback = [{"question": question, "answer": answer}]

    # The LaTeX examples below are written with doubled backslashes: in a
    # non-raw string "\t" is a TAB, so "\times" would reach the model as
    # "<TAB>imes", and "\m" is an invalid escape sequence.
    # NOTE(review): the prompt contains an empty, duplicated "3." item under
    # "Key control points"; left untouched to avoid altering model behavior.
    prompt = f"""
                Follow these instructions strictly to perform question decomposition:
                Input requirements:
                - Question text: {question}
                - Answer text: {answer}
                Output rules:
                1. Single issue determination criteria:
                - Question contains only one clear technical inquiry point
                - Answer content cannot be divided into independent parts
                → Return: "It's a single issue."
                2. Compound question decomposition criteria (must satisfy all):
                a) Question contains multiple technically independent sub-questions
                b) Answer contains independent solution paragraphs corresponding to sub-questions
                c) Each sub-question's answer does not depend on context from other sub-questions
                3. Decomposition format standards:
                [
                    {{
                        "question": "[Complete sub-question 1] (including necessary shared parameters)",
                        "answer": "[Corresponding complete answer]"
                    }},
                    {{
                        "question": "[Complete sub-question 2] (including necessary shared parameters)",
                        "answer": "[Corresponding complete answer]"
                    }},
                    ......
                ]
                Key control points:
                1. Context integrity:
                - Each sub-question must include shared parameters from the original question
                2. Answer integrity:
                - Fully preserve the step-by-step calculation process along with the final results
                - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
                3.

                3. Format prohibitions:
                - No explanatory text additions
                - No modifications to original technical terminology
                - Return data must not use Markdown and Latex formats (like \\times, \\mathrm)
                - Use scientific notation for data representation
                """

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are an expert in decomposing complex technical questions into independent sub-questions and providing corresponding complete answers with preserved context, precision, and technical terminology. "},
                {"role": "user", "content": prompt}
            ],
            stream=False,
            temperature=0
        )
        result = response.choices[0].message.content.strip()
        print(result)
    except Exception as e:
        # Best-effort boundary: any failure talking to the API (or an empty
        # reply) falls back to the original, unsplit pair.
        print(f"API调用错误: {e}")
        return fallback

    if "It's a single issue." in result:
        return 1

    try:
        parsed = json.loads(process_response(result))
    except (ValueError, RecursionError) as e:
        # json.JSONDecodeError is a ValueError; report it as a parse problem
        # rather than as an API error.
        print(f"Failed to parse decomposition JSON: {e}")
        return fallback

    # Guard the caller: it indexes every element by "question"/"answer", and
    # an empty list would make it silently drop the item.
    is_valid = (
        isinstance(parsed, list)
        and len(parsed) > 0
        and all(
            isinstance(part, dict) and "question" in part and "answer" in part
            for part in parsed
        )
    )
    if not is_valid:
        print("Decomposition result has an unexpected structure; keeping the original question.")
        return fallback
    return parsed

def process_response(response):
    """Extract the JSON text from a model reply and apply escape clean-ups.

    The result is a string; decoding it with ``json.loads`` is left to the
    caller.

    Steps, in order:
      1. Take the contents of the first ``` / ```json fenced block if one
         exists, otherwise the whole reply with surrounding whitespace
         stripped.
      2. Inside every ``$...$`` span, double each backslash.
      3. Replace backslash-escaped double and single quotes with the bare
         quote characters.

    NOTE(review): step 3 also rewrites legitimately escaped quotes inside JSON
    string values, which can make otherwise valid JSON unparseable - confirm
    this is intended.
    """
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response)
    if fenced is None:
        payload = response.strip()
    else:
        payload = fenced.group(1)

    def _double_backslashes(match):
        # Applied to each $...$ span found below.
        return match.group(1).replace('\\', '\\\\')

    payload = re.sub(r'(\$[^\$]*\$)', _double_backslashes, payload)

    for escaped, plain in (('\\"', '"'), ("\\'", "'")):
        payload = payload.replace(escaped, plain)

    return payload

def process_dataset(data):
    """Run every record through split_complex_question and flatten the output.

    Args:
        data: Sequence of records; each is read via the keys ``"idx"``,
            ``"question"`` and ``"answer"``.

    Returns:
        A list of ``{"idx", "question", "answer"}`` dicts. When the splitter
        returns a list, one entry is emitted per element, all carrying the
        source record's ``idx``. For any other return value the record's
        original question and answer are emitted unchanged.
    """
    results = []
    count = len(data)

    for position, record in enumerate(data, start=1):
        print(f"处理第 {position}/{count} 条数据...")
        original_question = record["question"]
        original_answer = record["answer"]
        outcome = split_complex_question(original_question, original_answer)

        if not isinstance(outcome, list):
            # Not a decomposition: keep the record as it was.
            results.append({
                "idx": record["idx"],
                "question": original_question,
                "answer": original_answer,
            })
        else:
            results.extend(
                {
                    "idx": record["idx"],
                    "question": part["question"],
                    "answer": part["answer"],
                }
                for part in outcome
            )

        # Pause for two seconds after every tenth record.
        if position % 10 == 0:
            time.sleep(2)

    return results

def save_processed_data(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON.

    The output is indented by two spaces and non-ASCII characters are written
    as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to store.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode='w', encoding='utf-8') as out_handle:
        json.dump(data, out_handle, indent=2, ensure_ascii=False)

def main(
    input_file="/home/ubuntu/50T/fsy/layer2/QA/code/821.json",
    output_file="/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select_includes_process.json",
):
    """Load the Q&A file, split compound questions, and save the result.

    Args:
        input_file: Path of the source JSON file. Defaults to the path this
            script was originally hard-wired to, so ``main()`` behaves as
            before.
        output_file: Path the processed JSON is written to. Defaults to the
            original hard-wired output path.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成，结果已保存到 {output_file}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()