second commit

lzy
2025-05-28 10:55:34 +08:00
parent 0f80316f8b
commit ef9355f2f5
73 changed files with 485583 additions and 0 deletions

@@ -0,0 +1,141 @@
"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import json
import time
from openai import OpenAI
import re
client = OpenAI(
    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
    base_url="https://vip.apiyi.com/v1"
)


def load_qa_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
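
# Assumed shape of the input JSON (inferred from the fields accessed below); each
# record carries an "idx" that is propagated unchanged to its split sub-questions:
# [
#   {"idx": 1, "question": "...", "answer": "..."},
#   ...
# ]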


# Split compound questions into sub-questions
'''
Keep the working of calculation questions:      - Fully preserve the step-by-step calculation process along with the final results
Keep only the results of calculation questions: - Preserve final calculation results
'''
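# Note: switching between the two modes above is assumed to be done by hand, by
# swapping the first bullet under "2. Answer integrity" in the prompt below for the
# "Preserve final calculation results" line; nothing in the code toggles it.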


def split_complex_question(question, answer):
    prompt = f"""
Follow these instructions strictly to perform question decomposition:
Input requirements:
- Question text: {question}
- Answer text: {answer}
Output rules:
1. Single issue determination criteria:
   - Question contains only one clear technical inquiry point
   - Answer content cannot be divided into independent parts
   → Return: "It's a single issue."
2. Compound question decomposition criteria (must satisfy all):
   a) Question contains multiple technically independent sub-questions
   b) Answer contains independent solution paragraphs corresponding to sub-questions
   c) Each sub-question's answer does not depend on context from other sub-questions
3. Decomposition format standards:
[
    {{
        "question": "[Complete sub-question 1] (including necessary shared parameters)",
        "answer": "[Corresponding complete answer]"
    }},
    {{
        "question": "[Complete sub-question 2] (including necessary shared parameters)",
        "answer": "[Corresponding complete answer]"
    }},
    ......
]
Key control points:
1. Context integrity:
   - Each sub-question must include shared parameters from the original question
2. Answer integrity:
   - Fully preserve the step-by-step calculation process along with the final results
   - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
3. Format prohibitions:
   - No explanatory text additions
   - No modifications to original technical terminology
   - Returned data must not use Markdown or LaTeX formats (such as \\times, \\mathrm)
   - Use scientific notation for data representation
"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are an expert in decomposing complex technical questions into independent sub-questions and providing corresponding complete answers with preserved context, precision, and technical terminology."},
                {"role": "user", "content": prompt}
            ],
            stream=False,
            temperature=0
        )
        result = response.choices[0].message.content.strip()
        print(result)
        if "It's a single issue." in result:
            return 1
        else:
            return json.loads(process_response(result))
    except Exception as e:
        print(f"API call error: {e}")
        # If the API call fails, return the original question unchanged
        return [{"question": question, "answer": answer}]
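
# A hypothetical illustration (not taken from the dataset) of the decomposition the
# prompt asks for: a compound question such as "Calculate the molar mass of H2O and
# of CO2" with a two-part answer would be expected to come back roughly as
# [
#   {"question": "Calculate the molar mass of H2O", "answer": "18.02 g/mol ..."},
#   {"question": "Calculate the molar mass of CO2", "answer": "44.01 g/mol ..."}
# ]
# whereas a question with a single inquiry point should return "It's a single issue.",
# which split_complex_question maps to the sentinel value 1.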


def process_response(response):
    """Extract and parse JSON from a response."""
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response)
    json_str = json_match.group(1) if json_match else response.strip()
    # Escape backslashes inside $...$ math spans so json.loads does not choke on them
    json_str = re.sub(r'(\$[^\$]*\$)', lambda m: m.group(1).replace('\\', '\\\\'), json_str)
    json_str = json_str.replace('\\"', '"').replace("\\'", "'")
    return json_str
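
# A minimal sketch of what process_response is meant to handle: model replies are
# assumed to wrap the JSON array in a fenced block, e.g.
#   ```json
#   [{"question": "...", "answer": "..."}]
#   ```
# and the function returns the inner text so the caller can pass it to json.loads.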


def process_dataset(data):
    processed_data = []
    total = len(data)
    for i, item in enumerate(data):
        print(f"Processing item {i+1}/{total}...")
        question = item["question"]
        answer = item["answer"]
        split_data = split_complex_question(question, answer)
        if isinstance(split_data, list):
            for q_data in split_data:
                processed_data.append({
                    "idx": item["idx"],
                    "question": q_data["question"],
                    "answer": q_data["answer"]
                })
        else:
            processed_data.append({
                "idx": item["idx"],
                "question": question,
                "answer": answer
            })
        # Pause briefly every 10 items to avoid hammering the API
        if (i+1) % 10 == 0:
            time.sleep(2)
    return processed_data


def save_processed_data(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    input_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821.json"
    output_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select_includes_process.json"
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"Processing finished; results saved to {output_file}")


if __name__ == "__main__":
    main()