格式转换

2025-05-29 20:18:57 +08:00
parent 1156bfdd7c
commit 6c87af5614
14 changed files with 11996 additions and 13 deletions
--- a/eval_framework/config/config.yaml
+++ b/eval_framework/config/config.yaml
@@ -2,22 +2,25 @@
 api:
  key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
  base_url: "https://vip.apiyi.com/v1"
-  temperature: 0
+  temperature: -1 # 默认使用模型的温度设置
  max_retries: 10
  # 支持多个模型
  models:
    - "qwen-max-2025-01-25"
    - "gpt-4o"
+    - "deepseek-chat"
+    - "claude-sonnet-4-20250514"
+    - "deepseek-r1"
  # 或者使用单个模型（向后兼容）
  # model: "qwen-max-2025-01-25"

-# 系统提示词
-system_prompt: "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications."
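+system_prompt: None  # NOTE: YAML reads this as the string "None" (not null); llm_client.py omits the system message when the value equals 'None'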

 # 评估配置
 evaluation:
-  max_workers: 8
-  input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
+  max_workers: 20
+  # input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
+  input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
  # 输出配置
  output:
    base_dir: "results"
--- a/eval_framework/main.py
+++ b/eval_framework/main.py
@@ -144,7 +144,7 @@ def main():
            logger.info(f"Evaluating model {i}/{len(models)}: {model_name}")
            
            try:
-                model_result = evaluate_single_model(model_name, data[:10], config, output_dir)
+                model_result = evaluate_single_model(model_name, data, config, output_dir)
                all_results[model_name] = model_result
                
                # 打印当前模型的结果
--- a/eval_framework/src/pycache/init.cpython-311.pyc
+++ b/eval_framework/src/pycache/init.cpython-311.pyc
--- a/eval_framework/src/pycache/data_loader.cpython-311.pyc
+++ b/eval_framework/src/pycache/data_loader.cpython-311.pyc
--- a/eval_framework/src/pycache/evaluator.cpython-311.pyc
+++ b/eval_framework/src/pycache/evaluator.cpython-311.pyc
--- a/eval_framework/src/pycache/llm_client.cpython-311.pyc
+++ b/eval_framework/src/pycache/llm_client.cpython-311.pyc
--- a/eval_framework/src/pycache/metrics.cpython-311.pyc
+++ b/eval_framework/src/pycache/metrics.cpython-311.pyc
--- a/eval_framework/src/pycache/utils.cpython-311.pyc
+++ b/eval_framework/src/pycache/utils.cpython-311.pyc
--- a/eval_framework/src/evaluator.py
+++ b/eval_framework/src/evaluator.py
@@ -51,7 +51,7 @@ class Evaluator:

        # 格式化选择项
        formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])
-        user_input = f"{question} {formatted_choices}. {prompt}"
+        user_input = f"{prompt} \n {question} {formatted_choices}"
        
        # 获取LLM响应
        llm_answer = self.llm_client.get_response(user_input, self.system_prompt)
--- a/eval_framework/src/llm_client.py
+++ b/eval_framework/src/llm_client.py
@@ -48,14 +48,27 @@ class LLMClient:
        retries = 0
        while retries < self.max_retries:
            try:
-                response = self.client.chat.completions.create(
-                    model=self.model,
-                    messages=[
+                if system_prompt == 'None':
+                    messages = [
+                        {"role": "user", "content": user_input}
+                    ]
+                else:
+                    messages = [
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": user_input}
-                    ],
-                    temperature=self.temperature
-                )
+                    ]
+
+                if self.temperature == -1:
+                    response = self.client.chat.completions.create(
+                        model=self.model,
+                        messages=messages,
+                    )
+                else:
+                    response = self.client.chat.completions.create(
+                        model=self.model,
+                        messages=messages,
+                        temperature=self.temperature
+                    )
                answer = response.choices[0].message.content
                return answer
                
--- a/layer2/PGEE/code/stepy_complete_choice_questions.json
+++ b/layer2/PGEE/code/stepy_complete_choice_questions.json
--- a/layer2/PGEE/code/stepy_gen_option.py
+++ b/layer2/PGEE/code/stepy_gen_option.py
@@ -0,0 +1,616 @@
+import json
+import openai
+from typing import Dict, Any, List
+import time
+import logging
+from concurrent.futures import ThreadPoolExecutor, as_completed
+import threading
+from tqdm import tqdm
+import random
+import re
+
+# 配置日志
+logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
+
+class ChoiceOptionsGenerator:
+    def __init__(self, api_key: str, base_url: str, model_name: str, max_workers: int = 20):
+        """Store the API settings and create the per-thread state holders.
+
+        Args:
+            api_key: Key handed to the OpenAI-compatible client.
+            base_url: Base URL of the API endpoint.
+            model_name: Model name sent with every generation request.
+            max_workers: Worker count stored on the instance.
+        """
+        # Thread-local storage (read by get_client) and a lock object.
+        self.thread_local = threading.local()
+        self.lock = threading.Lock()
+        self.max_retries = 5  # maximum number of generation attempts per question
+        self.api_key, self.base_url = api_key, base_url
+        self.model_name, self.max_workers = model_name, max_workers
+        
+        
+    def get_client(self):
+        """Return the calling thread's OpenAI client, creating it on first use."""
+        try:
+            return self.thread_local.client
+        except AttributeError:
+            # First call on this thread: build the client and cache it.
+            new_client = openai.OpenAI(
+                api_key=self.api_key,
+                base_url=self.base_url
+            )
+            self.thread_local.client = new_client
+            return new_client
+    
+    def create_options_prompt(self, question_data: Dict[str, Any]) -> str:
+        """Build the option-generation prompt that fits the question's type.
+
+        Reads ``choice_question``, ``correct_option``, ``question`` and
+        ``question_type`` from *question_data*; missing keys default to "".
+        """
+        q_type = question_data.get("question_type", "")
+        stem = question_data.get("choice_question", "")
+        answer = question_data.get("correct_option", "")
+        source_question = question_data.get("question", "")
+
+        # Only "true_false" gets the true/false prompt; every other type is
+        # turned into a multiple-choice question.
+        if q_type != "true_false":
+            return self._create_multiple_choice_prompt(stem, answer, source_question, q_type)
+        return self._create_true_false_prompt(stem, answer, source_question)
+    
+    def _create_true_false_prompt(self, question: str, correct_option: str, original_question: str) -> str:
+        """Return the prompt that asks the model to emit a true/false item as JSON.
+
+        The three arguments are interpolated verbatim into the prompt text. The
+        prompt requests a JSON object with the keys ``question_type``,
+        ``statement``, ``options``, ``correct_answer`` and ``explanation``.
+        """
+        return f"""
+请为以下判断题生成完整的题目格式。
+
+题目: {question}
+正确答案: {correct_option}
+原始题目: {original_question}
+
+请按以下要求输出判断题：
+1. 将题目转换为一个明确的陈述句
+2. 要求学生判断该陈述的正确性
+3. 提供标准的判断题格式
+
+输出格式（严格按照JSON格式）：
+{{
+    "question_type": "true_false",
+    "statement": "需要判断的陈述句",
+    "options": ["True", "False"],
+    "correct_answer": "True/False",
+    "explanation": "答案解释"
+}}
+"""
+
+    def _create_multiple_choice_prompt(self, question: str, correct_option: str, original_question: str, question_type: str) -> str:
+        """Return the two-step prompt used to generate multiple-choice options.
+
+        The prompt asks the model to draft ten candidate distractors and then
+        pick three of them, and to answer with a JSON object whose
+        ``final_selection`` entry holds the four options and the correct label.
+        The four arguments are interpolated verbatim into the prompt text.
+
+        NOTE(review): the JSON template inside the prompt contains a ``//``
+        comment line, so a model that copies the template literally produces
+        text that is not strict JSON.
+        """
+        return f"""
+你是一个材料科学专业的教育评估专家。请为以下题目生成高质量的选择题选项。
+
+题目: {question}
+正确答案: {correct_option}
+原始题目: {original_question}
+题目类型: {question_type}
+
+**请分两步完成：**
+
+**第一步：打草稿 - 生成10个候选干扰项**
+请先分析题目，然后生成10个可能的干扰项。要求：
+- 与正确答案在同一知识领域和格式
+- 涵盖不同的错误类型（概念混淆、数值错误、逻辑错误、计算过程中间值等）
+- 干扰项首先要把你自己都成功干扰，然后才能干扰学生，否则就没有意义了
+- 包含至少7个高难度干扰项（需要深入理解才能排除）
+- 包含1-2个中等难度干扰项
+- 包含1-2个相对简单的干扰项
+
+**第二步：精选最佳选项**
+从10个候选项中选择3个最佳干扰项，要求：
+- 至少2个是高难度干扰项（专业人士也可能犯错）
+- 避免明显错误或不合理的选项（没有意义，这些学生都是清华北大的顶级学生，考察他们需要相当大的难度才行）
+- 确保每个选项都有相当大的迷惑性（首先要能干扰你自己）
+
+**输出格式（严格按照JSON格式）：**
+{{
+    "draft_analysis": {{
+        "question_analysis": "题目分析和知识点识别",
+        "correct_answer_analysis": "正确答案的原理解释",
+        "distractor_strategy": "干扰项设计策略"
+    }},
+    "candidate_distractors": [
+        {{"option": "候选干扰项1", "difficulty": "high/medium/low", "reasoning": "设计理由"}},
+        {{"option": "候选干扰项2", "difficulty": "high/medium/low", "reasoning": "设计理由"}},
+        // ... 总共10个候选项
+    ],
+    "final_selection": {{
+        "question_type": "multiple_choice",
+        "options": {{
+            "A": "选项A内容",
+            "B": "选项B内容", 
+            "C": "选项C内容",
+            "D": "选项D内容"
+        }},
+        "correct_answer": "A/B/C/D",
+        "difficulty_distribution": {{
+            "high_difficulty_count": 2,
+            "medium_difficulty_count": 1,
+            "selected_distractors_reasoning": "为什么选择这3个干扰项的详细说明"
+        }},
+        "explanation": "正确答案解释及其他选项错误原因分析"
+    }}
+}}
+
+**重要要求：**
+1. 确保至少2个干扰项具有高度迷惑性，即使是专业人士也需要仔细思考才能排除，最低限度的迷惑度是骗过你自己
+2. 所有干扰项必须在学术上是合理的概念，不能是胡编乱造
+3. 正确答案位置要随机分布，不要总是放在A选项、B选项、C选项或D选项
+4. 每个干扰项都要有明确的设计理由和难度评估
+"""
+
+    def generate_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Generate options for one question, retrying up to ``self.max_retries`` times.
+
+        Each attempt calls the model and validates the parsed result. A result
+        that passes validation is returned immediately. When every attempt is
+        rejected or raises, the rule-based fallback options are returned.
+        """
+        final_attempt = self.max_retries - 1
+        attempt = 0
+        while attempt < self.max_retries:
+            retry_left = attempt < final_attempt
+            try:
+                candidate = self._attempt_generate_options(question_data)
+                if self._validate_options_quality(candidate, question_data):
+                    return candidate
+                if retry_left:
+                    logging.warning(f"第{attempt+1}次生成的选项质量不佳，重试中...")
+                    time.sleep(1)  # short pause before the next attempt
+            except Exception as e:
+                logging.error(f"第{attempt+1}次生成选项失败: {e}")
+                if retry_left:
+                    time.sleep(2)  # longer pause after a hard failure
+            attempt += 1
+
+        # Every attempt was rejected or failed: use the rule-based fallback.
+        logging.error("所有重试都失败，使用备用选项生成")
+        return self._create_fallback_options(question_data)
+    
+    def _attempt_generate_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Make one chat-completion request and return the parsed option payload.
+
+        For a true/false reply the parsed JSON is returned as is. Otherwise the
+        ``final_selection`` entry is returned when present, else the whole object.
+        """
+        client = self.get_client()
+        system_message = {
+            "role": "system",
+            "content": "你是一个材料科学专业的教育评估专家，具有丰富的出题经验。你特别擅长设计高质量的干扰项，能够创造出既合理又具有高度迷惑性的选项。请严格按照要求的JSON格式输出。"
+        }
+        user_message = {"role": "user", "content": self.create_options_prompt(question_data)}
+
+        completion = client.chat.completions.create(
+            model=self.model_name,
+            messages=[system_message, user_message],
+            temperature=0.8,  # higher temperature for more varied distractors
+            max_tokens=1500,  # room for the draft analysis plus the final selection
+            top_p=0.9
+        )
+
+        parsed = self._extract_json_from_response(completion.choices[0].message.content.strip())
+
+        # True/false replies are already in their final shape.
+        if parsed.get("question_type") == "true_false":
+            return parsed
+
+        # Multiple-choice replies nest the usable part under "final_selection".
+        return parsed["final_selection"] if "final_selection" in parsed else parsed
+    
+    def _extract_json_from_response(self, response_text: str) -> Dict[str, Any]:
+        """Parse the JSON object embedded in a model reply.
+
+        The candidate text runs from the first ``{`` to the last ``}``. If it
+        does not parse as is, one repair pass is applied and parsing is retried.
+
+        Raises:
+            ValueError: if no ``{ ... }`` span is found.
+            json.JSONDecodeError: if the repaired text still does not parse.
+        """
+        start = response_text.find('{')
+        stop = response_text.rfind('}') + 1
+
+        if start == -1 or stop <= start:
+            raise ValueError("无法在响应中找到JSON格式内容")
+
+        candidate = response_text[start:stop]
+
+        try:
+            return json.loads(candidate)
+        except json.JSONDecodeError:
+            # Second chance: strip comments / trailing commas, then parse again.
+            return json.loads(self._fix_common_json_errors(candidate))
+    
+    def _fix_common_json_errors(self, json_str: str) -> str:
+        """Repair common JSON mistakes: ``//`` comments and trailing commas.
+
+        The text is scanned character by character so that repairs apply only
+        outside string literals. A plain regex such as ``//.*`` would also cut
+        string values that contain ``//`` (for example URLs) and corrupt them.
+
+        Args:
+            json_str: JSON-like text that failed to parse.
+
+        Returns:
+            The text with line comments removed and with any comma that directly
+            precedes a closing ``}`` or ``]`` dropped.
+        """
+        out: List[str] = []
+        in_string = False
+        escaped = False
+        i, n = 0, len(json_str)
+
+        while i < n:
+            ch = json_str[i]
+
+            if in_string:
+                # Copy string contents untouched; track escapes to find the end.
+                out.append(ch)
+                if escaped:
+                    escaped = False
+                elif ch == '\\':
+                    escaped = True
+                elif ch == '"':
+                    in_string = False
+                i += 1
+                continue
+
+            if ch == '"':
+                in_string = True
+            elif ch == '/' and json_str[i + 1:i + 2] == '/':
+                # Line comment: skip up to (not including) the newline.
+                while i < n and json_str[i] != '\n':
+                    i += 1
+                continue
+            elif ch in '}]':
+                # Drop a trailing comma (and the whitespace after it).
+                j = len(out) - 1
+                while j >= 0 and out[j].isspace():
+                    j -= 1
+                if j >= 0 and out[j] == ',':
+                    del out[j:]
+
+            out.append(ch)
+            i += 1
+
+        return ''.join(out)
+    
+    def _validate_options_quality(self, result: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
+        """Dispatch validation of a generated result by its ``question_type``.
+
+        Empty results and unknown question types are rejected.
+        """
+        if not result:
+            return False
+
+        kind = result.get("question_type", "")
+        if kind == "multiple_choice":
+            return self._validate_multiple_choice_quality(result, original_data)
+        if kind == "true_false":
+            return self._validate_true_false_quality(result)
+        return False
+    
+    def _validate_true_false_quality(self, result: Dict[str, Any]) -> bool:
+        """Check that a true/false result has the required fields and values.
+
+        Requires ``statement``, ``options``, ``correct_answer`` and
+        ``explanation``; ``options`` must hold exactly "True" and "False", and
+        ``correct_answer`` must be one of them.
+        """
+        for field in ("statement", "options", "correct_answer", "explanation"):
+            if field not in result:
+                return False
+
+        opts = result.get("options", [])
+        if len(opts) != 2 or "True" not in opts or "False" not in opts:
+            return False
+
+        return result.get("correct_answer", "") in ["True", "False"]
+    
+    def _validate_multiple_choice_quality(self, result: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
+        """Check the structure and answer consistency of a multiple-choice result.
+
+        A result is accepted when all of the following hold:
+          * it has ``options``, ``correct_answer`` and ``explanation``;
+          * ``options`` is a dict with exactly the keys A, B, C and D;
+          * ``correct_answer`` is one of those labels;
+          * the option marked as correct matches the reference answer in
+            ``original_data["correct_option"]`` (containment either way, or
+            keyword similarity), when a reference answer is present;
+          * every option has at least two characters and all four differ.
+
+        The match is checked against the option labelled ``correct_answer``
+        rather than against any option. Otherwise a reply that contains the
+        right text but points its label at a distractor would pass validation.
+        """
+        labels = ["A", "B", "C", "D"]
+
+        # Basic structure.
+        if not all(key in result for key in ["options", "correct_answer", "explanation"]):
+            return False
+
+        options = result.get("options", {})
+        if not isinstance(options, dict):
+            return False
+
+        # Exactly the four expected labels.
+        if len(options) != 4 or not all(label in options for label in labels):
+            return False
+
+        correct_answer = result.get("correct_answer", "")
+        if correct_answer not in labels:
+            return False
+
+        option_texts = {label: str(options[label]).strip() for label in labels}
+
+        # The reference answer may not be a string (e.g. a bare number).
+        raw_reference = original_data.get("correct_option", "")
+        original_correct = "" if raw_reference is None else str(raw_reference).strip()
+        if original_correct:
+            marked = option_texts[correct_answer]
+            expected, actual = original_correct.lower(), marked.lower()
+            matches = (expected in actual or
+                       actual in expected or
+                       self._are_similar_answers(original_correct, marked))
+            if not matches:
+                logging.warning(f"未找到匹配的原始答案: {original_correct}")
+                return False
+
+        # Reject very short options.
+        if any(len(text) < 2 for text in option_texts.values()):
+            return False
+
+        # Reject duplicated options (case-insensitive).
+        if len({text.lower() for text in option_texts.values()}) != 4:
+            return False
+
+        return True
+    
+    def _are_similar_answers(self, answer1: str, answer2: str) -> bool:
+        """Return True when the two answers share more than 60% of their words.
+
+        Both answers are lower-cased and stripped of punctuation. The number of
+        shared words is divided by the size of the smaller word set.
+        """
+        def word_set(text: str) -> set:
+            # Drop everything except word characters and whitespace.
+            return set(re.sub(r'[^\w\s]', '', text.lower()).strip().split())
+
+        first, second = word_set(answer1), word_set(answer2)
+        if not first or not second:
+            return False
+
+        shared = len(first & second)
+        return shared / min(len(first), len(second)) > 0.6  # 60% threshold
+    
+    def _create_fallback_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Build rule-based options for when model generation did not succeed.
+
+        True/false questions get a fixed True/False pair with the answer derived
+        from ``correct_option``. Every other type gets the correct option plus
+        three rule-based distractors in shuffled order, tagged with
+        ``generated_by = "fallback_rules"``.
+        """
+        answer = question_data.get("correct_option", "")
+
+        if question_data.get("question_type", "") == "true_false":
+            fallback = {"question_type": "true_false"}
+            fallback["statement"] = question_data.get("choice_question", "")
+            fallback["options"] = ["True", "False"]
+            fallback["correct_answer"] = self._determine_true_false_answer(answer)
+            fallback["explanation"] = "基于题目分析的判断结果"
+            return fallback
+
+        # Any other type is rendered as a four-option multiple-choice item.
+        labels = ["A", "B", "C", "D"]
+        shuffled = [answer] + self._generate_rule_based_distractors(answer)
+        random.shuffle(shuffled)
+
+        # Label of the slot where the correct option ended up.
+        answer_label = labels[shuffled.index(answer)]
+
+        return {
+            "question_type": "multiple_choice",
+            "options": {label: shuffled[slot] for slot, label in enumerate(labels)},
+            "correct_answer": answer_label,
+            "explanation": "基于规则生成的备用选项",
+            "generated_by": "fallback_rules"
+        }
+    
+    def _determine_true_false_answer(self, correct_option: str) -> str:
+        """Map a free-form reference answer onto "True" or "False".
+
+        Matching order:
+          1. a bare "T" / "F" (any case) is matched exactly;
+          2. explicit negations ("不正确", "not true", ...) give "False";
+          3. a true keyword anywhere in the text gives "True";
+          4. a false keyword anywhere in the text gives "False";
+          5. otherwise "True" is returned as the default.
+
+        Args:
+            correct_option: Reference answer text; ``None`` is treated as empty.
+
+        Returns:
+            "True" or "False".
+        """
+        text = "" if correct_option is None else str(correct_option).strip().lower()
+
+        # The text is lower-cased, so single-letter answers must be compared in
+        # lower case, and only as a whole-string match: "t" and "f" occur
+        # inside far too many words to be used as substrings.
+        if text == "t":
+            return "True"
+        if text == "f":
+            return "False"
+
+        # Negated forms contain a positive keyword ("不正确" contains "正确",
+        # "not true" contains "true"), so they must be tested first.
+        negations = ["不正确", "不对", "不是", "not true", "not correct", "incorrect"]
+        if any(phrase in text for phrase in negations):
+            return "False"
+
+        true_indicators = ["true", "正确", "是", "对", "√", "yes"]
+        false_indicators = ["false", "错误", "否", "错", "×", "no"]
+
+        if any(indicator in text for indicator in true_indicators):
+            return "True"
+        if any(indicator in text for indicator in false_indicators):
+            return "False"
+
+        # Undetermined: keep the original default.
+        return "True"
+    
+    def _generate_rule_based_distractors(self, correct_answer: str) -> List[str]:
+        """Return up to three distractors chosen by the detected answer category.
+
+        Categories are tried in order: numeric, crystal-structure name, material
+        property. Answers that match none get three generic placeholder options.
+        """
+        strategies = (
+            (self._is_numeric_answer, self._generate_numeric_distractors),
+            (self._is_structure_name, self._generate_structure_distractors),
+            (self._is_material_property, self._generate_property_distractors),
+        )
+
+        for matches, build in strategies:
+            if matches(correct_answer):
+                # Cap at three distractors.
+                return build(correct_answer)[:3]
+
+        # No category matched: generic placeholders.
+        return [
+            "Alternative option 1",
+            "Alternative option 2",
+            "Alternative option 3"
+        ]
+    
+    def _is_numeric_answer(self, answer: str) -> bool:
+        """Return True when the answer contains at least one number."""
+        return re.search(r'\d+\.?\d*', answer) is not None
+    
+    def _is_structure_name(self, answer: str) -> bool:
+        """Return True when the answer mentions a crystal-structure keyword."""
+        lowered = answer.lower()
+        for keyword in ("cubic", "hexagonal", "tetragonal", "orthorhombic", "bcc", "fcc", "hcp"):
+            if keyword in lowered:
+                return True
+        return False
+    
+    def _is_material_property(self, answer: str) -> bool:
+        """Return True when the answer mentions a material-property keyword."""
+        lowered = answer.lower()
+        for keyword in ("strength", "hardness", "ductility", "brittleness", "conductivity", "elastic"):
+            if keyword in lowered:
+                return True
+        return False
+    
+    def _generate_numeric_distractors(self, correct_answer: str) -> List[str]:
+        """Build three numeric distractors from the first number in the answer.
+
+        The first number is scaled by 0.5, 2 and 1.5 and re-attached to the rest
+        of the answer text (treated as the unit).
+
+        Edge cases handled:
+          * only the first occurrence of the number is removed to form the
+            unit, so later numbers in the text are kept ("2 of 2" -> "of 2");
+          * a value of zero cannot be scaled, so fixed values are used instead;
+          * when two-decimal rounding makes the values collide (very small
+            numbers), a three-significant-digit format is used.
+
+        Args:
+            correct_answer: Answer text expected to contain a number.
+
+        Returns:
+            Three distractor strings, or three placeholder options when the
+            text contains no number.
+        """
+        numbers = re.findall(r'\d+\.?\d*', correct_answer)
+        if not numbers:
+            return ["Option B", "Option C", "Option D"]
+
+        base_num = float(numbers[0])
+        unit = correct_answer.replace(numbers[0], "", 1).strip()
+
+        if base_num == 0:
+            # Scaling zero would yield three identical options.
+            values = [0.5, 1.0, 2.0]
+        else:
+            values = [base_num * 0.5, base_num * 2, base_num * 1.5]
+
+        formatted = [f"{value:.2f}" for value in values]
+        if len(set(formatted)) < len(values):
+            # Two-decimal rounding collapsed distinct values; keep them apart.
+            formatted = [f"{value:.3g}" for value in values]
+
+        return [f"{number} {unit}".strip() for number in formatted]
+    
+    def _generate_structure_distractors(self, correct_answer: str) -> List[str]:
+        """Pick three random crystal-structure names other than the answer.
+
+        The answer is excluded by case-insensitive exact comparison.
+        """
+        target = correct_answer.lower()
+        pool = []
+        for name in (
+            "simple cubic", "body-centered cubic", "face-centered cubic",
+            "hexagonal close-packed", "diamond cubic", "tetragonal",
+            "orthorhombic", "monoclinic", "triclinic",
+        ):
+            if name.lower() != target:
+                pool.append(name)
+
+        return random.sample(pool, min(3, len(pool)))
+    
+    def _generate_property_distractors(self, correct_answer: str) -> List[str]:
+        """Pick three random material-property phrases other than the answer.
+
+        The answer is excluded by case-insensitive exact comparison.
+        """
+        target = correct_answer.lower()
+        pool = []
+        for phrase in (
+            "high strength", "low strength", "high ductility", "brittleness",
+            "high hardness", "low hardness", "high toughness", "low toughness",
+            "high elasticity", "low elasticity", "high conductivity", "low conductivity",
+        ):
+            if phrase.lower() != target:
+                pool.append(phrase)
+
+        return random.sample(pool, min(3, len(pool)))
+
+def process_single_question(generator, question, question_index):
+    """Generate options for one question and return an annotated copy of it.
+
+    The input question is not modified. The returned copy carries
+    ``generated_options``, ``generation_status`` and ``question_index``. If
+    generation raises, the copy holds the fallback options, the status
+    "failed" and the error text in ``error_message``.
+    """
+    try:
+        options_data = generator.generate_options(question)
+
+        record = question.copy()
+        record.update({
+            "generated_options": options_data,
+            "generation_status": "success",
+            "question_index": question_index,  # lets the caller restore the order
+        })
+        return record
+
+    except Exception as e:
+        logging.error(f"第{question_index+1}题处理失败: {e}")
+
+        # Mark the failure and attach fallback options instead.
+        record = question.copy()
+        record.update({
+            "generated_options": generator._create_fallback_options(question),
+            "generation_status": "failed",
+            "error_message": str(e),
+            "question_index": question_index,
+        })
+        return record
+
+def main():
+    """Generate answer options for a random sample of questions and save them.
+
+    Loads the filtered question set, shuffles it, keeps the first 200 entries,
+    generates options for each in a thread pool, sorts the results back into
+    the post-shuffle order, writes them to ``OUTPUT_FILE`` and prints summary
+    statistics.
+
+    Note: a record with status "success" may still carry rule-based fallback
+    options, because ``generate_options`` returns the fallback itself once its
+    retries are exhausted instead of raising.
+    """
+    import os  # local import: only needed for the API-key override below
+
+    # SECURITY NOTE(review): an API key is hard-coded in source control. It is
+    # kept as the default so existing runs behave the same, but it can be
+    # overridden through the MATBENCH_API_KEY environment variable. The
+    # committed key should be rotated and removed from the repository.
+    API_KEY = os.environ.get(
+        "MATBENCH_API_KEY",
+        "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
+    )
+    BASE_URL = "https://vip.apiyi.com/v1"
+    MODEL_NAME = "deepseek-chat"
+    # MODEL_NAME = "claude-sonnet-4-20250514"
+    MAX_WORKERS = 20  # number of worker threads
+
+    INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepx_filtered_high_quality_questions.json"
+    OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions.json"
+
+    # Load the data.
+    print("正在加载数据...")
+    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
+        questions = json.load(f)
+    random.shuffle(questions)  # shuffle so the sample below is varied
+    questions = questions[:200]  # debugging: only process the first 200 questions
+
+    print(f"加载了 {len(questions)} 道题目")
+
+    # Question-type distribution of the input.
+    type_counts = {}
+    for q in questions:
+        qtype = q.get("question_type", "unknown")
+        type_counts[qtype] = type_counts.get(qtype, 0) + 1
+
+    print("题目类型分布:")
+    for qtype, count in type_counts.items():
+        print(f"  {qtype}: {count} 道")
+
+    generator = ChoiceOptionsGenerator(API_KEY, BASE_URL, MODEL_NAME, MAX_WORKERS)
+
+    # Fan the questions out over a thread pool and collect results as they finish.
+    # TODO: periodic saving of intermediate results is currently disabled.
+    temp_results = []
+    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
+        future_to_question = {
+            executor.submit(process_single_question, generator, question, i): (question, i)
+            for i, question in enumerate(questions)
+        }
+
+        with tqdm(total=len(questions), desc="生成选项") as pbar:
+            for future in as_completed(future_to_question):
+                try:
+                    temp_results.append(future.result())
+                except Exception as e:
+                    logging.error(f"处理结果时发生错误: {e}")
+                    original_question, question_index = future_to_question[future]
+
+                    # Build a failure record with fallback options.
+                    failed_result = original_question.copy()
+                    failed_result["generated_options"] = generator._create_fallback_options(original_question)
+                    failed_result["generation_status"] = "processing_failed"
+                    failed_result["error_message"] = str(e)
+                    failed_result["question_index"] = question_index
+
+                    temp_results.append(failed_result)
+                pbar.update(1)
+
+    # Restore submission order, then drop the temporary index field.
+    complete_questions = sorted(temp_results, key=lambda x: x.get("question_index", 0))
+    for question in complete_questions:
+        question.pop("question_index", None)
+
+    # Save the final results.
+    with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
+        json.dump(complete_questions, f, ensure_ascii=False, indent=2)
+
+    # Summary statistics.
+    total_count = len(complete_questions)
+    success_count = sum(1 for q in complete_questions if q.get("generation_status") == "success")
+    failed_count = total_count - success_count
+    # Guard against an empty input file (would otherwise divide by zero).
+    success_percent = success_count / total_count * 100 if total_count else 0.0
+
+    print(f"\n完成！总共处理了 {total_count} 道题目")
+    print(f"成功生成: {success_count} 道")
+    print(f"使用备用方案: {failed_count} 道")
+    print(f"成功率: {success_percent:.2f}%")
+    print(f"结果已保存到: {OUTPUT_FILE}")
+
+    # Per-type success statistics.
+    type_success = {}
+    type_total = {}
+    for q in complete_questions:
+        qtype = q.get("question_type", "unknown")
+        type_total[qtype] = type_total.get(qtype, 0) + 1
+        if q.get("generation_status") == "success":
+            type_success[qtype] = type_success.get(qtype, 0) + 1
+
+    print("\n各题型处理结果:")
+    for qtype, total in type_total.items():
+        success = type_success.get(qtype, 0)
+        success_rate = success / total * 100 if total > 0 else 0
+        print(f"  {qtype}: {success}/{total} ({success_rate:.1f}%)")
+
+    # Breakdown of failure reasons.
+    if failed_count > 0:
+        failure_reasons = {}
+        for q in complete_questions:
+            if q.get("generation_status") != "success":
+                reason = q.get("error_message", "未知错误")
+                failure_reasons[reason] = failure_reasons.get(reason, 0) + 1
+
+        print("\n失败原因统计：")
+        for reason, count in failure_reasons.items():
+            print(f"  {reason}: {count} 道题")
+
+if __name__ == "__main__":
+    main()
--- a/layer2/PGEE/code/stepz_final_choice_questions.json
+++ b/layer2/PGEE/code/stepz_final_choice_questions.json
--- a/layer2/PGEE/code/stepz_final_format_convert.py
+++ b/layer2/PGEE/code/stepz_final_format_convert.py
@@ -0,0 +1,244 @@
+import json
+from typing import Dict, Any, List, Optional
+
+def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
+    """
+    Convert one generated question into the evaluation-framework format.
+
+    Only multiple-choice items are converted. The checks mirror what
+    ``validate_converted_questions`` later expects, so that nothing written
+    here is rejected there: the option set must be a dict with exactly the
+    keys A, B, C and D, every option text and the question must be non-blank,
+    and the correct answer must be one of the four labels.
+
+    Args:
+        source_data: Source record; must contain ``generated_options`` and
+            ``choice_question``.
+
+    Returns:
+        The converted record, or None when the record is not a well-formed
+        multiple-choice question.
+    """
+    labels = ["A", "B", "C", "D"]
+
+    # The generated payload must be present and be a mapping.
+    generated_options = source_data.get("generated_options")
+    if not isinstance(generated_options, dict):
+        return None
+
+    # Only multiple-choice items are handled; true/false items are skipped.
+    if generated_options.get("question_type") != "multiple_choice":
+        return None
+
+    question = source_data.get("choice_question", "")
+    if not isinstance(question, str) or not question.strip():
+        return None
+
+    # Exactly the keys A-D: a length check alone would let e.g. A/B/C/E through
+    # and silently produce an empty choice for D.
+    options = generated_options.get("options", {})
+    if not isinstance(options, dict) or set(options) != set(labels):
+        return None
+    if any(not str(options[label]).strip() for label in labels):
+        return None
+
+    correct_answer = generated_options.get("correct_answer", "")
+    if correct_answer not in labels:
+        return None
+
+    return {
+        "question": question,
+        "choices": {
+            "text": [options[label] for label in labels],
+            "label": labels
+        },
+        "answer": f"[ANSWER]{correct_answer}[/ANSWER]",
+        "prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
+    }
+
+def batch_convert_questions(input_file: str, output_file: str) -> None:
+    """
+    Convert every question in a JSON file and write the converted list.
+
+    Records that cannot be converted (true/false items, malformed records, or
+    records whose conversion raises) are counted as failed and left out of the
+    output. Summary statistics are printed at the end.
+
+    Args:
+        input_file: Path of the JSON file holding the generated questions.
+        output_file: Path the converted questions are written to.
+    """
+    print("正在加载数据...")
+    with open(input_file, 'r', encoding='utf-8') as f:
+        source_questions = json.load(f)
+
+    print(f"加载了 {len(source_questions)} 道题目")
+
+    converted_questions = []
+    conversion_stats = {
+        "total": len(source_questions),
+        "multiple_choice": 0,
+        "true_false": 0,
+        "other": 0,
+        "converted": 0,
+        "failed": 0
+    }
+
+    for i, question in enumerate(source_questions):
+        try:
+            # Tally the generated question type; unknown types count as "other".
+            generated_options = question.get("generated_options", {})
+            question_type = generated_options.get("question_type", "unknown")
+            type_key = question_type if question_type in ("multiple_choice", "true_false") else "other"
+            conversion_stats[type_key] += 1
+
+            converted = convert_to_target_format(question)
+            if converted:
+                converted_questions.append(converted)
+                conversion_stats["converted"] += 1
+            else:
+                conversion_stats["failed"] += 1
+
+        except Exception as e:
+            print(f"第{i+1}题转换失败: {e}")
+            conversion_stats["failed"] += 1
+
+    # Save the results.
+    print("正在保存转换结果...")
+    with open(output_file, 'w', encoding='utf-8') as f:
+        json.dump(converted_questions, f, ensure_ascii=False, indent=2)
+
+    # An empty input list must not crash the report with a division by zero.
+    total = conversion_stats['total']
+    conversion_rate = conversion_stats['converted'] / total * 100 if total else 0.0
+
+    print(f"\n转换完成！")
+    print(f"总题目数: {conversion_stats['total']}")
+    print(f"单选题: {conversion_stats['multiple_choice']}")
+    print(f"判断题: {conversion_stats['true_false']}")
+    print(f"其他类型: {conversion_stats['other']}")
+    print(f"成功转换: {conversion_stats['converted']}")
+    print(f"转换失败: {conversion_stats['failed']}")
+    print(f"转换率: {conversion_rate:.1f}%")
+    print(f"结果已保存到: {output_file}")
+
+def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, int]:
+    """
+    Check converted questions against the target format and count problems.
+
+    A question is valid when it has a non-blank ``question``, four non-blank
+    choice texts labelled A-D in order, and an ``answer`` of the form
+    ``[ANSWER]X[/ANSWER]`` with X in A-D. The 1-based position of each invalid
+    question is printed.
+
+    Args:
+        questions: Converted question records.
+
+    Returns:
+        Counters: total, valid, invalid, missing_question, invalid_choices,
+        invalid_answer.
+    """
+    expected_labels = ["A", "B", "C", "D"]
+    stats = dict.fromkeys(
+        ["total", "valid", "invalid", "missing_question", "invalid_choices", "invalid_answer"], 0
+    )
+    stats["total"] = len(questions)
+
+    for position, item in enumerate(questions, start=1):
+        problems = []
+
+        # Question text.
+        if not item.get("question", "").strip():
+            problems.append("missing_question")
+
+        # Choice texts and labels.
+        choice_block = item.get("choices", {})
+        texts = choice_block.get("text", [])
+        found_labels = choice_block.get("label", [])
+        choices_ok = (len(texts) == 4 and len(found_labels) == 4 and
+                      found_labels == expected_labels and
+                      all(str(text).strip() for text in texts))
+        if not choices_ok:
+            problems.append("invalid_choices")
+
+        # Answer tag; [8:-9] strips "[ANSWER]" and "[/ANSWER]".
+        answer = item.get("answer", "")
+        answer_ok = (answer.startswith("[ANSWER]") and answer.endswith("[/ANSWER]") and
+                     answer[8:-9] in expected_labels)
+        if not answer_ok:
+            problems.append("invalid_answer")
+
+        for problem in problems:
+            stats[problem] += 1
+
+        if problems:
+            stats["invalid"] += 1
+            print(f"第{position}题格式无效")
+        else:
+            stats["valid"] += 1
+
+    return stats
+
+def main():
+    """Convert the generated questions, then re-read and validate the output.
+
+    Any exception is caught and reported with a single message.
+    """
+    # File locations.
+    INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions.json"
+    OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
+
+    try:
+        # Batch conversion.
+        batch_convert_questions(INPUT_FILE, OUTPUT_FILE)
+
+        # Validate what was actually written to disk.
+        print("\n正在验证转换结果...")
+        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
+            converted_questions = json.load(f)
+
+        validation_stats = validate_converted_questions(converted_questions)
+
+        print(f"\n验证结果:")
+        print(f"总题目数: {validation_stats['total']}")
+        print(f"格式正确: {validation_stats['valid']}")
+        print(f"格式错误: {validation_stats['invalid']}")
+
+        if validation_stats['invalid'] > 0:
+            print(f"  缺少题目: {validation_stats['missing_question']}")
+            print(f"  选项格式错误: {validation_stats['invalid_choices']}")
+            print(f"  答案格式错误: {validation_stats['invalid_answer']}")
+
+        # With no converted questions the rate would divide by zero, and the
+        # run would be reported as a generic failure by the handler below.
+        total = validation_stats['total']
+        valid_rate = validation_stats['valid'] / total * 100 if total else 0.0
+        print(f"格式正确率: {valid_rate:.1f}%")
+
+    except Exception as e:
+        print(f"程序执行失败: {e}")
+
+def test_single_conversion():
+    """Run the converter on one hard-coded sample record and print the outcome."""
+    generated = {
+        "question_type": "multiple_choice",
+        "options": {
+            "A": "80%",
+            "B": "90%",
+            "C": "50%",
+            "D": "75%"
+        },
+        "correct_answer": "B",
+        "explanation": "正确答案90%基于：1) fcc中四面体间隙数量是阳离子的2倍；2) 20 mol% CaO掺杂产生20%氧空位；3) 被占据间隙位比例=(原始占据数-空位数)/总间隙位数。"
+    }
+    sample = {
+        "idx": 3154,
+        "question": "In stable ZrO2 material, cations form an fcc structure, and anions occupy tetrahedral interstitial sites. If 20 mol% CaO is added, calculate the percentage of occupied tetrahedral interstitial sites.",
+        "answer": "Zr4+ and Ca2+ cations occupy the face-centered cubic lattice sites. 100 cations can form 25 unit cells, with a total of 25×8=200 tetrahedral interstitial sites. Therefore, the percentage of occupied tetrahedral interstitial sites is 180÷200=90%.",
+        "question_type": "calculation",
+        "correct_option": "90%",
+        "choice_question": "In stable ZrO2 material, cations form an fcc structure, and anions occupy tetrahedral interstitial sites. If 20 mol% CaO is added, what is the percentage of occupied tetrahedral interstitial sites?",
+        "generated_options": generated,
+        "generation_status": "success"
+    }
+
+    converted = convert_to_target_format(sample)
+    if not converted:
+        print("转换失败！")
+        return
+
+    print("转换成功！")
+    print(json.dumps(converted, ensure_ascii=False, indent=2))
+
+if __name__ == "__main__":
+    # Optionally run the single-record check first
+    # test_single_conversion()
+    
+    # Run the full conversion and validation
+    main()