diff --git a/eval_framework/config/config.yaml b/eval_framework/config/config.yaml
new file mode 100644
index 0000000..8ef7f91
--- /dev/null
+++ b/eval_framework/config/config.yaml
@@ -0,0 +1,36 @@
+# API configuration
+api:
+  key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
+  base_url: "https://vip.apiyi.com/v1"
+  temperature: 0
+  max_retries: 10
+  # Multiple models are supported
+  models:
+    - "qwen-max-2025-01-25"
+    - "gpt-4o"
+  # Or use a single model (backward compatible)
+  # model: "qwen-max-2025-01-25"
+
+# System prompt
+system_prompt: "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications."
+
+# Evaluation configuration
+evaluation:
+  max_workers: 8
+  input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
+  # Output configuration
+  output:
+    base_dir: "results"
+    auto_timestamp: true
+    filename_template: "{model}.json"
+    summary_filename: "summary.json"
+    # Output format options
+    export_formats:
+      - "json"   # Detailed JSON results
+      - "csv"    # CSV table
+      - "excel"  # Excel workbook (requires openpyxl)
+
+# Logging configuration
+logging:
+  level: "INFO"
+  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
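For orientation, the snippet below is a minimal sketch of how this file is consumed: the api.models list is the preferred multi-model form, and the commented-out api.model key is the single-model fallback, mirroring get_models_from_config in eval_framework/src/utils.py. The path is assumed to be relative to the repository root.

import yaml

# Minimal sketch: load the config and resolve the model list.
with open("eval_framework/config/config.yaml", "r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

api_cfg = cfg["api"]
# Prefer the "models" list; fall back to the single "model" entry if present.
models = api_cfg.get("models") or ([api_cfg["model"]] if "model" in api_cfg else [])
print(models)  # ['qwen-max-2025-01-25', 'gpt-4o'] with the config above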
diff --git a/eval_framework/main.py b/eval_framework/main.py
new file mode 100644
index 0000000..6c7bf2b
--- /dev/null
+++ b/eval_framework/main.py
@@ -0,0 +1,164 @@
+import argparse
+import logging
+from pathlib import Path
+from typing import Dict, Any
+
+from src import (
+    DataLoader, LLMClient, Evaluator,
+    load_config, save_results, save_metrics, save_summary,
+    setup_logging, print_metrics, print_summary,
+    get_models_from_config, generate_output_dir, generate_model_output_path
+)
+
+logger = logging.getLogger(__name__)
+
+def evaluate_single_model(
+    model_name: str,
+    data: list,
+    config: Dict[str, Any],
+    output_dir: str
+) -> Dict[str, Any]:
+    """
+    Evaluate a single model.
+
+    Args:
+        model_name: Model name
+        data: Evaluation data
+        config: Configuration dictionary
+        output_dir: Output directory
+
+    Returns:
+        Dictionary containing metrics and results
+    """
+    logger.info(f"Starting evaluation for model: {model_name}")
+
+    # Initialize the LLM client
+    llm_client = LLMClient(
+        api_key=config['api']['key'],
+        base_url=config['api']['base_url'],
+        model=model_name,
+        temperature=config['api']['temperature'],
+        max_retries=config['api']['max_retries']
+    )
+
+    # Initialize the evaluator
+    evaluator = Evaluator(
+        llm_client=llm_client,
+        system_prompt=config['system_prompt']
+    )
+
+    # Run the evaluation
+    max_workers = config['evaluation']['max_workers']
+    metrics, results = evaluator.evaluate(data, max_workers=max_workers)
+
+    # Build the output file path
+    filename_template = config['evaluation']['output']['filename_template']
+    output_file = generate_model_output_path(output_dir, model_name, filename_template)
+
+    # Save results and metrics
+    save_results(results, output_file)
+    save_metrics(metrics, output_file)
+
+    logger.info(f"Model {model_name} evaluation completed. Results saved to {output_file}")
+
+    return {
+        "metrics": metrics,
+        "results": results,
+        "output_file": output_file
+    }
+
+def main():
+    parser = argparse.ArgumentParser(description="Materials science LLM evaluation framework")
+    parser.add_argument("--config", default="eval_framework/config/config.yaml", help="Path to the config file")
+    parser.add_argument("--input", help="Input data file path (overrides the config file)")
+    parser.add_argument("--output-dir", help="Output directory path (overrides the config file)")
+    parser.add_argument("--workers", type=int, help="Number of worker threads (overrides the config file)")
+    parser.add_argument("--models", nargs="+", help="List of models to evaluate (overrides the config file)")
+    parser.add_argument("--no-timestamp", action="store_true", help="Do not create a timestamped output folder")
+
+    args = parser.parse_args()
+
+    # Load the configuration
+    config = load_config(args.config)
+
+    # Disable the timestamped folder if requested
+    if args.no_timestamp:
+        config['evaluation']['output']['auto_timestamp'] = False
+
+    # Set up logging
+    setup_logging(
+        level=config.get('logging', {}).get('level', 'INFO'),
+        format_str=config.get('logging', {}).get('format')
+    )
+
+    logger.info("Starting multi-model evaluation framework")
+
+    # Resolve the input path and worker count
+    input_file = args.input or config['evaluation']['input_file']
+    if args.workers:
+        config['evaluation']['max_workers'] = args.workers
+
+    # Resolve the list of models
+    if args.models:
+        models = args.models
+        logger.info(f"Using models from command line: {models}")
+    else:
+        models = get_models_from_config(config)
+        logger.info(f"Using models from config: {models}")
+
+    # Create the output directory
+    if args.output_dir:
+        output_dir = args.output_dir
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+    else:
+        output_dir = generate_output_dir(config)
+
+    logger.info(f"Output directory: {output_dir}")
+
+    try:
+        # Load the data
+        logger.info(f"Loading data from {input_file}")
+        data = DataLoader.load_and_validate_data(input_file)
+
+        if not data:
+            logger.error("No valid data found")
+            return
+
+        logger.info(f"Loaded {len(data)} valid data items")
+
+        # Collect results for all models
+        all_results = {}
+
+        # Evaluate the models one by one
+        for i, model_name in enumerate(models, 1):
+            logger.info(f"Evaluating model {i}/{len(models)}: {model_name}")
+
+            try:
+                model_result = evaluate_single_model(model_name, data[:10], config, output_dir)
+                all_results[model_name] = model_result
+
+                # Print the current model's metrics
+                print_metrics(model_result["metrics"], model_name)
+
+            except Exception as e:
+                logger.error(f"Failed to evaluate model {model_name}: {e}")
+                continue
+
+        # Save the aggregated summary
+        if all_results:
+            summary_filename = config['evaluation']['output']['summary_filename']
+            save_summary(all_results, output_dir, summary_filename)
+
+            # Print the cross-model comparison
+            print_summary(all_results)
+
+            logger.info(f"Summary saved to {Path(output_dir) / summary_filename}")
+
+        logger.info("Multi-model evaluation completed successfully")
+
+    except Exception as e:
+        logger.error(f"Evaluation failed: {e}")
+        raise
+
+if __name__ == "__main__":
+    main()
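The entry point can also be driven programmatically. The following is a hedged sketch that chains the functions introduced in this diff (load_config, DataLoader.load_and_validate_data, generate_output_dir, get_models_from_config, evaluate_single_model); the sys.path tweak and the config path are assumptions for illustration, and the input_file in the config points at the author's dataset.

# Equivalent CLI usage (run from the repository root, illustrative):
#   python eval_framework/main.py --models gpt-4o --workers 4 --no-timestamp
import sys
sys.path.insert(0, "eval_framework")  # so "from src import ..." resolves, as main.py expects

from main import evaluate_single_model
from src import DataLoader, load_config, get_models_from_config, generate_output_dir

config = load_config("eval_framework/config/config.yaml")
data = DataLoader.load_and_validate_data(config['evaluation']['input_file'])
output_dir = generate_output_dir(config)

for model_name in get_models_from_config(config):
    result = evaluate_single_model(model_name, data, config, output_dir)
    print(model_name, result["metrics"])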
diff --git a/eval_framework/src/__init__.py b/eval_framework/src/__init__.py
new file mode 100644
index 0000000..d534827
--- /dev/null
+++ b/eval_framework/src/__init__.py
@@ -0,0 +1,26 @@
+from .data_loader import DataLoader
+from .llm_client import LLMClient
+from .evaluator import Evaluator
+from .metrics import MetricsCalculator
+from .utils import (
+    load_config, save_results, save_metrics, save_summary,
+    setup_logging, print_metrics, print_summary,
+    get_models_from_config, generate_output_dir, generate_model_output_path
+)
+
+__all__ = [
+    'DataLoader',
+    'LLMClient',
+    'Evaluator',
+    'MetricsCalculator',
+    'load_config',
+    'save_results',
+    'save_metrics',
+    'save_summary',
+    'setup_logging',
+    'print_metrics',
+    'print_summary',
+    'get_models_from_config',
+    'generate_output_dir',
+    'generate_model_output_path'
+]
diff --git a/eval_framework/src/__pycache__/__init__.cpython-311.pyc b/eval_framework/src/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..430cb87
Binary files /dev/null and b/eval_framework/src/__pycache__/__init__.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/__init__.cpython-312.pyc b/eval_framework/src/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000..58a17be
Binary files /dev/null and b/eval_framework/src/__pycache__/__init__.cpython-312.pyc differ
diff --git a/eval_framework/src/__pycache__/data_loader.cpython-311.pyc b/eval_framework/src/__pycache__/data_loader.cpython-311.pyc
new file mode 100644
index 0000000..07ff9cb
Binary files /dev/null and b/eval_framework/src/__pycache__/data_loader.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/data_loader.cpython-312.pyc b/eval_framework/src/__pycache__/data_loader.cpython-312.pyc
new file mode 100644
index 0000000..5414c43
Binary files /dev/null and b/eval_framework/src/__pycache__/data_loader.cpython-312.pyc differ
diff --git a/eval_framework/src/__pycache__/evaluator.cpython-311.pyc b/eval_framework/src/__pycache__/evaluator.cpython-311.pyc
new file mode 100644
index 0000000..2ac9afd
Binary files /dev/null and b/eval_framework/src/__pycache__/evaluator.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/evaluator.cpython-312.pyc b/eval_framework/src/__pycache__/evaluator.cpython-312.pyc
new file mode 100644
index 0000000..b5e2a0e
Binary files /dev/null and b/eval_framework/src/__pycache__/evaluator.cpython-312.pyc differ
diff --git a/eval_framework/src/__pycache__/llm_client.cpython-311.pyc b/eval_framework/src/__pycache__/llm_client.cpython-311.pyc
new file mode 100644
index 0000000..ddffac6
Binary files /dev/null and b/eval_framework/src/__pycache__/llm_client.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/llm_client.cpython-312.pyc b/eval_framework/src/__pycache__/llm_client.cpython-312.pyc
new file mode 100644
index 0000000..5f212b7
Binary files /dev/null and b/eval_framework/src/__pycache__/llm_client.cpython-312.pyc differ
diff --git a/eval_framework/src/__pycache__/metrics.cpython-311.pyc b/eval_framework/src/__pycache__/metrics.cpython-311.pyc
new file mode 100644
index 0000000..4ba0133
Binary files /dev/null and b/eval_framework/src/__pycache__/metrics.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/metrics.cpython-312.pyc b/eval_framework/src/__pycache__/metrics.cpython-312.pyc
new file mode 100644
index 0000000..fd3282b
Binary files /dev/null and b/eval_framework/src/__pycache__/metrics.cpython-312.pyc differ
diff --git a/eval_framework/src/__pycache__/utils.cpython-311.pyc b/eval_framework/src/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000..40fd14f
Binary files /dev/null and b/eval_framework/src/__pycache__/utils.cpython-311.pyc differ
diff --git a/eval_framework/src/__pycache__/utils.cpython-312.pyc b/eval_framework/src/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000..487e507
Binary files /dev/null and b/eval_framework/src/__pycache__/utils.cpython-312.pyc differ
"""数据加载器,负责加载和验证数据""" + + @staticmethod + def load_json_data(filepath: str) -> List[Dict[str, Any]]: + """ + 从JSON文件加载数据 + + Args: + filepath: JSON文件路径 + + Returns: + 加载的数据列表 + + Raises: + FileNotFoundError: 文件不存在 + json.JSONDecodeError: JSON格式错误 + """ + try: + with open(filepath, 'r', encoding='utf-8') as file: + data = json.load(file) + logger.info(f"Successfully loaded {len(data)} items from {filepath}") + return data + except FileNotFoundError: + logger.error(f"File not found: {filepath}") + raise + except json.JSONDecodeError as e: + logger.error(f"JSON decode error in {filepath}: {e}") + raise + + @staticmethod + def validate_data_item(item: Dict[str, Any]) -> bool: + """ + 验证数据项是否包含必要字段 + + Args: + item: 数据项 + + Returns: + 是否有效 + """ + required_fields = ['question', 'choices', 'answer', 'prompt'] + for field in required_fields: + if field not in item: + logger.warning(f"Missing required field: {field}") + return False + + if 'text' not in item['choices'] or 'label' not in item['choices']: + logger.warning("Missing 'text' or 'label' in choices") + return False + + return True + + @classmethod + def load_and_validate_data(cls, filepath: str) -> List[Dict[str, Any]]: + """ + 加载并验证数据 + + Args: + filepath: JSON文件路径 + + Returns: + 验证后的数据列表 + """ + data = cls.load_json_data(filepath) + valid_data = [] + + for i, item in enumerate(data): + if cls.validate_data_item(item): + valid_data.append(item) + else: + logger.warning(f"Invalid data item at index {i}, skipping") + + logger.info(f"Validated {len(valid_data)} out of {len(data)} items") + return valid_data diff --git a/eval_framework/src/evaluator.py b/eval_framework/src/evaluator.py new file mode 100644 index 0000000..2d4254b --- /dev/null +++ b/eval_framework/src/evaluator.py @@ -0,0 +1,98 @@ +import logging +import concurrent.futures +from typing import List, Dict, Any, Tuple +from tqdm import tqdm + +from .llm_client import LLMClient +from .metrics import MetricsCalculator + +logger = logging.getLogger(__name__) + +class Evaluator: + """评估器,协调整个评估流程""" + + def __init__(self, llm_client: LLMClient, system_prompt: str): + """ + 初始化评估器 + + Args: + llm_client: LLM客户端 + system_prompt: 系统提示词 + """ + self.llm_client = llm_client + self.system_prompt = system_prompt + self.metrics_calculator = MetricsCalculator() + + def process_item(self, item: Dict[str, Any], index: int) -> Dict[str, Any]: + """ + 处理单个数据项 + + Args: + item: 数据项 + index: 数据项索引 + + Returns: + 处理结果 + """ + question = item['question'] + text = item['choices']['text'] + label = item['choices']['label'] + prompt = item['prompt'] + expected_answer = item['answer'].strip() + + # 格式化选择项 + formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)]) + user_input = f"{question} {formatted_choices}. 
{prompt}" + + # 获取LLM响应 + llm_answer = self.llm_client.get_response(user_input, self.system_prompt) + + return { + 'index': index, + 'question': question, + 'choices': item['choices'], + 'answer': expected_answer, + 'llm_answer': llm_answer + } + + def evaluate(self, data: List[Dict[str, Any]], max_workers: int = 5) -> Tuple[Dict[str, float], List[Dict[str, Any]]]: + """ + 评估数据集 + + Args: + data: 数据集 + max_workers: 最大工作线程数 + + Returns: + 评估指标和详细结果 + """ + results = [] + + logger.info(f"Starting evaluation with {max_workers} workers") + + with tqdm(total=len(data), desc="Processing items") as pbar: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务 + future_to_index = { + executor.submit(self.process_item, item, i): i + for i, item in enumerate(data) + } + + # 收集结果 + for future in concurrent.futures.as_completed(future_to_index): + try: + result = future.result() + results.append(result) + pbar.update(1) + except Exception as e: + logger.error(f"Error processing item: {e}") + pbar.update(1) + + # 按索引排序结果 + results.sort(key=lambda x: x['index']) + + # 计算指标 + metrics = self.metrics_calculator.compute_metrics(results) + + logger.info("Evaluation completed successfully") + return metrics, results diff --git a/eval_framework/src/llm_client.py b/eval_framework/src/llm_client.py new file mode 100644 index 0000000..6ce60ef --- /dev/null +++ b/eval_framework/src/llm_client.py @@ -0,0 +1,60 @@ +import logging +import time +from typing import Optional +from openai import OpenAI + +logger = logging.getLogger(__name__) + +class LLMClient: + """LLM客户端,负责与API交互""" + + def __init__(self, api_key: str, base_url: str, model: str, + temperature: float = 0, max_retries: int = 10): + """ + 初始化LLM客户端 + + Args: + api_key: API密钥 + base_url: API基础URL + model: 模型名称 + temperature: 温度参数 + max_retries: 最大重试次数 + """ + self.client = OpenAI(api_key=api_key, base_url=base_url) + self.model = model + self.temperature = temperature + self.max_retries = max_retries + + def get_response(self, user_input: str, system_prompt: str) -> str: + """ + 获取LLM响应 + + Args: + user_input: 用户输入 + system_prompt: 系统提示词 + + Returns: + LLM响应,失败时返回"error!" + """ + retries = 0 + while retries < self.max_retries: + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_input} + ], + temperature=self.temperature + ) + answer = response.choices[0].message.content + return answer + + except Exception as e: + retries += 1 + logger.warning(f"API call failed (Attempt {retries}/{self.max_retries}): {e}") + if retries < self.max_retries: + time.sleep(2 ** retries) # 指数退避 + + logger.error(f"Failed to get response after {self.max_retries} attempts") + return "error!" 
diff --git a/eval_framework/src/llm_client.py b/eval_framework/src/llm_client.py
new file mode 100644
index 0000000..6ce60ef
--- /dev/null
+++ b/eval_framework/src/llm_client.py
@@ -0,0 +1,60 @@
+import logging
+import time
+from typing import Optional
+from openai import OpenAI
+
+logger = logging.getLogger(__name__)
+
+class LLMClient:
+    """LLM client responsible for talking to the API."""
+
+    def __init__(self, api_key: str, base_url: str, model: str,
+                 temperature: float = 0, max_retries: int = 10):
+        """
+        Initialize the LLM client.
+
+        Args:
+            api_key: API key
+            base_url: API base URL
+            model: Model name
+            temperature: Sampling temperature
+            max_retries: Maximum number of retries
+        """
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.model = model
+        self.temperature = temperature
+        self.max_retries = max_retries
+
+    def get_response(self, user_input: str, system_prompt: str) -> str:
+        """
+        Get a response from the LLM.
+
+        Args:
+            user_input: The user input
+            system_prompt: The system prompt
+
+        Returns:
+            The LLM response, or "error!" on failure
+        """
+        retries = 0
+        while retries < self.max_retries:
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_input}
+                    ],
+                    temperature=self.temperature
+                )
+                answer = response.choices[0].message.content
+                return answer
+
+            except Exception as e:
+                retries += 1
+                logger.warning(f"API call failed (Attempt {retries}/{self.max_retries}): {e}")
+                if retries < self.max_retries:
+                    time.sleep(2 ** retries)  # Exponential backoff
+
+        logger.error(f"Failed to get response after {self.max_retries} attempts")
+        return "error!"
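A short usage sketch for the client; the key, base URL, and question are placeholders. Note that the retry loop sleeps 2**retries seconds between attempts (2 s, 4 s, 8 s, ...) and returns the literal string "error!" once max_retries is exhausted, so callers should treat that value as a sentinel.

from src import LLMClient  # assumes cwd is eval_framework/

client = LLMClient(
    api_key="sk-...",                       # placeholder
    base_url="https://api.example.com/v1",  # placeholder
    model="gpt-4o",
    temperature=0,
    max_retries=3,
)
reply = client.get_response(
    "What is the coordination number of an FCC lattice? "
    "(A) 6 (B) 8 (C) 12. Answer with [ANSWER]...[/ANSWER].",
    system_prompt="You are an expert in materials science.",
)
if reply == "error!":   # sentinel returned after all retries fail
    print("API call failed")
else:
    print(reply)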
diff --git a/eval_framework/src/metrics.py b/eval_framework/src/metrics.py
new file mode 100644
index 0000000..dcfb93d
--- /dev/null
+++ b/eval_framework/src/metrics.py
@@ -0,0 +1,111 @@
+import re
+import numpy as np
+from typing import List, Dict, Any, Optional
+from sklearn.metrics import precision_score, recall_score, f1_score
+import logging
+
+logger = logging.getLogger(__name__)
+
+class MetricsCalculator:
+    """Calculator for evaluation metrics."""
+
+    @staticmethod
+    def extract_answer(answer_string: str) -> Optional[str]:
+        """
+        Extract the answer from a response string.
+
+        Args:
+            answer_string: String containing the answer
+
+        Returns:
+            The extracted answer, or None if not found
+        """
+        if not answer_string:
+            return None
+
+        match = re.search(r'\[ANSWER\](.*?)\[/ANSWER\]', answer_string)
+        if match:
+            return match.group(1).strip()
+        return None
+
+    @staticmethod
+    def parse_answer(answer: Optional[str]) -> List[str]:
+        """
+        Parse an answer string into a list of labels.
+
+        Args:
+            answer: The answer string
+
+        Returns:
+            List of answer labels
+        """
+        if answer is None:
+            return []
+        return [a.strip() for a in answer.split(',')]
+
+    @classmethod
+    def compute_metrics(cls, data: List[Dict[str, Any]]) -> Dict[str, float]:
+        """
+        Compute evaluation metrics.
+
+        Args:
+            data: Items containing ground-truth and predicted answers
+
+        Returns:
+            Dictionary of evaluation metrics
+        """
+        true_answers = []
+        pred_answers = []
+
+        # Extract and parse answers
+        for item in data:
+            true_ans = cls.extract_answer(item["answer"])
+            pred_ans = cls.extract_answer(item["llm_answer"])
+
+            true_answers.append(cls.parse_answer(true_ans))
+            pred_answers.append(cls.parse_answer(pred_ans))
+
+        # Compute exact-match accuracy
+        correct_counts = []
+        for true_ans, pred_ans in zip(true_answers, pred_answers):
+            if true_ans and pred_ans and set(true_ans) == set(pred_ans):
+                correct_counts.append(1)
+            else:
+                correct_counts.append(0)
+
+        accuracy = np.mean(correct_counts)
+
+        # Build multi-label indicator vectors
+        all_labels = set()
+        for item in data:
+            choices = item["choices"]["label"]
+            for label in choices:
+                all_labels.add(label)
+
+        all_labels = sorted(list(all_labels))
+
+        y_true_multi = []
+        y_pred_multi = []
+
+        for true_ans, pred_ans in zip(true_answers, pred_answers):
+            true_vector = [1 if label in (true_ans or []) else 0 for label in all_labels]
+            pred_vector = [1 if label in (pred_ans or []) else 0 for label in all_labels]
+            y_true_multi.append(true_vector)
+            y_pred_multi.append(pred_vector)
+
+        y_true_multi = np.array(y_true_multi)
+        y_pred_multi = np.array(y_pred_multi)
+
+        # Compute the individual metrics
+        metrics = {
+            "accuracy": accuracy,
+            "precision_micro": precision_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "recall_micro": recall_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "f1_micro": f1_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "precision_macro": precision_score(y_true_multi, y_pred_multi, average='macro', zero_division=0),
+            "recall_macro": recall_score(y_true_multi, y_pred_multi, average='macro', zero_division=0),
+            "f1_macro": f1_score(y_true_multi, y_pred_multi, average='macro', zero_division=0)
+        }
+
+        logger.info("Metrics computed successfully")
+        return metrics
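A worked example of the scoring: answers are pulled from [ANSWER]...[/ANSWER], split on commas, and an item counts as correct only when the two label sets match exactly; precision/recall/F1 are then computed on the multi-label indicator vectors. The item below is invented and assumes the working directory is eval_framework/.

from src import MetricsCalculator  # assumes cwd is eval_framework/

print(MetricsCalculator.extract_answer("Reasoning... [ANSWER]A, C[/ANSWER]"))  # -> "A, C"
print(MetricsCalculator.parse_answer("A, C"))                                  # -> ['A', 'C']

# Accuracy is an exact set match: {"A", "C"} == {"C", "A"} counts as correct,
# while a partial match such as {"A"} vs {"A", "C"} counts as wrong.
item = {
    "choices": {"label": ["A", "B", "C", "D"]},
    "answer": "[ANSWER]A, C[/ANSWER]",
    "llm_answer": "The best answers are [ANSWER]C, A[/ANSWER]",
}
print(MetricsCalculator.compute_metrics([item]))  # accuracy 1.0; micro/macro P/R/F1 follow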
summary_data["models"][model_name] = { + "metrics": model_result["metrics"], + "data_count": len(model_result["results"]) + } + + # 添加模型对比表 + if len(all_results) > 1: + comparison = {} + metric_names = [col for col in df.columns if col != "Data Count"] + + for metric in metric_names: + comparison[metric] = df[metric].to_dict() + + summary_data["comparison"] = comparison + + with open(summary_path, 'w', encoding='utf-8') as f: + json.dump(summary_data, f, indent=2, ensure_ascii=False) + + # 保存CSV格式的汇总表格 + csv_filename = summary_filename.replace('.json', '.csv') + csv_path = output_path / csv_filename + + # 重置索引以便模型名称也作为列保存 + df_for_csv = df.reset_index() + df_for_csv.to_csv(csv_path, index=False, encoding='utf-8') + + # 保存Excel格式(如果需要) + excel_filename = summary_filename.replace('.json', '.xlsx') + excel_path = output_path / excel_filename + + try: + # 创建Excel文件,包含多个工作表 + with pd.ExcelWriter(excel_path, engine='openpyxl') as writer: + # 主要结果表 + df_for_csv.to_excel(writer, sheet_name='Summary', index=False) + + # 如果有多个模型,创建排名表 + if len(all_results) > 1: + ranking_df = create_ranking_dataframe(df) + ranking_df.to_excel(writer, sheet_name='Rankings', index=False) + + except ImportError: + logging.warning("openpyxl not installed, skipping Excel export") + + logging.info(f"Summary saved to {summary_path}") + logging.info(f"CSV summary saved to {csv_path}") + +def create_ranking_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + 创建模型排名DataFrame + + Args: + df: 原始结果DataFrame + + Returns: + 包含排名的DataFrame + """ + # 排除非指标列 + metric_columns = [col for col in df.columns if col != "Data Count"] + + # 为每个指标创建排名(假设数值越大越好,可以根据需要调整) + ranking_data = [] + + for metric in metric_columns: + # 创建排名(降序,数值越大排名越前) + ranks = df[metric].rank(method='min', ascending=False) + + for model_name in df.index: + ranking_data.append({ + 'Model': model_name, + 'Metric': metric, + 'Value': df.loc[model_name, metric], + 'Rank': int(ranks[model_name]) + }) + + ranking_df = pd.DataFrame(ranking_data) + return ranking_df + +def print_summary(all_results: Dict[str, Dict]) -> None: + """ + 打印所有模型的汇总结果 + + Args: + all_results: 所有模型的结果字典 + """ + print("\n" + "="*100) + print("SUMMARY - ALL MODELS COMPARISON") + print("="*100) + + if not all_results: + print("No results to display") + return + + # 创建DataFrame + df = create_results_dataframe(all_results) + + if df.empty: + print("No valid results to display") + return + + # 使用tabulate打印美观的表格 + print(tabulate( + df, + headers=df.columns, + tablefmt='grid', + floatfmt='.4f', + showindex=True + )) + + # 如果有多个模型,显示最佳模型 + if len(all_results) > 1: + print("\n" + "-"*100) + print("BEST PERFORMERS BY METRIC:") + print("-"*100) + + metric_columns = [col for col in df.columns if col != "Data Count"] + + for metric in metric_columns: + best_model = df[metric].idxmax() + best_value = df.loc[best_model, metric] + print(f"{metric.upper():<20}: {best_model:<30} ({best_value:.4f})") + + print("="*100) + +def setup_logging(level: str = "INFO", format_str: str = None, log_dir: str = "logs") -> None: + """ + 设置日志配置 + + Args: + level: 日志级别 + format_str: 日志格式 + log_dir: 日志目录 + """ + if format_str is None: + format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + + # 创建日志目录 + Path(log_dir).mkdir(parents=True, exist_ok=True) + + # 生成日志文件名(包含时间戳) + timestamp = datetime.now().strftime("%Y%m%d_%H%M") + log_file = Path(log_dir) / f"evaluation_{timestamp}.log" + + logging.basicConfig( + level=getattr(logging, level.upper()), + format=format_str, + handlers=[ + logging.StreamHandler(), + 
diff --git a/layer1/ALL-merge/eval.py b/layer1/ALL-merge/eval.py
index f914e59..e69de29 100644
--- a/layer1/ALL-merge/eval.py
+++ b/layer1/ALL-merge/eval.py
@@ -1,166 +0,0 @@
-import json
-import threading
-from tqdm import tqdm
-import concurrent.futures
-from openai import OpenAI
-import numpy as np
-from sklearn.metrics import precision_score, recall_score, f1_score
-import re
-
-client = OpenAI(
-    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
-    base_url="https://vip.apiyi.com/v1"
-)
-
-thread_lock = threading.Lock()
-
-def load_json_data(filepath):
-    with open(filepath, 'r') as file:
-        data = json.load(file)
-    return data
-
-def get_response(input,max_retries=10):
-    retries = 0
-    while retries