From 9abd8fc1c50391e3111feae3ae8b8d560facd140 Mon Sep 17 00:00:00 2001
From: lzy <949777411@qq.com>
Date: Wed, 28 May 2025 15:43:50 +0800
Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84eval=E4=BB=A3=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 eval_framework/config/config.yaml             |  36 ++
 eval_framework/main.py                        | 164 ++++++++
 eval_framework/src/__init__.py                |  26 ++
 .../src/__pycache__/__init__.cpython-311.pyc  | Bin 0 -> 741 bytes
 .../src/__pycache__/__init__.cpython-312.pyc  | Bin 0 -> 736 bytes
 .../__pycache__/data_loader.cpython-311.pyc   | Bin 0 -> 4253 bytes
 .../__pycache__/data_loader.cpython-312.pyc   | Bin 0 -> 3708 bytes
 .../src/__pycache__/evaluator.cpython-311.pyc | Bin 0 -> 5717 bytes
 .../src/__pycache__/evaluator.cpython-312.pyc | Bin 0 -> 4719 bytes
 .../__pycache__/llm_client.cpython-311.pyc    | Bin 0 -> 3002 bytes
 .../__pycache__/llm_client.cpython-312.pyc    | Bin 0 -> 2900 bytes
 .../src/__pycache__/metrics.cpython-311.pyc   | Bin 0 -> 5731 bytes
 .../src/__pycache__/metrics.cpython-312.pyc   | Bin 0 -> 4623 bytes
 .../src/__pycache__/utils.cpython-311.pyc     | Bin 0 -> 6339 bytes
 .../src/__pycache__/utils.cpython-312.pyc     | Bin 0 -> 13397 bytes
 eval_framework/src/data_loader.py             |  81 ++++
 eval_framework/src/evaluator.py               |  98 +++++
 eval_framework/src/llm_client.py              |  60 +++
 eval_framework/src/metrics.py                 | 111 ++++++
 eval_framework/src/utils.py                   | 360 ++++++++++++++++++
 layer1/ALL-merge/eval.py                      | 166 --------
 logs/evaluation_20250528_1530.log             |  40 ++
 logs/evaluation_20250528_1531.log             |  41 ++
 logs/evaluation_20250528_1535.log             |  44 +++
 results/20250528_1530/gpt-4o.json             | 202 ++++++++++
 results/20250528_1530/gpt-4o_metrics.json     |  12 +
 .../20250528_1530/qwen-max-2025-01-25.json    | 202 ++++++++++
 .../qwen-max-2025-01-25_metrics.json          |  12 +
 results/20250528_1531/gpt-4o.json             | 202 ++++++++++
 results/20250528_1531/gpt-4o_metrics.json     |  12 +
 .../20250528_1531/qwen-max-2025-01-25.json    | 202 ++++++++++
 .../qwen-max-2025-01-25_metrics.json          |  12 +
 results/20250528_1531/summary.json            |  60 +++
 results/20250528_1535/gpt-4o.json             | 202 ++++++++++
 results/20250528_1535/gpt-4o_metrics.json     |  12 +
 .../20250528_1535/qwen-max-2025-01-25.json    | 202 ++++++++++
 .../qwen-max-2025-01-25_metrics.json          |  12 +
 results/20250528_1535/summary.csv             |   3 +
 results/20250528_1535/summary.json            |  60 +++
 39 files changed, 2468 insertions(+), 166 deletions(-)
 create mode 100644 eval_framework/config/config.yaml
 create mode 100644 eval_framework/main.py
 create mode 100644 eval_framework/src/__init__.py
 create mode 100644 eval_framework/src/__pycache__/__init__.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/__init__.cpython-312.pyc
 create mode 100644 eval_framework/src/__pycache__/data_loader.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/data_loader.cpython-312.pyc
 create mode 100644 eval_framework/src/__pycache__/evaluator.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/evaluator.cpython-312.pyc
 create mode 100644 eval_framework/src/__pycache__/llm_client.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/llm_client.cpython-312.pyc
 create mode 100644 eval_framework/src/__pycache__/metrics.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/metrics.cpython-312.pyc
 create mode 100644 eval_framework/src/__pycache__/utils.cpython-311.pyc
 create mode 100644 eval_framework/src/__pycache__/utils.cpython-312.pyc
 create mode 100644 eval_framework/src/data_loader.py
 create mode 100644 eval_framework/src/evaluator.py
 create mode 100644 eval_framework/src/llm_client.py
 create mode 100644 eval_framework/src/metrics.py
 create mode 100644 eval_framework/src/utils.py
 create mode 100644 logs/evaluation_20250528_1530.log
 create mode 100644 logs/evaluation_20250528_1531.log
 create mode 100644 logs/evaluation_20250528_1535.log
 create mode 100644 results/20250528_1530/gpt-4o.json
 create mode 100644 results/20250528_1530/gpt-4o_metrics.json
 create mode 100644 results/20250528_1530/qwen-max-2025-01-25.json
 create mode 100644 results/20250528_1530/qwen-max-2025-01-25_metrics.json
 create mode 100644 results/20250528_1531/gpt-4o.json
 create mode 100644 results/20250528_1531/gpt-4o_metrics.json
 create mode 100644 results/20250528_1531/qwen-max-2025-01-25.json
 create mode 100644 results/20250528_1531/qwen-max-2025-01-25_metrics.json
 create mode 100644 results/20250528_1531/summary.json
 create mode 100644 results/20250528_1535/gpt-4o.json
 create mode 100644 results/20250528_1535/gpt-4o_metrics.json
 create mode 100644 results/20250528_1535/qwen-max-2025-01-25.json
 create mode 100644 results/20250528_1535/qwen-max-2025-01-25_metrics.json
 create mode 100644 results/20250528_1535/summary.csv
 create mode 100644 results/20250528_1535/summary.json

diff --git a/eval_framework/config/config.yaml b/eval_framework/config/config.yaml
new file mode 100644
index 0000000..8ef7f91
--- /dev/null
+++ b/eval_framework/config/config.yaml
@@ -0,0 +1,36 @@
+# API配置
+api:
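+  key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"  # SECURITY: credential committed in plain text; rotate it and supply the key from outside version control (e.g. an environment variable)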
+  base_url: "https://vip.apiyi.com/v1"
+  temperature: 0
+  max_retries: 10
+  # 支持多个模型
+  models:
+    - "qwen-max-2025-01-25"
+    - "gpt-4o"
+  # 或者使用单个模型（向后兼容）
+  # model: "qwen-max-2025-01-25"
+
+# 系统提示词
+system_prompt: "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications."
+
+# 评估配置
+evaluation:
+  max_workers: 8
+  input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
+  # 输出配置
+  output:
+    base_dir: "results"
+    auto_timestamp: true
+    filename_template: "{model}.json"
+    summary_filename: "summary.json"
+    # 输出格式选项
+    export_formats:
+      - "json"    # 详细JSON结果
+      - "csv"     # CSV表格
+      - "excel"   # Excel表格（需要openpyxl）
+
+# 日志配置
+logging:
+  level: "INFO"
+  format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
diff --git a/eval_framework/main.py b/eval_framework/main.py
new file mode 100644
index 0000000..6c7bf2b
--- /dev/null
+++ b/eval_framework/main.py
@@ -0,0 +1,164 @@
+import argparse
+import logging
+from pathlib import Path
+from typing import Dict, Any
+
+from src import (
+    DataLoader, LLMClient, Evaluator, 
+    load_config, save_results, save_metrics, save_summary,
+    setup_logging, print_metrics, print_summary,
+    get_models_from_config, generate_output_dir, generate_model_output_path
+)
+
+logger = logging.getLogger(__name__)
+
+def evaluate_single_model(
+    model_name: str, 
+    data: list, 
+    config: Dict[str, Any], 
+    output_dir: str
+) -> Dict[str, Any]:
+    """
+    Evaluate one model, then pass its results and metrics to save_results/save_metrics.
+    
+    Args:
+        model_name: Name of the model to evaluate.
+        data: Evaluation data items handed to the evaluator.
+        config: Configuration dict; the 'api', 'system_prompt' and 'evaluation' sections are read.
+        output_dir: Output directory used to build the per-model output path.
+        
+    Returns:
+        Dict with the keys "metrics", "results" and "output_file".
+    """
+    logger.info(f"Starting evaluation for model: {model_name}")
+    
+    # Initialize the LLM client
+    llm_client = LLMClient(
+        api_key=config['api']['key'],
+        base_url=config['api']['base_url'],
+        model=model_name,
+        temperature=config['api']['temperature'],
+        max_retries=config['api']['max_retries']
+    )
+    
+    # Initialize the evaluator
+    evaluator = Evaluator(
+        llm_client=llm_client,
+        system_prompt=config['system_prompt']
+    )
+    
+    # Run the evaluation
+    max_workers = config['evaluation']['max_workers']
+    metrics, results = evaluator.evaluate(data, max_workers=max_workers)
+    
+    # Build the output file path for this model
+    filename_template = config['evaluation']['output']['filename_template']
+    output_file = generate_model_output_path(output_dir, model_name, filename_template)
+    
+    # Save results and metrics (both calls receive the same output_file path)
+    save_results(results, output_file)
+    save_metrics(metrics, output_file)
+    
+    logger.info(f"Model {model_name} evaluation completed. Results saved to {output_file}")
+    
+    return {
+        "metrics": metrics,
+        "results": results,
+        "output_file": output_file
+    }
+
+def main():
+    """Parse CLI options, evaluate each selected model on the full dataset, and save a summary."""
+    parser = argparse.ArgumentParser(description="材料科学LLM评估框架")
+    parser.add_argument("--config", default="eval_framework/config/config.yaml", help="配置文件路径")
+    parser.add_argument("--input", help="输入数据文件路径（覆盖配置文件）")
+    parser.add_argument("--output-dir", help="输出目录路径（覆盖配置文件）")
+    parser.add_argument("--workers", type=int, help="工作线程数（覆盖配置文件）")
+    parser.add_argument("--models", nargs="+", help="指定要评估的模型列表（覆盖配置文件）")
+    parser.add_argument("--no-timestamp", action="store_true", help="不使用时间戳文件夹")
+    
+    args = parser.parse_args()
+    
+    # Load the configuration
+    config = load_config(args.config)
+    
+    # --no-timestamp overrides the auto_timestamp setting from the config file
+    if args.no_timestamp:
+        config['evaluation']['output']['auto_timestamp'] = False
+    
+    # Configure logging (level defaults to INFO when the config has no logging section)
+    setup_logging(
+        level=config.get('logging', {}).get('level', 'INFO'),
+        format_str=config.get('logging', {}).get('format')
+    )
+    
+    logger.info("Starting multi-model evaluation framework")
+    
+    # CLI values for the input path and worker count take precedence over the config
+    input_file = args.input or config['evaluation']['input_file']
+    if args.workers:
+        config['evaluation']['max_workers'] = args.workers
+    
+    # Resolve the list of models: command line first, config otherwise
+    if args.models:
+        models = args.models
+        logger.info(f"Using models from command line: {models}")
+    else:
+        models = get_models_from_config(config)
+        logger.info(f"Using models from config: {models}")
+    
+    # Resolve the output directory; an explicit --output-dir is created if missing
+    if args.output_dir:
+        output_dir = args.output_dir
+        Path(output_dir).mkdir(parents=True, exist_ok=True)
+    else:
+        output_dir = generate_output_dir(config)
+    
+    logger.info(f"Output directory: {output_dir}")
+    
+    try:
+        # Load the data
+        logger.info(f"Loading data from {input_file}")
+        data = DataLoader.load_and_validate_data(input_file)
+        
+        if not data:
+            logger.error("No valid data found")
+            return
+        
+        logger.info(f"Loaded {len(data)} valid data items")
+        
+        # Per-model outcome, keyed by model name
+        all_results = {}
+        
+        # Evaluate each model in turn; a failure in one model must not abort the others
+        for i, model_name in enumerate(models, 1):
+            logger.info(f"Evaluating model {i}/{len(models)}: {model_name}")
+            
+            try:
+                model_result = evaluate_single_model(model_name, data, config, output_dir)
+                all_results[model_name] = model_result
+                
+                # Print the metrics of the model that just finished
+                print_metrics(model_result["metrics"], model_name)
+                
+            except Exception as e:
+                logger.exception(f"Failed to evaluate model {model_name}: {e}")
+        
+        # Save the cross-model summary only when at least one model succeeded
+        if all_results:
+            summary_filename = config['evaluation']['output']['summary_filename']
+            save_summary(all_results, output_dir, summary_filename)
+            
+            # Print the side-by-side comparison
+            print_summary(all_results)
+            
+            logger.info(f"Summary saved to {Path(output_dir) / summary_filename}")
+        
+        logger.info("Multi-model evaluation completed successfully")
+        
+    except Exception as e:
+        logger.error(f"Evaluation failed: {e}")
+        raise
+
+if __name__ == "__main__":
+    main()
diff --git a/eval_framework/src/__init__.py b/eval_framework/src/__init__.py
new file mode 100644
index 0000000..d534827
--- /dev/null
+++ b/eval_framework/src/__init__.py
@@ -0,0 +1,26 @@
+from .data_loader import DataLoader
+from .llm_client import LLMClient
+from .evaluator import Evaluator
+from .metrics import MetricsCalculator
+from .utils import (
+    load_config, save_results, save_metrics, save_summary,
+    setup_logging, print_metrics, print_summary,
+    get_models_from_config, generate_output_dir, generate_model_output_path
+)
+
+__all__ = [
+    'DataLoader',
+    'LLMClient', 
+    'Evaluator',
+    'MetricsCalculator',
+    'load_config',
+    'save_results',
+    'save_metrics',
+    'save_summary',
+    'setup_logging',
+    'print_metrics',
+    'print_summary',
+    'get_models_from_config',
+    'generate_output_dir',
+    'generate_model_output_path'
+]
diff --git a/eval_framework/src/__pycache__/__init__.cpython-311.pyc b/eval_framework/src/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..430cb87e831b351d040f4800820f084f95f89f82
GIT binary patch
literal 741
zcmb`EF>ll`6vyr4a=9dzQlyF*gjDI4h6SdIfx>pjLKi$>nZ(USJSUN37qRgzx_3cc
z8Tk}9p1d+3wv)Pb!ZvrqEuHE-{q%pf{QmF7Kk__9c4fa_E&pJIe!FD1f~~myWW^P#
zP=x^rOfYsjfIx&K^g4u*h)Cr12*x5NvDXnyL_!jgl2l|Q6H_u3ImyKVIl!n!)%Y5d
zM^*d*%_e`XJDOpWzGaHdJ63UNvt&MBoI%4|<&t-o3^Y@nJ9)g|N;c(i#-P;TH^(Le
zdy$r1d(qTpI<QMlB_A}XVVjBnmrOozrF#lpT{mrQ@?JKrqPx?hnzvjs#c8KiuNCc?
zTFz1#+7{)=A#xZy#10dO#KHYdnK?`?-p>wAR@s59lbgs_Fe!kbrJu@IGYQ`f<_nv6
z+lY5CkyZ^1_Zyj*QU*Zj4Z5|y=<c*wc7hlBvu>3xUcNXl<{!_C1ygT$TP_P{NG~K4
z{A(w_6oV`aN}IM(l%Dk8?k1<At2FS}@`)|xUS{}du?aE8YxI1zA8Yh%wI6HrbhRIA
cbiCS+O%UQ>gB}iPmf3nce2?8Hk8Ioj0I_t$N&o-=

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/__init__.cpython-312.pyc b/eval_framework/src/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58a17be226cba59257eb1e5ca815c26ba457282a
GIT binary patch
literal 736
zcmb7?y^ho{5XbFgv)O#@#~niGDi9KA8*&X)CkhU_U8-wvW0}O4jj)rIKMt)#!&A^-
z$20H<yn&^ysOS(^oOBho+0f(Sm-%~UJmZPK7exx}`t<F^<~ji2hZDoH+OHdCKVLu%
zYRCYG2tsEEEZ`vuy&bX<k5J_85gYRu#omtCgeNHBDN1>UGM=NH7pUM<H03ii<8w6U
z3$)-z=m>%a)Z=T2j(72L@)O#UOK8&9M3GfbYAUQsR;%?nYiXyPcymRVCaQOy#hNP7
zR`Q&%N;AKinvB^*T=m^$+n8LED~bh`nkjiu{BP1hNzFMCJ5xxi^cJ(eY1*za#a6VP
z!b9vq4#D>uYJGi8nZ%c(=R>*UhIUjCMRBjyRx4b$!kqlu^cjO|OVnmL6`{Sd7&$}^
zV~5yb;*dDFe<d=9+@WxoI?NpA7H^mLOjg^W+5X+s{HZ2o%y#A{=PQ_m4#V@sOnkVW
zuWBN#S|<O$xQQ_)jA8s0+}K{g-C4QmIW6^j-6>r@KfNed?=H$UQLkuMZA#aKof_fv
zV=q3GQdA|zZPzM{pKW({lQTOe&FD*UVh?jKLO!+l6+#I2;PD<j+JlFC@Z>fa!Qd7Q
O_?^$P;Bzb<*w{besLLJz

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/data_loader.cpython-311.pyc b/eval_framework/src/__pycache__/data_loader.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07ff9cb200a29f24848e7966a9c8dec83d0b0f1c
GIT binary patch
literal 4253
zcmbsrZERE5^}a8E#l+4BgoKv7j6kqTal1l7S=TmcfGu&;(V$6Woh;+`oR~Uxc<-4I
z%ULN)H8%<+tLT!hpqaJH!hn(=v<qwK*oXG}8Ck-5QY56|Ec}`g{ek*x=iX;O$99k^
z?Y{fYJ@=lk`*qH_f84OahM@fGM*GkyD?<OqokC&r$Rh!e86+Tq3ZVhLr3NVI(;<48
z8DNIl0d|-h;3z~RkveWFC=kta6hc?v*UNx~Mu(8VOdx@sqk#f{o?;cKe&pgF;BKFb
z(O6GVRy4Ld=vOqRGd$+cGm^%V?>zboATvlrcrpXDKn*Y=E6^e*FrrB`i=1c)(2a&<
z5eu*aJHgHY<%&Tau!1aCDFwQ0z;CMHuM+Lw(U!SFc7tGn<ytjcw@>kTB0fQsqR+xq
zRz6x-Is18bGP!!|R`%oh|1M3f-uT_>jZgC<A5J~^)70wqcXc++5(tLG5uY-onIut(
zN@2gT)u77^KLh`x3otr^tVmfyjl*FI^i^gKIIdtWQud;8N-=r}9XAdrPL&?zI4!^)
zmLDU9!d2VQ^GIe#=~K*6G)htED5Bs~j7xz~vT(5qW=Sh>N>y<nPM6yI1PP`%`UG8M
zOUEe`r~A=1q}YqHtw?GvvxxViq6lW9vCI;M&LG8DH|#;-wjt2FW1J~j$@+RNSg&A>
zGwaRw3|#(k<`>K;HH!2-a@nFg;b?4pzTE%ngP#63`c{7PtNUMlsh_-l;0A|*bV`Hr
zexrxw3s+)4U(~Vs=j++8Cra3i&YR@olqu1T{^z$7`TLldS-muGh+s2D=S^Qw7R$sB
z!nN#+C<i0auy8<<BGP_dU(Ef*v)OC2*$bt*j?0noZamv=(H{}Syabtkaw)rX?!ozw
zSFc|;ENYY#<DyF7*?lfbvxs4k35EyzV)gw|zh9K)Kr|E@<3re~1U_cxgNisT^8qO`
z%*UK~YJ51N@Buuz{d{aAw!;g=GB4uE^1(14qxbXRo@``93~MZ&pvGd2nkf_+926yu
z4Tb{|jR^s{ree7oM-*v}vPHRA3123S#fPM^0{Fg0i86Kvyr<v3;vR|&i|**LXjqB5
z_da*T?KyhH{kl)-f>jQ=#go2}Hz4_j#nFiLwp*6`ZmhyfmMZNY8PlpU>c!LbVoI#B
z7+!Xll5o_QTj186M!BsvYwdE)#_6FXGe4--w5Mv?(>3imL~UqWY+Y{Ha(*mX|Lb?o
zzcbC|7#Jf20v=+PZ!jp1t%p5+{CK_LtiASJ?_{sK`6t)pg*}(Yu8si;_4bh7rtF8)
z_QR_EaHg&y(ffGW!RHWdtu2COP~e&LsEzIOKLP+XW$#GaJ5+l|#$L1Rs7pJx6}bs!
zh`hS6S#5k3>f&L&P1!rs_D<E_38FCJ$2kscK`VbBw(3l6ryHH$xR<$X2jmO(Me|Jt
zkR|Fx^R4Fr`J21_wJPTC+gn~UG5;`efWr-edsh;WIZ#9b@@+t7&>)21-&5mMoa#nD
z-!V?d={Pz`NxTBoLP@R(*@ZC#c{ff&XsHNX6y(-^G}R`s5Z<|Bx;sh9HXJ@n$E2se
zgQc<zu3ONPSM*buk>QnV6VroBH&<q_XQ$_~Q}1P`uVnxF-s;@D*=sW^pMPQS8%Sjc
z!y5+?gQP-GWpH92F*@tUE#2zMh4)uJN*J6jR<oRlin0;}FE;y!B0-1}n#mWIN5PRz
zBM=xyl-TCigR%^9gqOq<(VzsOBM=lr5Frj~tRkLLq^+=8jSKmXiJ@3)fw>Kn+IR>W
zZG_T>gGb)P5f?2vv5<MxCxt<^i;=eAzN28mOH9Zbr?>cSz!D{Va9S-mdk~yhB%bWW
zdlGwkjXPJ8w}Y5`767>PCbZ=sr8XUans8(`w&oCHeVTy8fsCVmwp(@ZP?w#XW~x7|
zR+~E)w*!Eha(1PiU8=Jy)9}=EbUK>jkh3LcLH63&`q|)xX0>W3)cHe;dv05oUQu`T
zK*c`QH`%9d=}g(X()KRZ-j%7_JiYgxqb{L5f(ar2ntY0I2qbP0fXh9D;wXt(DoW*x
zV-P|1#gh3_{zd>d>U#?;jkDt%XkBZwAcEpt>H9(26laoabQ+8moD`aRPBGpB@QT6i
zTu}rn&O+{|FEXWeK>Cdm={4`6q7s2o9K~L|Ob6&?OXy_*0O;Jsvh~6H2ttC(6rBgX
zhD+q1lQ+Ly>$L{eKq_4mCocTp{J&loy3y?P2Tyun#J2o6Ozt<2vlvO>yiWnw62wz{
z%(jb{-wuw9fJen_Kh1jyWO@r9i7I>~z{gB_&Uaa)CRnv*6~obCQ379*cHq9rHv(y0
zAkHamgAu6)gIWMClg9W%GD*`?17^07i4Z5$s337*8J`_EDiqyUH<8@x3k%*7uZ*><
z@z?cPUII<>*8qsmI_vH_TJAVnk~>q5XVQ*m5^QE;L&BDA;PnKYL#)+9>G1tSiOz(Q
zan#;*Y`^2!o)mNu@bL{ED&uUr>ukB>Y)Q7JoNZ}mn@Vc7;O;$;^qPD0nsWB0oxLil
zSxfb~s>v#~zV{Dx$$?AFpEUyt_13mU;ihZJrS|})ES|K*qgp%}N26+Kl-ghkeXeTF
z;`N5%ed6_MHm`R$B1A(Nw|l)OqP|d`V)lB4h#wEjicbmphec&5B4~`PND}r$3BQLV
z>|OftvBx5jkbZJi{*X_WbpdHNa9l>jG&u$dKgJ2bi2P3gr;RVecBpK}J+3B4Z{})~
zbvXpheEqzz@WLXqu<vR(?fM}ZD}sj%GSyDJMFett0MF6&Tz#@W>7RcgIrQ1q9D?pb
zH))E%kOE?E7czmtdah~Sk2#ta1c;l=?f%=ArI*v~J!Gs1uxQQ1&xk;AR9_%olFKGl
zL!+7T8x(H8W>Utm7nJRqW*rn24|%3aZs6DNIoX^J+-<`E=a_FTFX>10vh)M!;-`lE
zO91f7qo@q3I!(R|sy<D=4Emn><a*EAlxRp>n`ZZ=tgUHl>uFPt?V+fq99kbfB;(&a
Ifl4O%KU0TCSpWb4

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/data_loader.cpython-312.pyc b/eval_framework/src/__pycache__/data_loader.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5414c43dc9dbb93dcfd81190714b5b1097f72b26
GIT binary patch
literal 3708
zcmbtWTTmO<89sZ{4S@v6T!b%+!ND5a$eH+7>LwK-I0ZtR;OUGt?WnSL1;Rp$cUQq^
zMB{kKRHP&(Z4+=4w{qR_C?yU&xT!pLT9Svp^hIjOm~1>#&lFMoR24Gg8Rw<{*_9T7
z-S(kJqqF}x=R4myXV3rr|Nf=C+=8I|`Nolfb7qA8jvK{77mW2EfiaCZ#1UcS)t-cx
zfHoN>hbS*KM0@EW#>)_hL_9HQEfQ=zdE+dhwY(-0oj@EljyQUjEcBU22^Vu8H+o%^
zLidCuS)n^Z0a>9uk<oyL(rQzov7P{98u18k#7lC7m*Q!T<Qb0Q4ZM+OcvFz92MqoP
zX^tMJXA4ukW<WBf7?@)LjG+Wm##{djo3R90K5BBA6-$R-_V);Wju)c`V4;<_=U3kN
zF!y?L_2Z9oms0<{J-K@0*Q+<K7J7a*dH;8lt8=evFvS!Mh52E>JfIjvUXF^9K;d<%
zTQ)*X!GC=idZ&>Y$@*v<E`=jMpl0znl*4F@kP9|KgT*V<2Gu%F6t7~8<lwz*Tmyk*
zLlHO#&aky`j9gaidxb2ryo)$P9KDM!)5RVF#mPR@gq}naF+vjP6e8d^={`rDLL)>I
zl641ZMxt}ms<tBk#*;Iu?D~2Hx`5<@<G=~+8_1#stc_z-$ypx9YQn6XIZkcf`2}#x
zG3v+E2r+`R{kkmCz3{p%Y@saQdb6wVyS*##{ABsoN7`*_m#8Cj!z1=fZF&pOFJ5yS
zyTP&ghq>Gz$2Y<B#!IR*Y$4GW{l_oI3qF>ZUcHjir{HDu#!LQ?#BZ7239i;F$elto
z!aXO7g4o7thgnX)k-I*Vdv7z>pd>_E@oqc#fWYyE8S3_vw{y2I-v7m=)wwx+MTHP!
zOjHgYIOZZ06CZ&&p-6vk%+VJO1b9gbM#JGzHjIVBu`w$flKCNt4T{1L8?)oBu@OOL
zgLrdoY-}4|hvn3jSsrhe4Mo@(*~Wt8XknO-C^X)nLSsh75ElCTc~PN5k)WVZVHmD_
z<hTl>5-GMVhjP&(4lW9fFG-;}(6d7D5|$8trw^ZT4+ulNJ9;`Ak)!Uz2T!_tPMvhW
z;FsIslml-5Wq;Tg6#YZ|h#;PEOJcx{DSYZt#n$0br2?Biyj>q2iPdj}jMmLVI3`Qi
zp*oN9J1yp_yOrCf29i{&f3fmN9ueg&>E^q&4b!7<jV2vGe`Sh(KtbQKwd!K`M0dv7
z`hhfmXm&JpCcS-OV8NTI``)7U#ii=nME3(D;8cRIzcL|v;~FA?9Om^*^kf?LrJhf<
zW$KSCT8}PSEAQH>@7S8Mwx-RXgY$Lsmh|rQi|J=G_1|8!dI0)$o&lbR5{`QpDm*Nj
zs_{6eYX`vk#KCxW7(b~uVw*kGewg~yvAz8u_31$dY`7@6{!Out0S-0R{R)g})DQ9P
zH^dkbCpyp%S%k*OI2lJT6B03Lk^!NWK|SIXdk`e2I0@P=i5mnOrD)JM#7qWf1z<7a
z*i~$aHUEQAPhyw}NB!tkpwV5wG@{?Xa&u*7E;n^8H~Cs_YBu+$*H*8+n!7%|^5O4w
zyly-iC-7?lS*ke%sB|QDM7{CY;GnI#^4`SC+X)@%q7~EGC@;w&kcu%dAcR1QioqX|
zMnElwVNl<&9IJaFBuOAvR^-n{Ln7!i7~;bq!%l^k`E#<k6HcozVgG499BVFuTkuc|
z3(9O!M_RB#3rn1Iks=PzigCm*Mu6Hyi4C}ID=y(vmyi@jd*XY9BZ~NXlqyhE2$aIB
za`EAJ65COtnMa5Bz%+@2>O5MjK@FWm!nU-nc@0tK9f{|bY>t_ZzuMTl_L}L6w<<D?
z-$?IC*JgHk7VYg{)Naok0hPBPYt@WnCUj|6wrp?eMEcNz`Q|g({asijy%W8e2G63k
zeW|+co~?Qv)<FS16F~!q57l>{Q1f&g4eEIsW3^J6fBO;SDcyJDy03z-dl32oaF5Yr
z4Deu8Y$<VOw!kh{#i>F3jj*L`ARzm?<l>hT8q~A+#srRt(~u9z%T$pS<~&QIl3C>&
z=%U3@vYuPx`b^-VOZxRwK|ga@-!*(CP<_Z{h^Bx|_te7Y(an!a#iWnw#-obCK^uQS
z{a=|?TQoQI=6_1Av4-zNRJUVs1Xq(6>zBdfIsP0Qv+QT3Goj&OaP*kv`vvEPEN5bc
zsLTpMHfGTBoy#OT;na$mk3@%f5nNn+0=EtRVMuA5su{5fdc;axRYB!4C{!RUsVPdV
z#bArN5mg%s5mI#};qN!l4-`#8HELe*M>yXmvBVlnWb}B*abS|Bpn?=sgY1wC8nd>>
z<lfJ0`x5lhw%UYcxt7J*XARNj9%AA|!jq7fY*lw`d$P7YNp8`$Z^`bwV{gpb8<WkS
z+gp~4vOA7*M>>|-)%Cf(J8#Cbzp^5;?PA$P+0@`~tCQZVyE5#P^L4kI(%daqx?!O+
z<LF*A^(@)yGp2g61vb^|s!&WmUj(wP&!<>?z9E5&hOuq+`OZfD;lhZ~=i`I`?v`Y~
z910Ba@_@i8lq8EH)}2V>3ad-I5c;$rgtfaU3xxfWq)iZ80r5UkmsVe_h~=hM*pPk-
z)j0ZsK6;O-%#(FYRkAvd;E{5qxcR5j)cmpcBU#tC)^OiCg$z|HD1S(`9E>CBNCr|*
zB?qqT%p+*dcdX$@0SQ)qKQaKZgK?$;7~!1fATm)4eG8_WPiK#GVYm~+EB}Q-+CK5l
zTo$nc9*Plv7=c?V26+^_=a$1O=6+u8QNNu;H{dk~RL^)6H(PLpWAa1OaqUe%Eq)W4
zH~>j_LY^WB;!9NaC93!v+MPwa@0p#6+B;@v*6f@)_L;f)yx{@eMG($4glm=X-#c5%
Awg3PC

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/evaluator.cpython-311.pyc b/eval_framework/src/__pycache__/evaluator.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2ac9afd4c98d8fcdf945ad0642ef8892e659b09e
GIT binary patch
literal 5717
zcmbVQYitu&7QW+oc>IdvkU$8K7!n>qtpjbzLV(ahcvPZ5fnC7c$~B$|F&#gc8Iuq>
z+pLO;OM{>-EGdP>t+K!_RYBTTO-olxTPk(`jnPP0vqI`>>jZwS5v+vxwdc;*<42&Y
z-RsGj`#AT^z2}_!opW>1<+39vw$D5JHaQXcCvFrARjAw_fy!ybA&wA`-*^*#3$!f)
z8KL}Cg!a=B#?M5oek*~;Nr8>9eirH!M+>%y-EY_HjNphk{Z0Z|5Rdk|iXY-@E)WP^
zhCieD-4?VFan{3#V=q`>1pW#mwH&bnwcGCE=9YF!qc(>nS)+QwAz33|h#u1Dor$=>
zw-XvAAK)T)u}o;#HgA4uoe<`uGBi9d@v;~WN$Y|_C?N#pm>4po8x+)_m-{#1bs84x
zCpZM#CHy33;VF*f>0XMX4pV-Hqj@V&^K7qWW?>RS9K+i<>(u^UhGSulovY#OykqKk
znFNO}!Kw4O%4PrjymC(Xn)5L2ui;#vF1Kb|e=sN{@KzITu+lqM4*&G^)!YY{{`K`p
z?%44=SKpjC_1RD3e=_<fKL73HrFS&DAVdNoU1^$2IwZ+_BoG&4k+>Wp&D|m}B|^A;
ze-n_?C`lwyz#L9mpk>-srp*C#0jihHCUBWDbqiY(@4%qlTPV5Ix!lEfawESpS8fiO
zl^4W)QkU7n6H6QD@`3~8KK*zi^~L1nD-}!?JL~dJ{`uPEweu6lPfU(|b?3@4lNFpz
z#11RgV1f^gRu?h+C=fCosh!j)i5GfHtP<WAtmV<_j=org??~)TMCC-svL!n^Ht*Tl
z@lsG;%SS_f9XuE)&?^Qbe1A-Qy+aa1dY5i9U;L291_I$|SPldR>r7kvihX!Z(n27^
zC|`?a%}-Gy?(Ce!DN1FVZqIu*GjEwXZy9d2DEf0BSdm7B!K4};j&l5f#)!O}5Tg~Q
zwE~xJ+TCD6Y+8;edFc=d?Q*{-%f_gxF{*5ghB3OV%|M$ewbOWQQ%a?~3|TpA+H|XO
zW;kQ0+|817lB$G|o+{U?gq|TbNqz{>lVpL~lFWzb96e-9qWz|Ori-vx{#lg7Jf?eB
zesRbS$Dm4ENo&&1+0W8&Ivt76K$Pidxv|5OC*H~(I|b));?&iNW1rsr=u30dtW5QG
zY|Sq3)G7ER=r%4Joc!$L+}Ec}VVK2iY}Xy3>L?kjn!NV@#P83VoEm!|!Ao*D7S*hw
zzF0WKO8~q4fUMC%a4#=tjDZ&#6O2mzy!eRlhRUtRz`m!!-+hcpr_nx?M6bGD0YDid
zh9JBo39tc?w15q$cP%3|pexHGV$q<b-TOxbC(YtpqLIR0K_le>xp>aRI*|3qzDj_&
z5CX939$Z{HFR&a~qZGiZ@~skLaB!t=s1$jhg1OSuK!%Yrf4A{`xqSTD>!h-v^HM7i
zc&<(}-X@1w4>JZWK5siMPKT<9omp(cq#1|-q=p9C8I6`8_{248e=Ho;$iZ-2#|q84
zkCy`?FU4a~iPu4<-649gj0Q|fa%bV@Y)s|=(cJt%oDa!77cjn{d3s}FBq-yaqD|}x
z5DD-RPsk!h3CRHj+q_V8ooUW;?F>$Tyw>opq?<qhX{Mw4#!<UEtu@7F-P2CIIP%8m
zLDjt=MP+O2PwYAAPFvO5MJYP#X*kh$vN^?Mo0>-l&iZaO`EEA(GEGa=rX?wNo<VcG
z=Qo|%WT^9d>4#;?_FrW-?ov1Ig09T`-73)8d(_!`6n18|(W^8(kDu1(jLJA;yi<0N
zG`;C$;8w%Zn+;2^E*|I9?roWdm(_-sjj>rUIXh$4=!+-c$WEW1ZEDLqm|A-t8N_DS
z&3G4|VU6N~a!_+bf&&4J#=Iz1;35k=IxcPjdITM@W8WPjW#}%JC!mw=&79>NS&2GC
z1!l<@cMBy+RC03UCsla13=L5-UQwZxP$-Sh*OZd9d_q^sT!xO?Q}z_<wQ$T~R(6-B
zRDZo1JBR2ee7S5rWzwp?+k;*`I>gvel1bZ2T%}qy<|oOto~)KQj9SuKxgP>0sWf)+
zDTSNDIkL7D$&Dq(HYBbZVk<0Zrl}<^$nsbmA<kL;|1a->_%g))h8=m+c=r?iXn@><
z`UxA1;9q4FAyCxOik2dY>bDqt1lofL`2BC(phcW6OeviGF#5z%y`Q(ye$yY)wNoZR
zN3zP7hinzvE600tI_IqZ4#ZEg*8&LdO8gDj%n(z^NR2r7;ag_ES(zFK@!2&)UD*^2
zc4a3TSk4aT#xCZ*_`^?sIXn5)Um@p(;AslR>}KP!F|n|=ks}l7Bf0lR?p(PrdF`#L
z5IHz=ha42;aCDy+XLyj#LA3Pthvh!+AnPrF%V6y`BQRk$9Gj$ei$=>aIVb=W@lvRr
z5T609R0K~95_<6529;Bm(<qF?=e%P0e49)||J3p}wcj)eq$R`!fUwJEa+4(9w~Vyz
zK;k@@35#H)#NJ3)7CUg07F*$^5n(+fX)Hg$hZ1n5(8H93ffH0VJY0;x9Nfu_2)v*a
zOA1Qh%c8Hzw;3czYk@q3;DX53x)S+EW6LNp`dpqw4RcO!eQ#^JM{QenRUUsiQ@=*7
zUvu4)CoA}MEEh2f4bE6EiZRh!2yR6UcX_u|XcOOJbZz4F0a9H^JW44kPCT|4Az0T7
zp%DWT)}q(gE4xYvM)q>S?!mU7-D!dA6__buBpXKQrRiN`UBgIUFI-K@*ZD((a|=SS
zr(gvNJOp#ElnCM6r8gl6huS?FBgFRY<3&i~qY&`1Ge9oHM>TsW77Zmt5w5zLwKoAd
zJtSpyJNraF$Zd<og!KiZX*NEZi11=i<~3(f3YZ_lHvuUj$dI(f<9w9Um_!_o6c2aE
zfe;_pvpt%}I)$rFOq4~83Yr_wPsn_ra3?AzFH2!i_g&3uh=mL>zBedpZewi$IR=S|
zegQE$#En=S!E4rnByk0F=o!mv;9f>~-pE+kqBoV_iHdz)c-nJd*N51hrlV;MDQk|c
zIqo>>Q0muT#l(0r>;{$Hps*WqF3&k~wC4lcS(~z`OL48txK^sJm3dTaTR1M?_RLZ}
zt+za#H$9!#S1O*)jAyIr*_uag*Ft5-uk%!e1D0Gj*ZS0@ogb|kTa%}O;~w_w2bi32
z|J#XL+STSIx0+YqY+gMc%QSCMo44eV)w571CpP3QOvA$L+=UnCd_3o)ma&#mCfnF@
zzUE9#`e3Hfr#AYOMqjpZ_Ibw{NBV^iYR=Y-!tdejS#5b_X#gQba=Wqlyz`7xnZI`Y
z01$XGjqB9LbxPy9DbukCUChIW5A!rgz;6d$lLvOHZ&xSx@WvltQtEqL4Nrd@7YSO`
zRjZSGSji7CDfK<Brb~Ma*8*g#J?=c}RAxR6Ps*8LeJblySRZ7C(`Kl3?YHVy+^kzM
zzA#g_R;^oG<_f>au*+3;xxy~bx~Gp?6*s;kORs_Vd_&A#?;^jM*WJ@ee>=B(otOEp
zg@F2ZUVG01+jonf>6vT0F`I^-8*^#sxv_x6^~Fw@b)%E&>9XDEB5{2+f$QD4{=JJ@
z@3DSgPXOHlu)YMo1OK0a+kF(S{(*pI4+J7HE+OE$GY~kC2nvM}YaqbILV<vYlO>IW
zYlMgqOQbN-(^fJZHK1C=fG*<nPosN<SWp&s;tmX61~Oul6Y)Q~bb@mw>F+>>&0m)C
zDU>fy+3C*oiadhX_zwM5O&XhNd%8Xyy7X+iZ>%|wpgB(Juj|zH9Y3^Q-+iM-Z&s7%
ziAA(GkARHsOt+<_OY71DV?K4xQoZjw$bg|A*285$+2vPW2`amS>h`?~*QbOdO7wst
z9aN$R)o{Pc4Jg|O)Xj(V2~Q-(zVVg;JgowNCxim6u7sAp0$7S`fk$KHLvdg*6lrQ$
zSE&xR#y@zlCBz+==>HJ1!5wb14$I;%apxLLbS+nE{t>&%uyMEe8nEC<B7F-aPZ9)?
zMK#0vFN<a>#V3o}mH+l+QG-(X{D|f%l}{EeP%57vZB3~es;y~sdB)bJ+S-PhJhh7;
Rz%q}g_w@b;PZ-s;{2y|Tj6MJW

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/evaluator.cpython-312.pyc b/eval_framework/src/__pycache__/evaluator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5e2a0e8e4c1975dc2ecd23a5c811eb156211658
GIT binary patch
literal 4719
zcmai1eQZ<L6~E8#!|&Nn9OsJ=NMcAra5oYtO$sRuEs&H-Kq;WnxUDS5?<H~i{K0$A
z<U{t#G}O9kP(nLK5HL*Jt=UqQXlvE9bXw?_I!%-8B~`N5R@*d^z+W>B5^MI)&VA2*
z4xv@A<@<5&x#ymH&pp3$-e2<aEC>qsZd3PO8$$oagKVKPoy9nG&LST1goxaFOSnxi
zHi=|_a#I1?O$Qh^6EM5Y1fC~FHo&@B=u<o`asi9mqV*Zk8nC%-1TrB3^^|8f%C$oN
z7@?2cb`yFG@#d3=XU9w#T0uYI;%?%>0T-oG`~0$^QhWS9MI|2%_N(-fNLUnHgi0yL
z_`ppp2?pH0efxKdej%v9z_DLYB)?DI?G=3y(W`_cpDtUk;R>{}_zE;<K}k2kBRDGI
zCV7)U@uWa^Qap8%ax*+Fm<3v3J58&BmuCcyH!r7jGCT`>7Cw*X1ncq{jo+g2+cbKf
zF5_M<oeq+$Cuw(nzs;4ea;?X`VgxH5seyfOTse9D!|}OuqyPDEaPHL5jq#t)pLyr{
z#P4+a{JTF}7=2l_h+@Fw)2=|xllx^w2zbI$C=gbBq;_1r!ncgai!D%{MNuM(Jo*gO
zQ4@?Zwn0dtvlC;P4$K;|z)Ggb%P?VaX3AXr;@q1r&kg?4(AStUI*&?Sa<ehQ3v+5W
zJHd!^Z@)Gle{W&z%3Vx%ou}Ek@Q14lS1-*E4KKX$!Hp}Y3|4qBQXZ%z<>LxJdbKd1
zzXgxacnmI5rDQ?u%(2S&RG`b!!wub`fY1=>hy;~LL*wQn4g1=UH0<{(yM&;xyFmc6
zcseC-K<Eug&o#)BPa|oTst@<8tjFUI`W267pwO^jeU^rGk}IJ)fzkzNO>LYSw9i)D
z7pJD#@~_r!om$@*r@mmT7J&n)lppM;(*7VX^r?&_C=n@mmmTrpG&{C|y|5j50zbJJ
z2|fL>lG>Ou#*{XuVN7d126_zc^%xe1L8fFa9y9ai9>eC@+2wB|N=K=z@9sfE>=5T8
zyO7WHEEwV$cx^E@#_?>FJ&$-U%AH3Ssne!GJB55?jEk{jnZ09{C<z?qs0C_ElmR|E
zW{sjACbws{*TZFdxi*S<HI&QIa4)M_%m#1UxSOasYU8aJ=)<5Vk<p|VxonYlV2SaX
z=H58DF#N*YsWaee^Jm8APrZHf^83cD(Yg0YW7{=5?_GkQyJpd4lMC;>HuvEfLl|Z;
z28T4WukaULtA(qt&j0Fy!Kt#xB7&^=LqXN->kj#Sf(-a5^eHMWdOHMBWppS|8E;VT
z6{ID?xkPLnFu9xqrh2D~mP+B^5`GL425bpGQyo!seix(CGGJ<0rFugCph^z-!<zf3
zwk|>ONP-*=1!Y0=J6E3Mz%oj3m6N*$57*+V0xH!m^o0eVBJdu42bB~%M3q!PWk+X7
z3V0PfmpzLm0>Z(!!VyJU2g_yb<@m0QA{An(HaI4nUG`uT43y@aa5){HQ2qicaKutn
zTsC4!6;;RCS$om&<1alway(_<5T|Afiig`z+Y{zg!Nxc}>nIuSK3y4SX3Hx_`p(tQ
zl-H-q>!->$$L(nbRX8v0J-7E0`SZr)p`T1Wc66rpsrR={Y@H-WiP7rOlF^RwqH+3f
z8=9tSpPE|Ro@7_gmN|z95{_i`1IhJ`<A)}g@xElqBXJAJDlI?jIqjJ#*^(;RGJf9#
zKhttB)pBsE<Vk&Y4J@5qwPxh;)6dVA*3OpK+_EwSmNa5G%eRYpXhS<8V~@GU7VfIB
zs1)y2ys9<e?ek#35+wO946y>Ih9UPueHL{AdMH5B%XfJ_Pee`NpQelCSjKOOWl=&g
zXV*rF>^x7tWuQYNict!i<@qje?ZIYUQKF`|=&Ur5Kr8kvLv9zq9cIYvBmEe6sx!F5
z#LQ8;2S3FX1&{(imK6msff>snU6gs%G+0Yv>;fr2pfi5wD@xSBwLR$BM!-OhLs710
z)e1Hk8aNC1*Mt3UMTuInXD3jU>cLL5qVP<PWAH%=aga;lUl#cwCtERwov;tBC_Hx(
zy|sKd)U$pC3k-W|@1;eJ*ATP)!27Z0bdB1gWT(kMU~`P=Mb${b2%(i4ByEjZm&vQj
z&g5E%7gmTAre#up@Q&KyK4R9NS_l8Cx4lF!>Lc4xFHw!QAerceh|rD*_zhnC9{OdS
z4)Nm8po=AoL78++j}+%$Fz80--bjJ{(+CF3mSB;~ycSq4oS1v#&AIn}ef^IY7C!hB
zBzzEF48fS)7<^~c$mlls;(X%8xmN~nT=~ty)fZL-k%85R6|bcDgI!LX=s;Ej@x<Bd
zSGt`8tTO{30|f_lh{bFeWo4I1rInE46#;Gp+2<mpO>ip`cJzT&t&$XyoEh}ZE@^fi
z@MZ8rvn9?+i{QVAGjqsg;2Q+H!eQhx9nd1kZ*huG^vQUUQ$zYCc8BN<bnxDmftno3
zXVezzcZea#_v9An=v>dhN*U(|Cs1Nb;^^h(6G$qCA;=}46Z!|%WrW}p`ygo+1qeS*
zIpV|7q%$Il{jMUF5kp;Ff&|G~5JD72JBV6BP__6%L0?3Y;Oj#*cSayrgYa8;q+1fa
z{J~I2Z0!?#5lH4$P6$Q<g5*^Mm60PI0ly;s2urbfWsk8dPWR-9s6ea^hlL=oGLbO6
z69K*o`g}rI3-Yvp?-kP6kfcZjScDy~k0^pClkaAu@Q-0qGl*)|#X@2b?(j+~D`d4I
z{93G5?Yin7B?OUH%R6;~v=eVY2&y?F`}?9Vo4x<#>2H?S3_|1xSWt_)MI~|bbvA#<
z`jRzS+AwY%jZN%L7Pn5bZP)W07s!!4=egv@=IOkL(x`x|n^3-TteJ6CrySKYj;55O
zY4V|@qiM?VL>k%i>XL__zRjYVO`}I%-+6~7s|){CRy9J5Jdh?)NyXVGPCt>@GgH%;
zs%acoCZ3!s-Z|+=74J%uz&LEbg`l}@LshPs%FU_D%`=q`rz#(w2u)QU_y(CBb;E6G
z6H`(*yRPodipv$RSB)^UWmT8*&*dkMPnXrtmaV;HJ!efkdNKbN2i$iYpw1l<G)`|o
zT)~j-C0lCsrnqgIt%uNFv}&f%l`3@26mCxyZl9=|D%_R3gyILM*=@7-(h+mgUX^64
zzFph|n*H~7DOy|m4KkI~eN|R@$#%|`tZSJ#HsMRw?wT&!ZS0RV`d^v`wxv63KnLw!
zmUK<<1un_>Bk%X?+VaS2MfNs=zP5hjo*kBtEVa=8X!HH8Cg$VyCg^`$%L30QI~eGH
zYT{a3xKFLF*6rM<573yhoyL?F68fJJbeo0yj4=cMXL)p6sri75xQRa*U3UB-6ojuN
zk4Lq5Jb@4&5pmz<@f?eI#mtP^<KaU-k4M7kyh=iPDPa(hC|qGVD(MgE4kF>WEaAjP
zr8~utSCI~2!u`0yUaUcagwaE*a9$(-0;-ef?^J!7vd~S5?P&zf#NkEUq{}L4OQJa8
z8{L}dzFe6`Fq|N@W|Epb{CV}{Q`hnrG4<9X#75dVawJickVkhX`YzX}Dz>B%&?bQ&
zn$KGy=Sv=SC!g^qk9t#wI+A>M(jQ0$k0s^f$>8ynzc<DAB@gwb_VwS!3v`9Ct}cga
z#)ty2s21iRq&|Zn(stlc8KplA4EhZiSEJp9hCup9EWSc%Kd!WoUJjD-eHvCt2Qje;
zSK9U5ePw{6x^Y^hr-22-tGo@WG)WM|mni>BwE7>&HHBPXqmnta?rXH+pImu-)eKjj
Y;>t(1O>s3Rm^;)_f^dF=aIKx~|FSZv;Q#;t

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/llm_client.cpython-311.pyc b/eval_framework/src/__pycache__/llm_client.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddffac6aaffd28ace520018325f36fb672712ed8
GIT binary patch
literal 3002
zcmZt|{cjY<b@uk-_Rc>0g3bBt19B!La0PRz8c6G;7$+rBB8P;;QN^`owYeSJXZAia
zyH}gjxwR^qS`EZ5A+;kQn3Mn!sJJR>kg3|F${%n>t9G?gq)6rb@k@Iti4^fu-|XJ4
zeTKQ&_c8Cyn>TOXyt&JPfFD7-b7Sw&EiXd<rcAAIRpRat5YtFO3ZtTggP8;ayhrt<
z*#yhbF;?Z$e1d0?2VpcU)Qn*7B?h6Z@Hs6ZdeHMo;f^DPzvKZ4d{v20VO~QqVTG~>
zV~i!foYPa;jI07Jyqv?C-WQTpT^lu2nY+&cF%7FypAsI0Nw7YoG%FnD6dpbSJ}>4K
z5eox;#dn-fcojd4HNe-1xdA~59On|E(gdpqEni>XOV6q)oYC{0U>CUZA3t22_+;to
z_5c3$#PW^v%QyaT=jJ=Tue@+){^FhaQ-p3{g+|fx%DI$u6d$w1H)IV<d7@f;I;&vS
zYS3{yhl#A`3AP&2@~A|xPEuG)x`6@%G~g_JcYg|C8tHD(5Gpc7&o!n#U_~Uk8bua3
zH<U#VIBLmmMIN}?VZx!oHrf3DK;(~BTY2JK`SRQ46Te@(TUY85y<|}9b~!q?y5a69
zgrj`@t(8+B)ro6iaOCIbmoALI{%T*HiXETs=;D<N<qzI2Pn}%4JXa@Q6FAVi`Ehyv
zQhD-Mi|^HIt%)dTogFV<yu9+8lgroNTfBB*t!~UiMA$g-0XP`r35|*+B<<rS^b842
zD$)obCRm(?)q&Ls>l8G4bWQ;8?TZg((>R`gBM)g4-}88Xyzg*-{3Tg`24|8(ar|>x
zl?I5M#v@sBG_H|kTvgMOv+nL3B#b1bGAUh>3LD(?+g%%@E@^uJjH7SDsBQD?kLN~9
zq5TV?{YGg21b3pr+}HugZ$=)Q;0!Tb!zD2c{Vq&ImOrmy0&C>*x)solX%JQ^N3!Xh
zZV8Zzd6G%G`2zYhK6WPl2Qmz<J~PJB3>&7l>~_?K<Z~Z_Q8x>R-3%Odt<AcbSDk7j
zn8nw4#Z&Ys>{*Y(6&Z>LJs@@O#WnsC*t+W48{>-HFqA5%DNK8fM8ouG?KVvPx7!$B
z<X=ONAicFVg3vZZ4vz^%RuKvTgvMAsv?`mWM}J>ayhZj5QpB|p0Nz&=&Y*KbeSu@p
z10&U@M^?=aH`QpxUt|m5_j((Lk<u_G{6ffx4jy4f&?tKtjW7?gKKFUu9-3>`*J7n{
zo%exr?D|#h64;#oC(Ls%;qp75lqc=N@b1a-{AqXGCGM{ZtGU)4T{?XQis$m*-Yx&S
zzBbe)A?W@2*3zx>i<47J7w4C+pLDI$3+6Jf+L^kp2GN>jy9VXCeDP1qH$GaN{>$>;
zPnXYpuoV-MC7oMc19q{t1ZVA-$Kps<#g;dj&FE0@EslB~W9^G>iD|59@*vjiny^BX
zywDDHE}E28H98=tRIEgI^y;)i>Ct;kyuik~qbu|Z94u^p&XMcc=pfdk1Z%l$M#E8g
zK*uCn@I_^3o>mYXA7eqg>JXY47MC27b*mu>_fy664b-4W5iG-8w@884L)lakYnHd_
zD4|!u@*NmWV*BB=1T{N2h>7JLkx3?%8MHV(mBtpYsTk)h-d;H7B{W?~3kCF&Sl;T>
zWQjGeEkVv`BVdTZ8ohZ=(TkEGRd0m-!E5I1uMk=-Uu(xVfu^<>LO6~pA=J`3;We8!
z&TvL}$L*#arKTMfPcYDBMjk%<lo8o!M4tRtWZ&(`KC@%X+3$SW8JkO&!ass;c5a`0
zu@vrsUg6m-p^At)A2p(nR|05L&sWI1X~WkPOo^2s>ey<u$F6i5?YoTjCye%|rv$TY
z^Hkv5a?7R);sRZ(zznv{eD`;aQ{FFvq3NbmO{Ir_cJoO9&_C@r4!rJ=rNmDhVj+0M
z2p%Z~kC-hTGyTTquG=kLrIxNQTEnMboOyHh`S*(pt-FlYU8UAt-_}~i2P*<oiJ<1t
znr(mc81gkwHvYD;wDFa>eq+x8z)oKf4;kX2l6c5$Zkrg@=*dp@KGfTZKI`;9%dwyB
z>}`Ighy6zn56nS$)o9icdNwg1eM~a&)JT%$mn3*!^D3nqCF#w)tX3snNm8;&7-lt{
zSp0yRg@j{M8J#>tE$s&IF(Ml%O;4H7a%=-yW3{&cjJrRR+gajvRs{6Jo(kK{Z?7N#
zv#gD-Xg0d}JsX)@9<epdy+2shv?UJM8s)>YFar>s0q9c)+4O^e<0ADg=C`~w-yzS*
zcHk{RKbE5meK{;43;!SGl=I;GAT|5!t3l!*B#%;H?{F`r++E)jpLW8xk30neS^>0Q
z0;sSI!<eXP-2R!UVch<iC{n8LCTcI$_b+`N6K#gCW9It{zHNqY+qh8So?yZibU*lt
Mj(z<Af$iY`0fnp=X#fBK

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/llm_client.cpython-312.pyc b/eval_framework/src/__pycache__/llm_client.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f212b72e15f4236ffcc82e3b3eb306d65f3d355
GIT binary patch
literal 2900
zcmaJ@eN0=|6~E8#!#0?Y1_Cw?iJ@uBOG=$CB-u!olonbwqO{FOC%mmsuYMP>^Ze2K
zo<~4tLQ}Rel{BPD2T__r*0f1Qv@k7N5t+8A+JE~8cWm}vTdC4wKK_~)A}z{4+qutQ
zK)PPZ_uO;Ox#ygF-tV5{zm%1gAQ*RU?(e^4Md+WD$qz2KSW{pzffS@LD)MD1<70rg
zsFo1xV;Q=~s$7Wo@eHycjDn85ks`&`879m5Y!>taQn*t{;b$zlJ^LUd3agaq5*fqx
za#RmQ!m<i$;pHd}x4r1kDrV=>9JAIAiwW>Z<MCM(#>d)`Ql?lir?Bwj;KyTL5wOr(
zqF7JyKC5DbJv;nLFxM+6rKdQbZO|c>8}`o5m!4AtIIPDSz$$R*-~4)M^poY88~^&l
z*vieRm75>jnR~nKl^5^KU%4~?j$xIf0qF!DG;D9k8kS;2HTY0O!KzWJ<4_b6S&tEH
zl!oM!62UqNV9hV&0;hx~n1$ckF_<Qho=XoxaVBoL&g4@QM^aJ30+&k}=yAYN3+9OP
zz~w(08nlRP>@EmOJh_n`y_CNCR(kAL>u*+&78Y%!Pirl3^la{$tsb~~`o_;!-}!KZ
zIDh%9d}@CA^2qUHof}llth9QTu3b*Q|5keZ^zzl&4f1vA0<F1^)AJvt&-`@h-Hlr7
z@(5b*jij$!UH!%Bl^gFaUBA3uSF{koG9osZx<sDPoEU=N{ItYQB^308Vn`GWPQz+%
z@xUepjef9f0v{aq_D4e48+#)L5b^HY)9vl->Gr-P>(An_zu$|0EUQv4kwbVOLQZ%!
z;`gd*NXq(sDhe=>q(C^JOVZGm0@R+$@6l*95vCDz&xva5r@udYa<S%_QEsd>wY3gN
zNy`2BD7R>Hn)Gir=Nc#@qa>zb0(PS@-6+!rHAtltC6Q26Hv|A(jD-CKgn&N1zKOX`
zKoLA@W|)lwrh?Sgrf~u^h;Bz9Eg-I_5-cEX-QIe1=eC2?<E8-?&9bv`Td~9~3j3Z#
z;o=OXeHPIBnnH4&pUK^5n2U44@{Lv~r!X#<gOv|Uf?FQm8|LHu_mEzbmjrj@S4FBg
zzCK46q1aqvXL9e8L4)OwBE33)ZYLtI4-0Wt5r)bT8fMq|paE_giZ#xHJ=_2FWRD9M
z(Iuhi?Hj+GQ+MYb1h?f^MXHp<*&%o#MQc4sDIFGmB7|*OI=~E|lWY$fV7}3hWt<tS
z8k-g?Vt)tgg=f9;_9y8xW~n)MIz4~BuwPg_TxW_%ZuKmmzXqju<&WpmKigPsHb5No
zes^p6*3{CO@#QP?D>qITtkWl3V2+vZU{h72RnQC#ihKIXZ&z-9xHR$ml|P+NUwpq2
z6A~c}jRgaQJ^(|=`W7t)M<OaVto}$?hjw9bG;)zOONU_#VNH|!ux3`7RSJzE7gSD<
zUshF5uN+XZ;%RErY0c9;4;b$dE4F%8sfVti`scH9J>u!Zx`$vb8VPIIBlqf<c!umA
zIeShU60$Lhm1Z3yfG=cl{(f0EO8xNqRZRP*24#<68O|2U9z@+A3HY&QSaX3ACp~NL
zIO)e`(=`M&($|NHVI7c3I1uhLI6V-;2Cu0YM-AR|ELsVDc!a(uQbi3|b1lxW<)bzP
zIjjwUAqH!-AZN>{hg48?2iy<3V)lMRHN}FsexKr2YlIYzpiB*_svfnb%C}B(NoP~S
z+4Na?Q^rzR)|_(hm^ygjVA8!i;ohBeA4s?lB;AJ-?!&3N?Nhrh?E0!foDC(M?Wu+*
zW{)PEM>0ISy(VKr4UZ)~dlH^KnKHEP$QrV4+cIv;RHC}Zq)SY=#A^)+*ON(COTyKX
zbR9~#4vh<`+WPUbdzHvuamMjW$3kt(Y<F^BM`B;cLd)~F0}ESUS+u>Hs;IqxuL4!q
z-$fk6R#`jw^jXKa^^3}yiSo1M3k|Jv2j;}vPv7oNb{tQ19A9XEZQ-?^1>X-AdVaXD
z<422?ucxZ&CcBgM&58Qv&#IcgsCG^qJ$rQW&FL2|#22fd{O|pY9qc~HxKTw-J`;=O
zP4_igQO>n}>)BoC(_QxVN7zsIwpF(~*grdXV7j0uQiQG2x+~ggD+)uSk|d)<lAt@s
zRJwLZ(wi|^%}K11q(uC%&1yO^_+B*v;9>(|ozTKcnqm4Fkw@s7eo;b;zd6yqt98M2
z3jKrIoe|J?kNlgh;GdXgGYH5v%_MVsO_JLFXod<?`@geBX{OP`JEuVf2vq@cd+$9;
zvaYBXvBa=ai~=^v<G>q&J{YA8?F)twf&YtgAlvrs0H;n9&cq8svXf3`r2FZr5d8z&
zp==rslY=0juU4Zs%&-i@e2&ULN2OmP_m{}^mA!7XHfgU**y|?0{a5?W5#c{v3*)?t
IU^31958d<LjsO4v

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/metrics.cpython-311.pyc b/eval_framework/src/__pycache__/metrics.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ba013348cd92b92b28e7b1b144aaa38687786be
GIT binary patch
literal 5731
zcmb^#TWlN0agXoEiKI<ilvLSb{76hi54)9}*ix0aabXux6d;O`n3h9acqfVW@sYiw
z6A2U`6bgKDl}0v{N=e<6P63;NWmiZMKyB01=vO}aK^g~`IKY5`nzHj%25x{LU!A=p
zj}J+4+jMyx?(ELY&d$!w%<^hOg9AY`etKX`av}5&f|Q%7)Ok1!odv`qmf}%R`=x>g
zzzw`HY6_a7=Ab!h30k7opp_!?M&1^+2klWu&=GY8o$46H(?QyRIQn!$c>&IKg+k~W
ze6<#I8_-e2nx+wJzG8qG_?BioEH#7zmM;l97%(ZOfrua~rX!KCs2C5&#ueL(2{96n
zF?_(NxD%KQM+87aLO6~&g$A7A`4VOu*{AiwCF$yqwS+wU8*~;Bhk_J~V3$<T$eCCJ
zXJ(C@g)=wnJR>I1)N}>#Yx-Ew3bf{0I^eU_@mbdJ**OPm1v)2Z=I9YaWtB<~<dE5h
z>RHgiwnmnVwS%o3%BB}M5l6zpa||C&@{AbA$*rL2gWpfz`)ql2_S|av+=CmJ@84J`
zyq{4Tn3(V;heHCe#6}eh=ENk9h4r%mM&b`f_&z)jU;(9&0M;2oVinD4z0Vp_sP+se
zsT75uNKtA1R6&g1SYxd(q{&KaTCKH*z$0O&rsQoPbofi{-k*P2`1ObPug@2*FWg_c
za&P66dn?yh-<>PG`zsh+y)5aJdgm}66%OebkyB%hL;k|-+l9Gn8vFeZrgh0RJUa4{
zYP&j#B@I59)};LNv)RIxpRay+Y4wxKtCwb0XFe}T%MboK52_EwW1KE)aH3`S<l(`g
zSAP1^@Tomr?au^;Pj;8_FNwL~Z7_h3TLg~5;W5QLisQ+IVkQTf2pBL8B#Jr8h~Y7T
zSl;jdwzqpM9_6}|uP0+-vU~sDm%9f}yxjc)BmRVog~z(NvkV^^!Az8UGmg)63pm^z
z)jX*yF|N3{w?xc@#gJyA3I7@|>#C(73J3!LrckjFdA1c1WpB!ReRDkp&*pi1w&kgn
zZ8^_T*>hAf=bauh-JbV0E%aXOU3}w0|9t;#?-RGYPvpD-*&E2xfxO!*i4Q>@QY@Md
z>wD=jpdem4Wf(=^DTZn2rwQAdR_}X9P`#glkG&6U`l&p)`n%N+=d_c&xALcgwD90^
zMxWI?8~N2CzsCB%_)&?ocE!OpCh&r~4-WuXK%=nnQ;p>HO9Q^P)o%8JVb(W<K1Klw
z6Bz-6V(8kdP)x~rFbN^wI{J8szu|Z^(LeFSwGNPAu)viZq^pmQL+};)p;PAR02zWG
zz!b{5_iN80Ge<30FIq3y!6(Xt*hf~QIsz0pbb^nFpj8uLCXVk{by4W*)h4!ow>E8r
zxeI{mjPz!yKkwZt{VcC8m_Bf|@1wqDV{T`Uyt4<C$hr5+Fk{=V>Y<oeMr6X(;fwgT
z8WM{pgwDCod5E<q!Qm#2De4G1B~6+(a=Mg>r7FR_Hu#<Qd<R!ma;W2SrwnO50E1?F
z18LP->*F(?S_!ong;K`<Ql7h3LM=w2*%`2mSuCeo&?99?8Eex7SncIu(DuCNynDuj
zpPq5!1EO758P)4tGgr6%8FkmVF=bwVAN4OjX!y*Aw5Bz?IW9UYdY!hE`zdoR<)o!r
z$J)1T(yIDNJNO9{4i$f3EtUSHEoB3%_$KWsYr4vI<@Lad_SzL;A5F7Qikm7FB6*if
zEoH5xuEi#;a7GUJIv;sX&zH|h@{CZdb=sD))`!kXT5`jjZQZ$slw<vKNzt`BfhRgQ
zplqn8oE@k+6}7x28&5fEDJLBp&^hbqoGFV)!ceKD94RMjXH83vEAY-=gRj=a%7MXJ
z8_@iqOilFJfTn&e=YO!)dNlQGtwY1om#mX6v9+RA+Fj{adh8l2r(95Yxssm)jh^ka
zVzRot)Ti{$+KfnIs$O7S`fcF`Dczt-D_orY@}ue1Pk*7IP<5@ox4in!4P6u|({yCr
zlB+N`egEn^)e32~TjA{$ol#9Eii78)rF?~n1uzL5iZwhIkAyivG4ssp9KU{!-365I
zQKc+PlPo)9NKm8bEC^vyjAALss11-&4He0loK3)piQ)0%WGS17lWN8kNILQRr_|hr
zw*xY<e{D)5nKvnNp=hZtTPMe!1lEU70{8}!Y|9Dlxp_Qo&g{u_Ex(@iAB1Od-{QWN
zH}q*W&3(UZ;ytkBI{Skw8A~kuf7{<MS*M)<^1?uc8wuk$R4Z2IEQgs<PHC9na6H6D
z&Z;*-QUVhKt72oq;UrYB<JI>>m0Q0jY8bgEYQ3rpr7ERDNMtH1A?ZzQD;<@;bU>_M
zNQS|KgpnlAj|V)sk?3k+5(zHG;>QSLoQNb8qri!ZA(l`~QI3fzMuugfppRpbV-*uA
zn-nvHF*A-y_{TN^NJ>;Je0+42!-^>q8;J)tDaJ4_;2mU!E;|%l>IA5fO6FHFQ$*yV
ziVLpBa4!}^;W*p_1iX*1Imk6x^>zX8Cd;VaNsotAanU3%MijtQaT?}SXU>A4I98nM
zL<zB$yc3gh4+j7UBqNdcwyHh*tX9Uk>wJ3Ru}8Z-5h=r~`VIIv+%1|~Bs%YFmgu4d
zZEY?hv%OCR(viHc<+d+y%NNM(%DlDImGd2xeFvpu)x#;dy)WncvF!V?bnGtObenFw
zMYpAmxBYu>`S&h2<@|?a|Dhb+E7QGMx;Otw3LpYCT3RL7ogKT={PI9<$6<NL;UY5G
z11iYT+ofZ3V|ll)h>Q(^#bfHvb#La?kGCyt`^(Oiu|IZx)_E&%G#faY_iVrI*?r5i
zJI&-gdt}d^toqztf4Jb?I{(ySf6m)3d)p=3*A~>+eBt=~amkV2(gds6A6EgGrLkr4
z;CwtiDmQjXNAt~oxO_FhB!D?{-qW(+y69R=<~+M)&+e@H6pb)P1h)R>NV+B0+99`g
z<hFLoTRSDkU1ECG{)PG-caj}<mUmo^Mi6(tI1M1zazbu7Aw8epa6-4XGe>vGbVru%
zsGLA$42BE7=J~hMbk5f)`#L51PAyurq2|Z#v~^`tE5q5LVD|KxY#X0z<K;H~PV1gb
z>vHOgSF$XZZ5_$Aj>xSecbfd^)=cW=E7{>w*{0WWO|QvKuYF(sqTA+m6cGT)`t=rM
zcfadA@62xbQO?#Q+j_FLp1iwJa;af$FyK~fp->F&;-Qe@2!*0?Hp!Dd9SXgXWcbpI
zH56jwVKOX;j2H>Sn>Q9`A*_fPle$!mJD8*vE$BGIJR=Ai6(%JNHUm)1BYd0@@gNx>
zDN5Br%S5Eo#RQPJBm5n}l>W(^+Onp$yXH+rgU7rj9WEmH$&4)vOHs930bd!BW%Kl~
zN>|(k{i1<3A6R@f9lms`h@hJhm!Df2SGyIUa{)oI6<J`NXDPEU-IFn8hL(3`UR`p@
zJD(~dGPrW&ruQayvqkP3&<3g@5wvOgq{d!7p>33GFR%%F;72jXlF`IC?j?PTIG%t%
z#Wefr)=~|txP>!3{P7a&DhIo|2yG=1n*5_?ALYaWHCp2rK`1^<fNIrROMh$Y(+;^G
zlN3kpbHX10z?(-=d1RkbKY6q{TYmDWBm1Z)kDkkJ{N#~qO8wlmw@5zO-m>^~&b~{w
Z@0zj{O(PTq5o&$-l{)^f4Q5rV{u@d&u*U!Z

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/metrics.cpython-312.pyc b/eval_framework/src/__pycache__/metrics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd3282b66c1911c39ef22ab3fe6e17692fe10856
GIT binary patch
literal 4623
zcmbUkZEO_R@$KF2eXe~6XMFa>;7fqOrN$0T2no0}6jG>4h!he9F3{7myZ&PDBX9Qr
zqdf^#j&ezzVmC?8q$oA5lpHlsD^et@iBf*FQl<X!<siOnkb+d7A-{SUq$>H<dHdzF
zF;UYe?RztCK6c*B?7Z23HZ=GU4ChBr^yM27`WHd0L7Tvx(*P8ZfCNfHabu<84!|9f
zGfBtkBok+nuDC1dj=L#BcS@e5H||aP;=ZIm?$`GyiH)-kB(eh$>jev|*f>EMSUliB
zdyqg6A%U52n0vuND(3o{Ap2vqM(^vF6^-8A&nud9XKGOMJeN`W(<x4hIki9ri+sNf
zXhP=GSkzd+IY}}xPtRrp;tkct>^6}*Zvaq0B8pQ2f)i44r$`G9krAAtOJrJs+CzhF
z^n@u8cLSZVXu##Mxm<NzUePDGf#w$(k?nC1J>&s?veRJffM!pfOyi(8=F?U@D=N63
zmv?azpOrWzjkBvj^46b+W<Qyn8-8tW{Iy$`&)>XUsGcoq4O~h-A>xD#BB@@@g+(Qc
zQ@rU}-DDC|Cp>pf!c;&ilBq#Tu{j1zkL$eVBsf%LMIuM36mD0k0j8eAfCZ-&!vV{m
zbxfAW4j_mnot*)aFKUk2kAGeL-Fr7Ly<NRjxOwsY*{Kg_r!LK%8L6K6Eo{!6&)b|f
zuoL&n+iZ;JS#<Vo(dzK4)sYE9{N}quw&q1LHuAjgsU;ds3*H?vwEX*%;p+RpoO|!w
z+=u7q&b>bO`lr?W<gLHH4W{o;r$k%V{!@{|FYnxc;H77tKYV0k=OaIk9e%mX!oMc2
zh8%W4X1ZjN!+f8{^x||jqcJ3GnV18!z@jlpPT~7x5|3!~+wEO_>7>||J(^7^*{&^*
zzR<Pr&<kD9a>`G{6yMh+9_OS)59X5Mi8MaeC1buTY2;03W>9MsPb!$>m4xA<Q_*{J
zs&g?1F+hG2rqk$lGYYP*gu=7Il?CspcRI3lYIQldC(nH04^~1gh3%u;$6k4B$4qEL
zDYT&+icPbzN+5IxWRd1F-nS_KNZ#oAmjZy~-)ToLIsof{or6Ty7y>n6?(RlsC>}U@
z$B~r?kt`e_T9yne$3`8)O*HW4r~@!lI&=cX9NDuVV8`j!2Y;M<Z^Vf0?9^YX`NFO9
zMVo2^_sg_xQA70q$t+WH@f*LTI${pUf{fI!;1E~}sn7z8Msa2uVDjl}Y=*<KXa#f4
zy}_)fw}XVd8K%>y){NMd`5l$es{GF?YhxdDpYNV@me)T1d0@*JH$oNMqwaUSp9Qu|
zd$xQ7i%d$zSonufZv{@_`1;B~5=4*UQD7>dUX+8_<eVzC8yy*d_|VJ8Q>6up)IBSz
zIp)56dX2RCT&iQac!TsZbx36_Icuem>iiz^E=$f@DKsq6<dkm3u57A9by`&sPQ44E
z!{DJDlXIyI{(<VkF~ti<)yINGb$j+(j%`E;uT<U3S-4PB!G_1~!$a5E`3#K|zxCb$
z5~n3Ybz9sy&w@49my`4AF?>jmA>3^GE#)F}&Zqj|oRF6}zv>+bEKv)n>bEq5t*dA>
zybf$xVeu&>qxGSBEk0|_dBL|VJPr5t@1Z*XdZ8Nl&1wKW3)tgVnUj<ZUwSV(j<7>;
zrYM1iDiEYmu94WgjD+jH5&@84my3pKcPWrj4a;%_tTTZRUS1T7ChPnQtIAs3Ikqlp
z%W*c?oDHf+30pc;R&5a60)5dlK|WU)3JI1zmZ3fH9kj(dz2Bj8F>SHVMKr;8&YcS?
zZPr@{NHtnAgu1LzgLb|4pRg)7KruG^DTw*Zb|GqXJoP=0O|pS|%QmTEc3*Jr_tnd!
z<ACO(`kUde|1dQ7(XR{?nuWP{Cg<L~Y^x%#wUK2zh3d%A%@5wJH^^u?s;^Gjf<`U(
zNm9}*-I#nsn3Pq`&G)7Ic~RCFi90GvaQXN1I1RUfn>#LIu2<9=PKh|35c-entu5&e
z2!mVma6F%d%YCrE@ztg6#@E0|<7+^59kQ-vfsyFcbVdT5T5Vn}Q9hl_WED|}%2^()
zlzXz0G#CrwW@5XG%VflqfL9a5d8$97Ib~7N9I1>(Cq*u$IXOXqUMGz~gGNiFcV{?^
zxj{^dDfW<wluFGdrF(lttkM0co^))5=Hw+AuOSpx!CC9Zli>oyqF_2v25v=6Y95P2
z^H@}^5&C~<{*?ruhPFw@8;B|&`4`Z=EaL~stD)Ax3kDOqf@D@w`U$2hF)+VQISz`_
zSo7-y6LFhOmQ1Rgm7Ml!+|~ipc?pyE)Pwh4*h^SCL7RLK5<|bUTOxV35^l}2H5Xdd
zI*%A{cYb#z9GMBnO5s>>UGe0F&T{yv{NDOKwe(1LIsBvi-WzPo4BK8}+sB<V(ML<s
zM<-j#(QQ}R?UiK-*VyefKfIw9M3J_9<MoHuk4uyL$`9?FN3=IqV%OyNj`V#I2%p`1
zF;qM_vHFv>Q+*$A`Yg7m5?nJATwe;VALp(GH{Mu;)x02kmqks@g{`Ao$99xMkK{df
zU8uSBtrzpYN>j@`a(W+zLp4Xno*GS$_m-MF^Lr|-(RoBQJUqfwf{{YwXyaJ+T5$bs
zCv4Ri;J@ob&28s)k4MUF9pzP<^1d6yNxG+I7n!}uG`q6@;OCJ;S6&=viW~pbIeD}c
zed_GysaHx5JUtybl>a$Q{{x5ZHUCw%W5I|1YitKO<|a61OQdje^yD~O4sXh{*Tb!K
zSj~f4AH3e)SyZPEPalX+4;;JNE?sZiSZteA|MAkaAYN_jx!w{TZ!4--UYb69<Z8<=
zz90APfG6b3yYDt3Z{Upob^mnJV^=+oR|3uX#&70h=z%!(jTSj1lfK|@-J!i3Tz_8`
z+S}pU9}8%nL?Q)05Q&85OC*wMAuAD>O(b5)a*|1LClW%MC)={ZDg8X;bYEJ44_?98
z1v>SLfJtX)lmkC6ak6Z1VRAuX(ywVukCf&V{8NB0AWSYE{aeALanUDI4&*a14WTdS
z_8ZKKnj^?Gjq^1GOR;ZKzL1<J;2kG&t<-6?bvnc{PmCQL=g%IgApncY<gN>Y^8_?G
z0jjkj7wCJMV#4E(7wO`G$+g9U7aB`zx7HBZoZ5XQbVa-pDRu9g*Efjhircz?VJk5+
zwgLxX(U?>=nHj{J3G7k^Gw|PSoZmjJ?iei~ACus3FV$(5fu*(2O==(cY4`Su%09gg
z;ys`gZzYrNnCA#&JL_9dw-KZr_(dR)|0MmFt>&aC>PzJP60Q6Sb(B%ZS7=uW?fMcm
h-tb29;TdnF<c*9yS@y0w?fMVhLs8**L}p#{e*q9_vf%&#

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/utils.cpython-311.pyc b/eval_framework/src/__pycache__/utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40fd14fa4ab5cc4f3e73bd59efe64de3538b5983
GIT binary patch
literal 6339
zcmb_geQXow8Nc)0*?0cLi46$_k^&(Znq+~LLI_N>C54W-r3ur>S}ZqrNgVu<-JMID
z+MQd`VJf<UNQcYE>Pam%Bc=YM+h_}P?LVtDX|ki!5}m|^G@Q`Cnu;QIOq%w*?>Wxd
z3EisgweNYqp7(h_?)g2>^WJ+lo0)+0kB>GD{@G3te@B(s#1#v-{{@9p1W)i}km%KN
zvX{hlL$3jHLy!tly%dT1s309;dKp}%gT|1l*A!xVS*SAt#~THczzJ->p>}o_qMznX
zhv{B3zl3M!_wpPb#q}F`^I@jf!dnC@Z$)i932inQY2)p9oLw(f$KwDV2VcWi!Ki9@
zT*c8W?*@sB795SF)BIu>>Ey|7!c+StTCdkbsx0r91vwBBpiDpMmj^utm3}nPC#%$s
z@C&M?FB0w#4ETnjM>UFq92LWTWNEVmVT3XT&+S)$oFZghUkpx|H;hr|q5mS{!RbLF
zPRgYkK@1sYdt&5y7=5uMA#UJl^g9n67fW0u5w@%%S_z3hWO$z1PaGmiVn0E`FJ^$9
zG0~Smp1B@U;kERwt5?5#_4Vnim!^Jo<mM;8$IY1R5C^1oTtb|>O}5wT2i*F@2Zd{g
zOMJNS6g~vqXl+2@{ZoZkFJlgs6z?)oxxZ=i1vp?K+!x^k;Q_CQQt8OB5LW3I{Gp)A
zN&bGp7mWCM4<(vl(JI-m8nvZJ=zw5hcek_*MnXbM^x0@wj<!7bgPxW>`+Hit{PM#f
zcCbZw&L8yki~f*sC?Xzgk;J~1s2m7N&BO5OkZ7MKPjrDi79Q#6KoZ0q7h$$152iL|
zmgKnBJlCpltuqA4tjg}a?sSi?NRxTzO2xS{X`G>;6?b9sHDdY(k-4^oO5c9_?Ly5r
zEY3uiWQ8@Hbhl9-(wj}6kef|n6*Q=}zNjQeLcT~;9*)X=r56r;G(6};ZvO?8;}j7i
zMMDhK7%A8s@OT{v<A#`lr{?<NRLqbD0rL|m7*nobar&OTM&4Ax%ji`(MpY=VF-D6I
z$jscVLnXyO4~-W~9ci?~d5LE$<e6f|m}$g95OFqah#BK-j7{sL&k5ieo*CoH*9rh3
zc=Ozu@Rqr98o_v8;;p=`Tt=LWF=?c$^HPi(vsbLvEIa4eVr<NeR$k5nfOSOw3S1aB
z)1z-sC6C{_cCv8fV-2u6{9%rTU^|Og9RM+=_1e^4f9uF^3ztsc{OG7AUK9M)m1N=k
zD^tHdJN3r#o7c`1l5b6)`Ecr&@7}!n>GWvkt1CzGq(~Yr%tL<R(y^&GKb$^&_#SIk
z%?O}TzZ{5!RimcZD*LQo5`26>RPFw#9PuIJE6M)QaF6Qj4+MoUpxGx2q2VA{<`L_Q
zp$IPo<IO`-Bz)J>u)iVH!223@HFRAtsHS1RD1>DRjLh=^N%ln!s+4~?pfY&OU9v?b
zeI8Q09}d{VsnmcVi%8K#q=hOKjvP{1Nf!Ij3Si7Zv?h}%z{GumYV41QAwReQ;P6;5
zL`63mXVdmEYgv#IiAxc=5A||*ci3HUrJbqf;uti8886O<wyCnS8<s#d2z1cJBvvC~
zt^pw|h;R%=qk!Fz*a}^X;gS9cBtgt9CRjHzHp?d+%W~|p8?L3P4#l-LQ=_;xB)Xt6
z)vGwxCU#A_S5COsjJwz58Xvms%DcBK?(GThWKF|Fjc2^ZlUuj#@=N&|uTta9(e67&
z!s$N#lN0t)dt&zu6HLG`bp_Y5(LE=;quvSE`f=C#Ojq93uDIHh+@yO&x;F1#moz8M
zH{2_d<{2ZwI3{S<IPFUHWFF1Yt~}kU(5*ST6}qZwQq1V`)FH*$IN@v=ceZ4yY<J$d
zMR9IPJU;2DpKvseI~p@XIY(pOu|;ugN$e_E?8%YTuDoM)-qN608gg{QZP-46UYHaO
zA{{_yi4MNZbRw`~)A=_jRQTr#e>+t2ylK7hMYG@p-kcmWlp6$+kagxDM)DL-k1_K+
zzCoUizChAY$z;XAXG7#;8e`9cUS2F2L5Qw6^sAhU=emi7eLw2)n4=L8kD+?=({qLQ
zQ`4WE1aKFU*~hz|@P2m)P&~!<`q?q3Zv7@xg8%H%w%2Qdb#|#oTn>|}<RKBMu!wxy
zm+1I90iVV|m<N2SRv|1!MZxEn`T_y3$B0p=8a4Eajc6Q1tAt7;kgGHw4Glv~uo1Pl
zA%g7aY)GIX!z%fo6MV%*i`${U1M&P02zd7@g5i=iFMAJr6}onUt{bQ8GIUnX(RF#c
zL!mozbVq@+B?nVfdLYL&=ecHuYeo?2$liaQwI1V+H>O(Cd$ap;?AAQHRbjV60Yw5G
zm<baU33Q-HK-V_cTTv>81NUZ?&=#-M?pR5Ewvyhl#`Kwo1ge@sV6p;z(!wg;hNFS9
zkn^4@{heGTbw2|U#W@4^um-(i50zB2!f%&U8J1s4<vieZ4QCB;O1704klHT{6k2x7
zaSfq}4W}3tqyMi!iGo-iP;Py2_~w=D)UhK|=_CIYT#E0dy}meRF8D<T{5#+>V)GTp
zVvpuAh)u8*@rQ_PLZpNu@gb;oikpFYJ)Gvawn9avCGd{ehlvfS&IP21=BzM^M1%x!
z8xRkp$we4at;K1ztBiBDb|@eZq80fykfHa9U63ATMP6|ybUqG`bO#8AnX4wnp4dLR
zU8!FCKA+uuAv_jVns?;ZKdJrm)lVk&++m5uD@LC_abWbogu7|n-IRB?DDIZY>ZLOb
zRco9ffFOrsMj?4H^+={M$8E@S8x(E>LQyA~^<zxts#c}<W`1@#nqzn7*_{fz6AB2F
zoutO6QH0A*5&^TkUC%SWTm<NU8X}=c{j<i}hi%m7Hu_<w>2nteRHb)&JD*VLU}Qj2
znV|5T5LC_DEAc`2pqiiqMLd0_@E61&kAOYXXfdcy7<dER4NG2#0GO1Ai)mdSj4mdC
z5;ZpOmiRw}-Awtc0A1a@m22XSF$&Cv=^Ta41}!56E~cSUSgnV(jK5K6EHNTo0{QGH
zl$V>3az7X&%jPI~*77lBJf#`IjIfm#g8{>sx!hxe|3zO3EYV-UQm~o0b?F+qMW1@N
z@aZY-zKfeNDG$o=+XuUVS*JgKuaLZmxv*WpAth3YS%#_fmBN)bN(@CkneClEbE$Ck
zP2B4pX=_*u{u6(ykm{T28`h!-(xVmeq`Epp)RKPydNuDXE`xojmbqZuH$peVTyw?-
z-0d#l5j}{J9rTC!pdd;R_aeiud1$1o*x-n35EIhr0l3zy7KnHQeUISpB&rEwY`+XD
z&qRlZ1yQwi%c9^9?a~CQmQIM#iWRU<c*@;KE;1|F5~?gZ17tv@3R`58;KRc(d@#V%
ziu+*10eGZ80f8vEieRg9rrJsOkCJ8xULgQj@^aVVt_ga{IK3oIXTbX}$<u8L-Ik-<
zP;^^6YDLlQ#G=OWMU5F-e$ggn(Wb=i>yGN<j~$I4i|6V(GW&sO|Gc9Ef>7F`&`Yk{
z-ER)0qHn)=`o*{6XXAN$lVWe0bkr8C)r$50ymgggU3DFm>(XrYp`3MV-nvz>ZUt*s
zWu7sa7~|IjkZ%x~DJS9v*h|^xrBe7xU?ZUiB46R_Ar#fYl&>%y6}=TVb#ymg*tnE~
z>QUmE2cCv#Ax_2(`q^US!Xb0{XfPA-ZGmUb(C6mOf2>I_kDu4i8v&4}C{h-ThpD3{
z3$JJOej7JqQVz(X_gcVJk8Ct=``&4!YUhu4?Af=YXZI7{`lohw@7vSUJ+k3@i`h0}
zXs;ikn%nx(cQIyI3_zUB1i-UP;$GM!020LnszqDAc3ou9ZCWUggl%}}Y)~Qo6b6mJ
zBfSa)e6*cl%*m!aU8B%7IlAUL&E?E1@^qa-*FiW8)#MrklSJ2z6%B8PPKVx#oQ)v&
z_#h^Ec4^XbeBX7}cKp5x=gM*C%Di*6;#{4xZ-jq_(Q=@9woPH%a`<1E6Pf_QSgAcE
zY*XxnHF?FyfO@L5Fa<?>q8AZlUeJYJ9t;MaEnX1h7ly&Yh{zC#Mnu*jg1+V9OE5TG
zxb@@cUuF8?bDxYu6l@ysL0lm4<T#eO%XYxUI~o+Wi+&J*pd!5n6e0+coFps>{L>Ze
z3H+NRmgma<lSECf{69%7%PsVup{-<9s(XfjEZvh?m60-Q-#LJ*m1Nm!a?K0@B<)J~
zWj3Y<-&ujHf245sg@m(8vWK*i9jVnwSVsmvNkW#@YPhT;%c{suGPOD-rFW)Al%?xt
z2-J}6)AGx_mM=ipE+xTC0ZCQkEV~hBU%<McF}MF2oL7>~cG!4Zno93Wo0a?4&k#`0
L(m1;(!SCwdz~L1q

literal 0
HcmV?d00001

diff --git a/eval_framework/src/__pycache__/utils.cpython-312.pyc b/eval_framework/src/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..487e507d12e013733986882228641bce43f08dd7
GIT binary patch
literal 13397
zcmcgSX>c3YdAq<ajs*zt0&j}oA&C%mSf*v!G%cHwNRtvt*`n-NtZWwIf)pqaV0Qsa
zLO=&iq!_Aci;9v6zG5hC$M8_Lk(o4Re8h=6ZPn>a2QX-pvtg#nRHWpY{(z3wWA#tJ
z?=6lc1-t2Z>PO=3JHK~*_j~_RP{2_T-u~d;Bj=ha>fbSw3Ho&6=4UiTou_z;r(M*b
z{G|tJl2;9?$X7kcz*psB$Js%a#x`n~W?Va{C27W`8`lr&$GJgn+%RY$Wvt6MZX7g1
zTH`3-wGO?bz`>2EDq+wOhS!~B22H$PSqg0oj)HVO2leJV>I<N!0cr|)Bfu<xpNYUO
z7}4=&C@<m*9mRYRmhtq6hA)P)6263tQYw$Fs4azB3vVH{`gE<<!HjLpY*6kC<Ca0|
z@>Q*I7Uk*DbbJN0sNgH}Lx9zN733=DSGfu@&Y$6{p{9zj=WAe$TKKKWj#YP(9jxXX
zR;>+pL|(ZS>T6EWw#LtK5&bq=((!h$!|NP(K$>~f?mc2tOU!QPuvb!dxKBt-uT%75
zGq3%y&jn2-!?4Fa;v5~CfVPrGaCm)!dl)b!8#Ito!|&#Ch|W{qY+V2_iC0}z&tNTt
zZPX;~O{XbpjFC%~ae$sl_nuVo4F1ii<a(M4s0OG;YAYqu$5b?RkfP!5dr)><eULgv
z+gRVT(2}!4So*8TwGTi1{L~9qKYV-nr_Wt^|5v1#L><DY*hW&=Qn9o)tFptjci&n1
z&B<&#lGsm{uIMdySbFRH(y3X}LZXG+tj{~L>Ap8%2OaKV5ASr3_TvtFCLC^wIbk1n
zNqW&f;uvyy?7U4aAn{4`h@_FHBI5SI#BSfe<%nn8vBh`T=l1%xY`g2gmfnL0w)EM(
zI|1yGEskgGuAvdZKJGZ?5sq#Vh2bqeuhS)No;V>H5cH4?PbkApfuGn0QINV-N^z#p
z(Qs?Dd``bLMbT_y?6Jin%lS2D*F@-KQA0@cg&GPk8;XK`U$B|fZBYl^&y;OZzr)_I
z6-ptq0tIl^_`A6uqVp6UJQDCE9Djg*h7z~{q_9L_GpPY!9i&bJC2Fi2r3Vzq14o}m
zh416xo`MEskFUJ{qP*`{X5YH@;=4;{e!3jJu=KO%mqO>SU5Jt<B+7TtnKM=#8Darx
zf5+*j!A`$$^^KSUxTLmEI3>-vhj+L{i6viuQ@_V5msrIK$FOt6>ENwScN)KZg5B%%
zxPhv;C+setqf-z(f=xxXQQ`nx`mjXAb*ojdz(*<@b$Ewlm~haZ@yvnNLS+T~#Jvy&
zsZ=RdToGo^)&}>aG?cDzx-FrtjckZM`ANgQ3+BpjbHdyh*EL?&7f$a<=+`V1R)x1G
z3LE3*rns)@im4)A`B2jIa9sEBO#rv@P?0{#q5cOXa)&A(<RqOrwn;eRu`JCa6gdF(
zvKi`~ROjeEpqk0hI1^9{HZPm$F@~IERhUWV$1+k%QO|3#kk|kdVEqO_IgMKtU?w#I
z%^2iX2cG5Gi`vYrXv$lVX%Wc68lY@C-}j)LpvBc6zrGxrzV@4QOV7P`^_91mKKzM-
zHINqH7}350xWplm`v~et8jsIA;qywm!*<az#5)DaWcPVJL&(TQuYG*tz-`0YgKNjv
z@<VI8*Y>@slC%?c!Qu9bz=w_lOCRzawP__aAeDgUD&WCNYPaW@q!Ycu2*LqmK8ldl
zP+*gZTq=;q%^+U`c5TG2dNL6r6ZF|hNL;!(fDyIO45<r(jL{{d4z~kH!7-F;C2WOO
zo8TvYA0mRzmg;bOqO>_$ktn@8*tcM=2@fXB&B5*kOT)ZnUBa?1zJB{`Y0|PI*ndky
z6<MaAJZB2_{84MUQd)Dq_iXRE{`u0DL}^R3FIn0a(l1!nL@E=O4X3%wmNg*`h=euI
zGo=ZpG<+bsJI<8OF<Y+~Od<dC55<`(VGT?}xC<hQBg8l41|BOFjW}HnObvM*;}a+r
zhj9Zl0Wp|RN*EGADf#IESYQV8^3sbsS3ju^sK%5dq(=&<FJ{@|B$J~YFK#vo0V+ci
z0S4(IlLM?~eOI9g!PzUb?_E9p>!r6}mYJLq$z_50^+=vOzc#B9dkJhj@Z8q5S(SV=
z+u)A<gmy$yFHOGy9hP7FDBF}I?%>W#Z%-}%<d;`pJ}HlX$E<JDTO@kuHr-~Ug(_IK
zjghp11CDOkA!$ZD!noZlpn^xJ!N!s!7w*P_R){1sk<qzjNV?37MLcX=V;ZZgvREy*
zPVWs;hR_a8_rNd1tE;H;`uVcvL|JpRBs!cdyL-N@Em77s+xlB|vTS#7--6jXZ(g4;
zuaA1-=JiSQj$rqq!8GNa-jg)fMSzuWNE%w=ObfwMT#x8;A%)}IaKPcTvIIj?6ruz{
z%1e+~p_XNptTAPK>;qve%Z}wd#-PU07qiF<Y%_;$AiH_yBCAmGRNgGB9wSCa4i2UQ
zpvdtWUVBkDgNz6Qm<D?M%Mb@mozSmV^iJs0Y98nFKo^1&S3Y`e>8<e9_s=c=?A4`E
zY|p^H{;${e()-nB&GgA?40xQGO>DEufE6TgKv)ZdO7t-S<%M8^=yTk<F5t5or<(^p
zD;XVb(I+^D?BcM~*>BSj#F8{J7ZzHuUm-*iGbVaK6Xbp46QKS*h~+ylDuF1Y{mJ~&
zDB7QK45cS6bVL20VVfr)0!6NvV)dblAN0>Nl?kSDo~cVPbx|hfjWcy~O#2mmLFh<W
z9T}a|Z$>SzJyyS{GoI0()=#ewZ;d<_d;C+~1E}e>e}T2vF}f|H#QeLejtcefD%g&C
zt)v|Xap4>m^W#+mi19xiuM}oM4*${^0K;a(D}ZgUsfZ|7<?w6__0LsdCY{eG=reVo
ziK<>zf%=gX^r|~BmhqZ1pvUlxR)v5%z<eW&^#MQy#$J2p<dqL%%Tv!SN1pqW@HTx4
zz#`>7IfLVW!CQa9P`dAC#5zzpx4=|{9*iEwD2vWQ2V}d1oe=li^s<W32^onILE;f@
zL9k(7?-V_0_K901bYb)eM8HD{TtsHcm>yO(8jCWyV@~f8oRM9|_Enrz#`knvVIRP8
z!cRN}5yAJ;im>k7j`@=2L`m~SK6c-Xdwz3AVsl4))6U;+inlzPD0wv4drL=^tvP?_
z?4ff{&s#PnESr*+Eej=8*I9L?CeB!HacKXz!n>mD=k#|YWpvRodzv)j#f_22qJJ^#
z`&8G7w9!Q)adgp09bNQo(Fim7T|?DQR{b%{?lfs7gWcuI5a<ef2v<gUZ;;SPfD%h{
z_@9sKX7F`@8U?E<ke3T*+U%yDdWfP90Rx%@%;xEU%1KW$0cI@Am0|RB2!K+7w$)5p
z<p$oZSJV-^D^mwp&Y;8s^~I2Bllum;iU&<ibubD`E#G<+hCCZ!$FkIygU4sNz$CUn
zp1LaRrp{=FRio6f>M4*TsM7Py(d~c{^prlJIjfp7GSs9N%GrRHXGUp1^e0v)z#D*G
zH2fwt$58Qk7pe#aF2#Phd^))N>glxN^YTeq7F_%A;_?eW2eoLo-D~ecyP7m7(Mknh
zapXXd`C$D?*5%UFsio-|Wq+BtL8DnY1&QrLQ&T1`6c@KSY+&8x0ndO<C#fcQNuTW^
zu}+{@QPTJ(cy!*VL2Z>Z_6hJlKr<0cN)%VeCGD`s<r{a4Hl6S&zyx^&Fp2&Hw2R}V
z(OOaqU?Qma5lIa_B$G^?U=EPp#4CV22!kxNDux9Is`zQKHH2`9pF%hc<4nR&{8xyA
z)b&!zSiWd3Iuke@c>dYAt`;<Nqbaz1p=rx})3!v@wq(=x8x+G<CUn*iH{H8nE(>#U
zQ)5WIpfjA|PIJ@k5q+X$!<?>Vv8FC!j2=u_w<l}v3#miB30>s^NIS!6!}P9CbrmT+
zfVpL&IOCa`(>2q4QePe13B#rHqe=Z5l3&;S#^8m)&n%CGhhwd?8<VYFKN^l(9trkC
zw4gVH`opg9(+OisLccM%^Rl5J*z@HLEmhV{6VC8)9k=rV=Hm?v#8RQ?8y~j|Cx$X=
z;|hBrtC~%L0L(8!S6Iwh@+D-RM`#LVUoR?i#0JWh3Iv)gU|DaL(8rLutqK{2GnN&P
zxdxd!Z&9W!XL&fmwezYP2$LE^t!7oo^aM6mmu6$yNnJq4Gf2D&csia{vMyfpstQ<u
z7Rv%!WK>i2z=raS&+Crv2C1~mNlog3Euyz-RRGq)DOd{^;COwOwQy<HVwmIt2ACyy
zTPBU*UohgxW96$rK{m6ZfIeU(^QvMfNNacwb6)I5LV)EByzydzLRl{GR%dF*YBH(Z
zH=qfCZ<{w?EL14VDjxoF$}(7ru#c+)U&I$@+Dw|@#7#t{GQ)n<uWp-L!DL~eaIEpp
z`hb}?!^}%EGe>XiSBzSk9kmEXEh3{9>nZQr4Ail8nN;o@DCR8}%Q8K|Ph0}?Kz(CX
z2$b;Uup1S*d3??J*Lhnq-E+cgvNTZo6_6`eLdIQI?z4izHfP}GzI=6neT8~0>)TQq
zNn`oi*96Qf+n-RSMHYAL4nOzQP1*ceZ5B=K&6Io5m?fc!5-L`Vo=0`&t7fta!9i-u
zeJtZeul4;3fDo}KTR}<*1+r=R7tbvR-<M@3X+@%MC?Wq>cfx9B3WAr+BkM9Cv+&n~
z7d9<-tX_{5%_nrirh9`Mo~XZR0VjeG0|_c1b>@UCsK}_rN*u~9MT^gIyMzH!HS8G&
zs}A&F_g5>?ewJw5erEHq_>4^jVj0D*fQ%HXj=RL;k`Au~CQcl8fnQkTaF06O4oN#8
ziw)?(iANkDuF>fvpdKvg_S@Y@;W|O|Z_4zvx;<W4AvgeCE(gESDjs!COh6NB=kZ~O
z%j!5j;Ss$4qBKylAw2Gk-?D4qiOjy_ift9L;*XMAV!_enJt1jbo>7?EsH8n+7u=A@
z*#bf@v=g4h2yIdUty4)0PDvhalZdgKvkrv&u+jjhlT|@9&=NLBI6!L1jK3p*_9DRD
zlV`|*E0D6<guL{eHX#{$#&L1Pl`b((_lU=4A*zh*$r&a<mA;rEEHP#<i8wB^xvcUd
zJO!<UBN+WTX`j}2@DxC)l5@m(k&H0+taO)jm?J<req;y?BwU(gBCE)?;^aJOr#U7s
zmJ>2-8xhg*Pjso(k<uZMUV`vLX#6wyi4L#?h=Ej6KJ8ByHw5>BUzBTzv?TSLK~K@@
zpF95j<BK};8S`m#ykh&D?!E<`3Dlp`x_JH8M9J1*?-fh!DwAokqB^`cS<$>uTzXy0
zlxl*#DI>M^t`(XYm)6WKmp4Xw66Kp0t7^lp=>3T`+mcoHg2A<!3mK=MOz75x{h#PI
zq|^+{ffA-OgdRw0%ft1bXsZ`2<soiiz3mPE1^;K|JrQ+mV7528t@~wlyu2r5gy?cv
z!wrho-y7;$sBesPU8tI`Z%x#<ChNC__DnzXiE&Lz4YjbCb$7vFhZ4F5^n57oyN;ZF
zH@z`JN9$+TeyV#I&ASddrK3ul=1W=<B`wiMVoxMX+Jm5HR)r5G%(h@RKrJrCi?L);
zeQ?hLW143y3C1#gB;tsTf-O16?0_a^>ts7KIvP8cv^*5-Unr^xKbt7p1Q#-lk?o<c
z6qwIEefsI}o~VDWpe;pd^^elCBbSY3^TxV_u`Z&C-T&LpxUp`|xChRyuw?oHboSMM
zVs2cl-4;8Xti6AlgKL}l!bZ5aiFC#Mmn`wZ#<{|MUzicptqO`YKG*w~z44N!$e~Do
zyr^}Kxo43v$YtP1k8)7<DRa-4H}=w0#UA<#6InYLwH@^BFB%YtJgff(r;@Q8_8VrM
z#g|7u({;lpL(y$<7ogk64|3fs^Y=Oi;~iDqocbSH%exz>f8;t;-DT>3GU_06iDSD<
zw3o`*?mF$IYAu#Dtm#p!|5-&tEFdkeu$K)`K<wo<;InwuGzJB$11N2@LTSSbC_1aO
z(P^bkHA#a74GJzOZiAq>G2m|`ie)}`ItX}ls000A06R4Ar8GbSpD?Ji>bw@V22?{T
zpMMIJH%5L=kW;h4R@I=ynMsRZo_$qmS8rpesY9UDX6@yCc5y(R={>0pXtS0s4VJFP
z_W}SX(r_vC%F_Gq$!6~Ii$4Ot`fd<uUu)X(BW>2D7hhj`<9S)Yuk3(4i9o*77cbkg
zWRN@TS6OJP$DQsjkZQ<6k9fco1ve>}q##dhjDVNMlBSRN8YPxokxC3oJFrfPP+^cq
z5-BJg#I|aeXIN4@L3*R`5FUpJnj%&awk_DPkul3y(X@RQi&!iYWj8;%mF~1(g<}W=
zqib0IONii7s*L(N(-2u{S+MtVvGoSUvF4EGa$)iGy=S@e=CkIAZ<d`lCkuCk*vnNl
zuQ$Kce1p>H&7nOD#tNDK7b+VUs;mo@)vuSmR2Jc)U2*H(iON=Fz@3o|iJDExip|&c
zY`y6}DYn3TT@AfcdKd=i9=KD@*_yB`S=0jj$zTDV^yQ7U#2`h-`x$O$1@kfk(F$5a
z{|}+HJ%IO0<5OuQ=Dby)0;h2ml*)7pZUnM82&Mof8WlAPd_KV{u6?sOk5|X}9)Ac*
zDB0^+Fj(0f^qdkNKq&cCTP?5S^%uDr^ua+;%pSQ62{}`hHxN^CIyVAuRP@NLw;*q+
zDOZ|*Ho&t$t<U<fRB-oU&hji(nrEru3SA9cW-sAabedmfc|c~?3NsB}QRL+ea1AbV
zoJmb?MPTw}jpRFF@rAE2d=Z$Y#qccwqg4mr($^IGKh>x7_9;*P1e-QlRKR;IzM}x0
zLpHH|`rOhBvHx$=!SCQ6{1$@0Zs760zK;C|t(&YJy}j1HeY-n*2dumH^*!3LzXvXz
z{QS2tiwFIdmDUaK6yY}WM8Cfp%Hg4alULe4=~{^y6`cHS_2zp<4!j9Bphp~b-XXw^
zA-wK#IY!3eLBR;T7V=_BFCOt6lg*S-I7rytP2WQBSMBT^IADFWbAQ*q{e7ML2dp~}
zTKhT=?C;st=I6hK32Y()%CBy}`!2t#%?gQDNLZV1(>9x4VkZO?GiVVB9*mIV5TQg2
zAGrPFkB{4r3umBMVtwFe#itYKQe7pEWC6q-hSwVMs~$;C;}Z|Zoo^)y$hbopUH<|A
zgeR7ih<9$F>zgQ+3vHfbDi;}joNJk5HbNq_{$&3j4P}eEg1BkzoNgUx`FRZisBW3l
zZIr87ZmM}f2MT8KdG0K?P*lE9RJKrDcFUkKXl_v&Rtq|zQe_9|A@`iFmW)E2uTU^L
zWx8x@d8g{ls?RF-MjB({tSh;F&&!SR%Do{IM3>8&Zcu8_GeM27tPXd*R5f3@AyK&@
zS-BA;;cC$r+OudZntnpINFux9@D@ZBZHo=ZH?++*%nn{s$J=`^b;OJNKQ-=4X%Q^k
z>cD=iKh^)`4Kr2MoA$@Ha=Wd}$2%B^CD!G52JA^rzPcKM7cRuI#g{ZIJhNHhcmy^Y
zUvdP&2L!JcAt#62>!rL3rCO;_g1}4*A@CdiU*9&d8PNis;5?)AppL>-srEHB@vt#D
z6*HeFH+RA5&0fia()u-GA)^!rXa(fr%r4}<GwGzFRUUsUTrKG?W-s_hRUn>G1%UwK
zowgZ#Xv`APaqaEj;H%B^f4lV2dHJ0mDJIcMHxoHg@C1>jSKoVMDKtY`p#aJqbEd&^
z<b$OT&SV>aR|C&JQ!BshBDMYgd)79Cc1oVeioo;NHly`#lapjnMIPJ&qJ0!<<tG9{
z6Rblr<Q{Gffd>f2%3ZXA6YA&CMz|jv>W<jmybBnz%^-VTTCfUthFrD^8!$D2Ee-J8
z$~nA?yu_8X@QTas^$G%-|5Cw#7p`2!yJb+4p$i^^q%&|QO9uOEg!4BTZNVNoTs>Y)
ztbWT<0Nxqj1UyY8E+647XuJ(@PkadypaugcD2{6@7c6^29B>wRe^CB|zImoR!IVds
zD7-u<2g?&GiYm_=&l=B}<_p&+3fD&ql7;sMdlt<l(~mqq8L5sQjP8YJ3U%!Xb9=D+
z3S*EVESfAo9u50m|L#lQePuFf+60dmuNX__jr9p*ebU&th-qRk>tfsIj1Qz(busr}
zS}m&~k6~84P{<PTWtcY-vB1JDDUI${H$1|cROJXtsm<g{b40#E=T&I7IlVTYCL4ga
ztTSn%Rg$kldsTxqnNNFV(=QF~)LyepAonsBU&`1x9~#R9cV)dx`M!V@lV~OJ$et__
z@IT*^g50|&Xdu6?CC^|9Ait&aiH_dKI}Y^h>$mRj9C*C<zyOemzxd8#|AXI1L-V(P
z6U9pSF>H(Q6NtKmpF-RZ)bJ`6Wo^csFb5;F2dnS_n8bRWULZ;|Cq)J#WGJ!&E-jVQ
zn<vDe@-d_jCnVrTvyj^M5Zq8k_C(qf#=GLXpA2@QSl%?pRN!~*$zJ@dJK2X~S-y)r
zHtkI6t-+m8kxvenE97t?%D8o>`Ut+`lk%Sau6_Ly1Jo+o=zd$R&;`>Fkf+N$1Ku9F
zdAlgD7nusa3H2UvIS;4bCV5Ywvsge>6Hq(}crPh$3f?A=x8p)BrnX{)msv0zJSHWN
zktCL=j^v>RF~Mam2|2lJMPI-a@({97Z&LExO7wBw*0lqQ`CN`2!mlBVm@U2uG5DEi
z`uCK9{F{DHmHdIK{R36;Z&X8qYJfC2)ac^yK#GD-<Uq7BDn^?xJbe?hshSeHgO1!2
zRYwP+x(ho}6y{_6Z0j$aH%U6Rp_;CaFp+_1W8}#TrW6I)SnEyvq}t5%#%ODbg3s(O
z@`)e#j{H3|bQ6nH4Mw^>+=N57M;pl})`TOsr)rAnE;`&479*Wuf1+wbio$}}u>76n
zZ<2hfxsEnPo1$WDcl5-YTT&EcXAghm`nBsOCR01<ZkpZ}<|D09CUXBvo)m@oS(p@l
z<NF^^CkA1?Sdu!Rs-?>!#c$MHs7X<fj5WNw@fRC!VluU3BV7{qMs`I@BRwyDFGWE%
zcF#@xTt7rt(VB2cih@rJpu^{qntb91pSp?J>-UwzF5eSTM;?!GiRzXV1^HR%0iR3E
JE&Pz9{2ys?;=KR>

literal 0
HcmV?d00001

diff --git a/eval_framework/src/data_loader.py b/eval_framework/src/data_loader.py
new file mode 100644
index 0000000..9a97abe
--- /dev/null
+++ b/eval_framework/src/data_loader.py
@@ -0,0 +1,81 @@
+import json
+import logging
+from typing import List, Dict, Any
+
+logger = logging.getLogger(__name__)
+
+class DataLoader:
+    """Load evaluation items from JSON and drop malformed ones."""
+
+    @staticmethod
+    def load_json_data(filepath: str) -> List[Dict[str, Any]]:
+        """
+        Read and parse a UTF-8 encoded JSON file.
+
+        Args:
+            filepath: Path of the JSON file to read.
+
+        Returns:
+            The parsed JSON document (callers expect a list of items).
+
+        Raises:
+            FileNotFoundError: The file does not exist.
+            json.JSONDecodeError: The file is not valid JSON.
+        """
+        try:
+            with open(filepath, 'r', encoding='utf-8') as file:
+                data = json.load(file)
+            logger.info(f"Successfully loaded {len(data)} items from {filepath}")
+            return data
+        except FileNotFoundError:
+            logger.error(f"File not found: {filepath}")
+            raise
+        except json.JSONDecodeError as e:
+            logger.error(f"JSON decode error in {filepath}: {e}")
+            raise
+
+    @staticmethod
+    def validate_data_item(item: Dict[str, Any]) -> bool:
+        """
+        Check that an item has the fields the evaluator reads.
+
+        Args:
+            item: One entry of the dataset.
+
+        Returns:
+            True when the item and its 'choices' are well-formed mappings.
+        """
+        # Guard the type first: `in` on a str is a substring test, on None a TypeError.
+        if not isinstance(item, dict):
+            logger.warning("Data item is not a JSON object")
+            return False
+        for field in ('question', 'choices', 'answer', 'prompt'):
+            if field not in item:
+                logger.warning(f"Missing required field: {field}")
+                return False
+        choices = item['choices']
+        if not isinstance(choices, dict) or 'text' not in choices or 'label' not in choices:
+            logger.warning("Missing 'text' or 'label' in choices")
+            return False
+        return True
+
+    @classmethod
+    def load_and_validate_data(cls, filepath: str) -> List[Dict[str, Any]]:
+        """
+        Load a JSON file and keep only the items that pass validation.
+
+        Args:
+            filepath: Path of the JSON file to read.
+
+        Returns:
+            The valid items, in file order.
+        """
+        data = cls.load_json_data(filepath)
+        valid_data = []
+        for i, item in enumerate(data):
+            if cls.validate_data_item(item):
+                valid_data.append(item)
+            else:
+                logger.warning(f"Invalid data item at index {i}, skipping")
+        logger.info(f"Validated {len(valid_data)} out of {len(data)} items")
+        return valid_data
diff --git a/eval_framework/src/evaluator.py b/eval_framework/src/evaluator.py
new file mode 100644
index 0000000..2d4254b
--- /dev/null
+++ b/eval_framework/src/evaluator.py
@@ -0,0 +1,98 @@
+import logging
+import concurrent.futures
+from typing import List, Dict, Any, Tuple
+from tqdm import tqdm
+
+from .llm_client import LLMClient
+from .metrics import MetricsCalculator
+
+logger = logging.getLogger(__name__)
+
+class Evaluator:
+    """Coordinate the evaluation run: query the LLM per item, then score."""
+
+    def __init__(self, llm_client: LLMClient, system_prompt: str):
+        """
+        Store the collaborators used during evaluation.
+
+        Args:
+            llm_client: Client used to query the model.
+            system_prompt: System prompt sent with every request.
+        """
+        self.llm_client = llm_client
+        self.system_prompt = system_prompt
+        self.metrics_calculator = MetricsCalculator()
+
+    def process_item(self, item: Dict[str, Any], index: int) -> Dict[str, Any]:
+        """
+        Build the prompt for one item and collect the model's reply.
+
+        Args:
+            item: Dataset entry with 'question', 'choices', 'prompt', 'answer'.
+            index: Position of the item in the dataset.
+
+        Returns:
+            Dict with the index, question, choices, expected and LLM answers.
+        """
+        question = item['question']
+        text = item['choices']['text']
+        label = item['choices']['label']
+        prompt = item['prompt']
+        expected_answer = item['answer'].strip()
+
+        # Render the choices as "(label) text" pairs on one line.
+        formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])
+        user_input = f"{question} {formatted_choices}. {prompt}"
+
+        # Ask the model.
+        llm_answer = self.llm_client.get_response(user_input, self.system_prompt)
+
+        return {
+            'index': index,
+            'question': question,
+            'choices': item['choices'],
+            'answer': expected_answer,
+            'llm_answer': llm_answer
+        }
+
+    def evaluate(self, data: List[Dict[str, Any]], max_workers: int = 5) -> Tuple[Dict[str, float], List[Dict[str, Any]]]:
+        """
+        Run every item through the LLM concurrently and compute metrics.
+
+        Args:
+            data: Dataset items.
+            max_workers: Maximum number of worker threads.
+
+        Returns:
+            (metrics, per-item results sorted by index); failed items are omitted.
+        """
+        results = []
+
+        logger.info(f"Starting evaluation with {max_workers} workers")
+
+        with tqdm(total=len(data), desc="Processing items") as pbar:
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                # Submit all tasks, remembering which index each future serves.
+                future_to_index = {
+                    executor.submit(self.process_item, item, i): i 
+                    for i, item in enumerate(data)
+                }
+
+                # Collect results as they finish.
+                for future in concurrent.futures.as_completed(future_to_index):
+                    try:
+                        results.append(future.result())
+                    except Exception as e:
+                        # Name the failing item; it is left out of the results.
+                        logger.error(f"Error processing item {future_to_index[future]}: {e}")
+                    finally:
+                        pbar.update(1)
+
+        # Restore dataset order; as_completed yields in completion order.
+        results.sort(key=lambda x: x['index'])
+        failed = len(data) - len(results)
+        if failed:
+            logger.warning(f"{failed} of {len(data)} items failed and are excluded from the metrics")
+        metrics = self.metrics_calculator.compute_metrics(results)
+        logger.info("Evaluation completed successfully")
+        return metrics, results
diff --git a/eval_framework/src/llm_client.py b/eval_framework/src/llm_client.py
new file mode 100644
index 0000000..6ce60ef
--- /dev/null
+++ b/eval_framework/src/llm_client.py
@@ -0,0 +1,60 @@
+import logging
+import time
+from typing import Optional
+from openai import OpenAI
+
+logger = logging.getLogger(__name__)
+
+class LLMClient:
+    """Thin client around an OpenAI-compatible chat completion API."""
+
+    def __init__(self, api_key: str, base_url: str, model: str, 
+                 temperature: float = 0, max_retries: int = 10,
+                 max_backoff: float = 60.0):
+        """
+        Create the API client and store the request settings.
+
+        Args:
+            api_key: API key.
+            base_url: Base URL of the API.
+            model: Model name.
+            temperature: Sampling temperature.
+            max_retries: Maximum number of attempts per request.
+            max_backoff: Upper bound, in seconds, for one retry delay.
+        """
+        self.client = OpenAI(api_key=api_key, base_url=base_url)
+        self.model = model
+        self.temperature = temperature
+        self.max_retries = max_retries
+        self.max_backoff = max_backoff
+
+    def get_response(self, user_input: str, system_prompt: str) -> str:
+        """
+        Send one chat completion request, retrying on failure.
+
+        Args:
+            user_input: User message content.
+            system_prompt: System message content.
+
+        Returns:
+            The model's reply, or "error!" once all attempts have failed.
+        """
+        for attempt in range(1, self.max_retries + 1):
+            try:
+                response = self.client.chat.completions.create(
+                    model=self.model,
+                    messages=[
+                        {"role": "system", "content": system_prompt},
+                        {"role": "user", "content": user_input}
+                    ],
+                    temperature=self.temperature
+                )
+                return response.choices[0].message.content
+            except Exception as e:
+                logger.warning(f"API call failed (Attempt {attempt}/{self.max_retries}): {e}")
+                if attempt < self.max_retries:
+                    # Exponential backoff, capped so late retries do not stall for minutes.
+                    time.sleep(min(2 ** attempt, self.max_backoff))
+
+        logger.error(f"Failed to get response after {self.max_retries} attempts")
+        return "error!"
diff --git a/eval_framework/src/metrics.py b/eval_framework/src/metrics.py
new file mode 100644
index 0000000..dcfb93d
--- /dev/null
+++ b/eval_framework/src/metrics.py
@@ -0,0 +1,111 @@
+import re
+import numpy as np
+from typing import List, Dict, Any, Optional
+from sklearn.metrics import precision_score, recall_score, f1_score
+import logging
+
+logger = logging.getLogger(__name__)
+
+class MetricsCalculator:
+    """Evaluation metrics calculator."""
+    
+    @staticmethod
+    def extract_answer(answer_string: str) -> Optional[str]:
+        """
+        Extract the text between [ANSWER] and [/ANSWER] from a response string.
+        
+        Args:
+            answer_string: String that contains the answer.
+            
+        Returns:
+            The extracted, stripped answer, or None if no tag pair is found.
+        """
+        if not answer_string:
+            return None
+            
+        match = re.search(r'\[ANSWER\](.*?)\[/ANSWER\]', answer_string)  # '.' skips newlines: no multi-line match
+        if match:
+            return match.group(1).strip()
+        return None
+
+    @staticmethod
+    def parse_answer(answer: Optional[str]) -> List[str]:
+        """
+        Split a comma-separated answer into a list of stripped items.
+        
+        Args:
+            answer: Answer string, or None.
+            
+        Returns:
+            List of answers; empty when answer is None.
+        """
+        if answer is None:
+            return []
+        return [a.strip() for a in answer.split(',')]
+
+    @classmethod
+    def compute_metrics(cls, data: List[Dict[str, Any]]) -> Dict[str, float]:
+        """
+        Compute the evaluation metrics.
+        
+        Args:
+            data: Items holding "answer", "llm_answer" and "choices"["label"].
+            
+        Returns:
+            Dictionary with accuracy and micro/macro precision, recall and F1.
+        """
+        true_answers = []
+        pred_answers = []
+        
+        # Extract and parse the answers
+        for item in data:
+            true_ans = cls.extract_answer(item["answer"])
+            pred_ans = cls.extract_answer(item["llm_answer"])
+            
+            true_answers.append(cls.parse_answer(true_ans))
+            pred_answers.append(cls.parse_answer(pred_ans))
+        
+        # Compute accuracy: exact set match, an empty side never counts as correct
+        correct_counts = []
+        for true_ans, pred_ans in zip(true_answers, pred_answers):
+            if true_ans and pred_ans and set(true_ans) == set(pred_ans):
+                correct_counts.append(1)
+            else:
+                correct_counts.append(0)
+
+        accuracy = np.mean(correct_counts)
+
+        # Build multi-label vectors over every choice label seen in data
+        all_labels = set()
+        for item in data:
+            choices = item["choices"]["label"]
+            for label in choices:
+                all_labels.add(label)
+        
+        all_labels = sorted(list(all_labels))
+        
+        y_true_multi = []
+        y_pred_multi = []
+        
+        for true_ans, pred_ans in zip(true_answers, pred_answers):
+            true_vector = [1 if label in (true_ans or []) else 0 for label in all_labels]
+            pred_vector = [1 if label in (pred_ans or []) else 0 for label in all_labels]
+            y_true_multi.append(true_vector)
+            y_pred_multi.append(pred_vector)
+        
+        y_true_multi = np.array(y_true_multi)
+        y_pred_multi = np.array(y_pred_multi)
+
+        # Compute the individual metrics
+        metrics = {
+            "accuracy": accuracy,
+            "precision_micro": precision_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "recall_micro": recall_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "f1_micro": f1_score(y_true_multi, y_pred_multi, average='micro', zero_division=0),
+            "precision_macro": precision_score(y_true_multi, y_pred_multi, average='macro', zero_division=0),
+            "recall_macro": recall_score(y_true_multi, y_pred_multi, average='macro', zero_division=0),
+            "f1_macro": f1_score(y_true_multi, y_pred_multi, average='macro', zero_division=0)
+        }
+        
+        logger.info("Metrics computed successfully")
+        return metrics
diff --git a/eval_framework/src/utils.py b/eval_framework/src/utils.py
new file mode 100644
index 0000000..652f4a2
--- /dev/null
+++ b/eval_framework/src/utils.py
@@ -0,0 +1,360 @@
+import json
+import yaml
+import logging
+import pandas as pd
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Any, List
+from tabulate import tabulate
+
+def load_config(config_path: str) -> Dict[str, Any]:
+    """
+    Load the YAML configuration file.
+    
+    Args:
+        config_path: Path to the configuration file.
+        
+    Returns:
+        The parsed configuration (result of yaml.safe_load).
+    """
+    with open(config_path, 'r', encoding='utf-8') as f:
+        config = yaml.safe_load(f)
+    return config
+
+def get_models_from_config(config: Dict[str, Any]) -> List[str]:
+    """
+    Get the list of models from the configuration.
+    
+    Args:
+        config: Configuration dictionary; its 'api' section is read.
+        
+    Returns:
+        List of model names. Raises ValueError when none is configured.
+    """
+    api_config = config['api']
+    
+    # Prefer the 'models' list when it is present and non-empty
+    if 'models' in api_config and api_config['models']:
+        return api_config['models']
+    # Backward compatibility: without 'models', use the single 'model'
+    elif 'model' in api_config:
+        return [api_config['model']]
+    else:
+        raise ValueError("No models specified in configuration")
+
+def generate_output_dir(config: Dict[str, Any]) -> str:
+    """
+    Build the output directory path and create the directory.
+    
+    Args:
+        config: Configuration dictionary; reads config['evaluation']['output'].
+        
+    Returns:
+        Path of the output directory.
+    """
+    output_config = config['evaluation']['output']
+    base_dir = output_config['base_dir']
+    auto_timestamp = output_config.get('auto_timestamp', True)
+    
+    # Base directory
+    base_path = Path(base_dir)
+    
+    if auto_timestamp:
+        # Timestamped subfolder (year, month, day, hour, minute)
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M")
+        output_dir = base_path / timestamp
+    else:
+        output_dir = base_path
+    
+    # Make sure the directory exists
+    output_dir.mkdir(parents=True, exist_ok=True)
+    
+    return str(output_dir)
+
+def generate_model_output_path(output_dir: str, model_name: str, filename_template: str) -> str:
+    """
+    Build the output file path for a specific model.
+    
+    Args:
+        output_dir: Output directory.
+        model_name: Model name.
+        filename_template: File name template; formatted with model=<safe name>.
+        
+    Returns:
+        The full output file path.
+    """
+    # Replace the special characters '/' and ':' in the model name with '_'
+    safe_model_name = model_name.replace('/', '_').replace(':', '_')
+    filename = filename_template.format(model=safe_model_name)
+    return str(Path(output_dir) / filename)
+
+def save_results(results: list, filepath: str) -> None:
+    """
+    Save the results to a JSON file.
+    
+    Args:
+        results: List of results.
+        filepath: Destination path.
+    """
+    # Make sure the directory exists
+    Path(filepath).parent.mkdir(parents=True, exist_ok=True)
+    
+    with open(filepath, 'w', encoding='utf-8') as f:
+        json.dump(results, f, indent=2, ensure_ascii=False)
+
+def save_metrics(metrics: Dict[str, float], filepath: str) -> None:
+    """
+    Save the evaluation metrics to a JSON file.
+    
+    Args:
+        metrics: Metrics dictionary.
+        filepath: Results file path; metrics go to <stem>_metrics.json beside it.
+    """
+    # Build the metrics file path (in the same directory as filepath)
+    metrics_path = Path(filepath).parent / f"{Path(filepath).stem}_metrics.json"
+    
+    # Wrap the metrics together with a timestamp
+    metrics_with_meta = {
+        "timestamp": datetime.now().isoformat(),
+        "metrics": metrics
+    }
+    
+    with open(metrics_path, 'w', encoding='utf-8') as f:
+        json.dump(metrics_with_meta, f, indent=2, ensure_ascii=False)
+
+def create_results_dataframe(all_results: Dict[str, Dict]) -> pd.DataFrame:
+    """
+    Convert the results of all models into a DataFrame.
+    
+    Args:
+        all_results: Maps model name to a dict with "metrics" and "results".
+        
+    Returns:
+        DataFrame indexed by model, one column per metric plus "Data Count".
+    """
+    if not all_results:
+        return pd.DataFrame()
+    
+    # Collect the metric data of every model
+    data = []
+    for model_name, model_result in all_results.items():
+        row = {"Model": model_name}
+        row.update(model_result["metrics"])
+        row["Data Count"] = len(model_result["results"])
+        data.append(row)
+    
+    # Create the DataFrame
+    df = pd.DataFrame(data)
+    
+    # Use the Model column as the index
+    df = df.set_index("Model")
+    
+    # Order the columns (put Data Count last)
+    metric_columns = [col for col in df.columns if col != "Data Count"]
+    df = df[metric_columns + ["Data Count"]]
+    
+    return df
+
+def save_summary(all_results: Dict[str, Dict], output_dir: str, summary_filename: str) -> None:
+    """
+    Save the summary of all models as JSON, CSV and (if possible) Excel.
+    
+    Args:
+        all_results: Maps model name to a dict with "metrics" and "results".
+        output_dir: Output directory.
+        summary_filename: Name of the JSON summary file.
+    """
+    output_path = Path(output_dir)
+    
+    # Create the DataFrame
+    df = create_results_dataframe(all_results)
+    
+    if df.empty:
+        logging.warning("No results to save in summary")
+        return
+    
+    # Save the detailed summary in JSON format
+    summary_path = output_path / summary_filename
+    summary_data = {
+        "timestamp": datetime.now().isoformat(),
+        "models_count": len(all_results),
+        "models": {}
+    }
+    
+    for model_name, model_result in all_results.items():
+        summary_data["models"][model_name] = {
+            "metrics": model_result["metrics"],
+            "data_count": len(model_result["results"])
+        }
+    
+    # Add the model comparison table
+    if len(all_results) > 1:
+        comparison = {}
+        metric_names = [col for col in df.columns if col != "Data Count"]
+        
+        for metric in metric_names:
+            comparison[metric] = df[metric].to_dict()
+        
+        summary_data["comparison"] = comparison
+    
+    with open(summary_path, 'w', encoding='utf-8') as f:
+        json.dump(summary_data, f, indent=2, ensure_ascii=False)
+    
+    # Save the summary table in CSV format
+    csv_filename = summary_filename.replace('.json', '.csv')  # NOTE(review): no '.json' in the name -> same path as the JSON
+    csv_path = output_path / csv_filename
+    
+    # Reset the index so the model name is saved as a column too
+    df_for_csv = df.reset_index()
+    df_for_csv.to_csv(csv_path, index=False, encoding='utf-8')
+    
+    # Save in Excel format (if needed)
+    excel_filename = summary_filename.replace('.json', '.xlsx')
+    excel_path = output_path / excel_filename
+    
+    try:
+        # Create the Excel file, containing several worksheets
+        with pd.ExcelWriter(excel_path, engine='openpyxl') as writer:
+            # Main results sheet
+            df_for_csv.to_excel(writer, sheet_name='Summary', index=False)
+            
+            # With more than one model, create the rankings sheet
+            if len(all_results) > 1:
+                ranking_df = create_ranking_dataframe(df)
+                ranking_df.to_excel(writer, sheet_name='Rankings', index=False)
+    
+    except ImportError:
+        logging.warning("openpyxl not installed, skipping Excel export")
+    
+    logging.info(f"Summary saved to {summary_path}")
+    logging.info(f"CSV summary saved to {csv_path}")
+
+def create_ranking_dataframe(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Create the model ranking DataFrame.
+    
+    Args:
+        df: Original results DataFrame, indexed by model name.
+        
+    Returns:
+        Long-format DataFrame with columns Model, Metric, Value and Rank.
+    """
+    # Exclude the non-metric column
+    metric_columns = [col for col in df.columns if col != "Data Count"]
+    
+    # Rank every metric (assumes larger is better; adjust as needed)
+    ranking_data = []
+    
+    for metric in metric_columns:
+        # Descending rank (the larger the value, the better the rank)
+        ranks = df[metric].rank(method='min', ascending=False)
+        
+        for model_name in df.index:
+            ranking_data.append({
+                'Model': model_name,
+                'Metric': metric,
+                'Value': df.loc[model_name, metric],
+                'Rank': int(ranks[model_name])
+            })
+    
+    ranking_df = pd.DataFrame(ranking_data)
+    return ranking_df
+
+def print_summary(all_results: Dict[str, Dict]) -> None:
+    """
+    Print the summary of all models.
+    
+    Args:
+        all_results: Maps model name to a dict with "metrics" and "results".
+    """
+    print("\n" + "="*100)
+    print("SUMMARY - ALL MODELS COMPARISON")
+    print("="*100)
+    
+    if not all_results:
+        print("No results to display")
+        return
+    
+    # Create the DataFrame
+    df = create_results_dataframe(all_results)
+    
+    if df.empty:
+        print("No valid results to display")
+        return
+    
+    # Use tabulate to print a nicely formatted table
+    print(tabulate(
+        df, 
+        headers=df.columns, 
+        tablefmt='grid',
+        floatfmt='.4f',
+        showindex=True
+    ))
+    
+    # With more than one model, show the best model per metric
+    if len(all_results) > 1:
+        print("\n" + "-"*100)
+        print("BEST PERFORMERS BY METRIC:")
+        print("-"*100)
+        
+        metric_columns = [col for col in df.columns if col != "Data Count"]
+        
+        for metric in metric_columns:
+            best_model = df[metric].idxmax()
+            best_value = df.loc[best_model, metric]
+            print(f"{metric.upper():<20}: {best_model:<30} ({best_value:.4f})")
+    
+    print("="*100)
+
+def setup_logging(level: str = "INFO", format_str: str = None, log_dir: str = "logs") -> None:
+    """
+    Configure logging to the console and to a timestamped log file.
+    
+    Args:
+        level: Log level name.
+        format_str: Log format; a default format is used when None.
+        log_dir: Log directory.
+    """
+    if format_str is None:
+        format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+    
+    # Create the log directory
+    Path(log_dir).mkdir(parents=True, exist_ok=True)
+    
+    # Build the log file name (includes a timestamp)
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M")
+    log_file = Path(log_dir) / f"evaluation_{timestamp}.log"
+    
+    logging.basicConfig(
+        level=getattr(logging, level.upper()),
+        format=format_str,
+        handlers=[
+            logging.StreamHandler(),
+            logging.FileHandler(log_file, encoding='utf-8')
+        ]
+    )
+
+def print_metrics(metrics: Dict[str, float], model_name: str = None) -> None:
+    """
+    Print the evaluation metrics.
+    
+    Args:
+        metrics: Metrics dictionary.
+        model_name: Model name; added to the title when given.
+    """
+    title = f"EVALUATION RESULTS - {model_name}" if model_name else "EVALUATION RESULTS"
+    print("\n" + "="*60)
+    print(title)
+    print("="*60)
+    
+    # Build a single-row DataFrame for a nicely formatted display
+    df = pd.DataFrame([metrics])
+    print(tabulate(
+        df, 
+        headers=df.columns, 
+        tablefmt='grid',
+        floatfmt='.4f',
+        showindex=False
+    ))
+    
+    print("="*60)
diff --git a/layer1/ALL-merge/eval.py b/layer1/ALL-merge/eval.py
index f914e59..e69de29 100644
--- a/layer1/ALL-merge/eval.py
+++ b/layer1/ALL-merge/eval.py
@@ -1,166 +0,0 @@
-import json
-import threading
-from tqdm import tqdm
-import concurrent.futures
-from openai import OpenAI
-import numpy as np
-from sklearn.metrics import precision_score, recall_score, f1_score
-import re
-
-client = OpenAI(
-    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
-    base_url="https://vip.apiyi.com/v1"
-)
-
-thread_lock = threading.Lock()
-
-def load_json_data(filepath):
-    with open(filepath, 'r') as file:
-        data = json.load(file)
-    return data
-
-def get_response(input,max_retries=10):
-    retries = 0
-    while retries<max_retries:
-        try:
-            response = client.chat.completions.create(
-                #
-                model="qwen-max-2025-01-25",
-                messages= [
-                    {"role": "system", "content": "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications."},
-                    {"role": "user", "content": input}
-                ],
-                temperature=0
-            )
-            answer = response.choices[0].message.content
-            return answer
-        except Exception as e:
-            print(f"Error in getting LLM response (Attempt {retries + 1}/{max_retries}): {e}")
-            retries += 1
-    
-    print(f"Failed to get response after {max_retries} attempts, returning None.")
-    return "error!"
-
-def process_item(item, index):
-    question = item['question']
-    text = item['choices']['text']
-    label = item['choices']['label']
-    prompt = item['prompt']
-    expected_answer = item['answer'].strip()
-
-    formatted_choices = " ".join([f"({label}) {text}" for label, text in zip(label, text)])
-    input = f"{question} {formatted_choices}. {prompt}"
-    
-    llm_answer = get_response(input)
-
-    return {
-        'index': index,
-        'question': question,
-        'choices': item['choices'],
-        'answer': expected_answer,
-        'llm_answer': llm_answer
-    }
-
-def extract_answer(answer_string):
-    match = re.search(r'\[ANSWER\](.*?)\[/ANSWER\]', answer_string)
-    if match:
-        return match.group(1).strip()
-    return None
-
-
-def parse_answer(answer):
-    if answer is None:
-        return []
-    return [a.strip() for a in answer.split(',')]
-
-def compute_metrics(data):
-
-    true_answers = []
-    pred_answers = []
-    
-    for item in data:
-        true_ans = extract_answer(item["answer"])
-        pred_ans = extract_answer(item["llm_answer"])
-        
-        true_answers.append(parse_answer(true_ans))
-        pred_answers.append(parse_answer(pred_ans))
-    
-    correct_counts = []
-    for true_ans, pred_ans in zip(true_answers, pred_answers):
-        if true_ans and pred_ans and set(true_ans) == set(pred_ans):
-            correct_counts.append(1)
-        else:
-            correct_counts.append(0)
-
-    accuracy = np.mean(correct_counts)
-
-    y_true_multi = []
-    y_pred_multi = []
-    all_labels = set()
-    
-    for item in data:
-        choices = item["choices"]["label"]
-        for label in choices:
-            all_labels.add(label)
-    
-    all_labels = sorted(list(all_labels))
-    
-    for true_ans, pred_ans in zip(true_answers, pred_answers):
-        true_vector = [1 if label in true_ans else 0 for label in all_labels]
-        pred_vector = [1 if label in pred_ans else 0 for label in all_labels]
-        y_true_multi.append(true_vector)
-        y_pred_multi.append(pred_vector)
-    
-    y_true_multi = np.array(y_true_multi)
-    y_pred_multi = np.array(y_pred_multi)
-
-    precision_micro = precision_score(y_true_multi, y_pred_multi, average='micro', zero_division=0)
-    recall_micro = recall_score(y_true_multi, y_pred_multi, average='micro', zero_division=0)
-    f1_micro = f1_score(y_true_multi, y_pred_multi, average='micro', zero_division=0)
-    
-    precision_macro = precision_score(y_true_multi, y_pred_multi, average='macro', zero_division=0)
-    recall_macro = recall_score(y_true_multi, y_pred_multi, average='macro', zero_division=0)
-    f1_macro = f1_score(y_true_multi, y_pred_multi, average='macro', zero_division=0)
-    
-    return {
-        "accuracy": accuracy,
-        "precision_micro": precision_micro,
-        "recall_micro": recall_micro,
-        "f1_micro": f1_micro,
-        "precision_macro": precision_macro,
-        "recall_macro": recall_macro,
-        "f1_macro": f1_macro
-    }
-
-def calculate_accuracy_multithreaded(data, max_workers=5):
-    results = []
-
-    with tqdm(total=len(data), desc="Processing items") as pbar:
-        with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-
-            future_to_index = {executor.submit(process_item, item, i): i for i, item in enumerate(data)}
-
-            for future in concurrent.futures.as_completed(future_to_index):
-                result = future.result()
-                results.append(result)
-                pbar.update(1)
-
-    results.sort(key=lambda x: x['index'])
-
-    metric = compute_metrics(results)
-
-    return metric, results
-
-def main():
-    filepath = '/home/ubuntu/50T/fsy/benchmark-dataset-third/ALL-merge/merged.json'
-    data = load_json_data(filepath)
-    max_workers = 8
-
-    metric, results = calculate_accuracy_multithreaded(data,max_workers)
-    print(f"Accuracy of qwen-max-2025-01-25: {metric}")
-
-    with open('qwen-max-2025-01-25.json', 'w') as f:
-        json.dump(results, f, indent=2)
-
-if __name__ == "__main__":
-    main()
diff --git a/logs/evaluation_20250528_1530.log b/logs/evaluation_20250528_1530.log
new file mode 100644
index 0000000..33b6c49
--- /dev/null
+++ b/logs/evaluation_20250528_1530.log
@@ -0,0 +1,40 @@
+2025-05-28 15:30:36,536 - __main__ - INFO - Starting multi-model evaluation framework
+2025-05-28 15:30:36,536 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
+2025-05-28 15:30:36,543 - __main__ - INFO - Output directory: results/20250528_1530
+2025-05-28 15:30:36,543 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:30:36,568 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:30:36,569 - src.data_loader - INFO - Validated 3023 out of 3023 items
+2025-05-28 15:30:36,569 - __main__ - INFO - Loaded 3023 valid data items
+2025-05-28 15:30:36,569 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
+2025-05-28 15:30:36,569 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
+2025-05-28 15:30:36,595 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:30:38,447 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,461 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,485 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,499 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,549 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,613 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:38,630 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:39,998 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:40,267 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:40,287 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:30:40,288 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:30:40,302 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1530/qwen-max-2025-01-25.json
+2025-05-28 15:30:40,302 - __main__ - INFO - Evaluating model 2/2: gpt-4o
+2025-05-28 15:30:40,302 - __main__ - INFO - Starting evaluation for model: gpt-4o
+2025-05-28 15:30:40,352 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:30:41,778 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:41,794 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:41,826 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,016 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,026 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,040 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,041 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,076 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,295 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,313 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:30:42,323 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:30:42,323 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:30:42,333 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1530/gpt-4o.json
+2025-05-28 15:30:42,333 - __main__ - ERROR - Evaluation failed: 'summary_filename'
diff --git a/logs/evaluation_20250528_1531.log b/logs/evaluation_20250528_1531.log
new file mode 100644
index 0000000..2e34b71
--- /dev/null
+++ b/logs/evaluation_20250528_1531.log
@@ -0,0 +1,41 @@
+2025-05-28 15:31:25,896 - __main__ - INFO - Starting multi-model evaluation framework
+2025-05-28 15:31:25,896 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
+2025-05-28 15:31:25,899 - __main__ - INFO - Output directory: results/20250528_1531
+2025-05-28 15:31:25,899 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:31:25,925 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:31:25,927 - src.data_loader - INFO - Validated 3023 out of 3023 items
+2025-05-28 15:31:25,927 - __main__ - INFO - Loaded 3023 valid data items
+2025-05-28 15:31:25,927 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
+2025-05-28 15:31:25,927 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
+2025-05-28 15:31:25,952 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:31:28,342 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,434 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,444 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,459 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,538 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:28,703 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:30,085 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:30,353 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:30,374 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:31:30,374 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:31:30,387 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1531/qwen-max-2025-01-25.json
+2025-05-28 15:31:30,387 - __main__ - INFO - Evaluating model 2/2: gpt-4o
+2025-05-28 15:31:30,387 - __main__ - INFO - Starting evaluation for model: gpt-4o
+2025-05-28 15:31:30,436 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:31:31,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:31,886 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,119 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,139 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,140 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,162 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,449 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:32,539 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:38,330 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:31:38,351 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:31:38,351 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:31:38,366 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1531/gpt-4o.json
+2025-05-28 15:31:38,372 - __main__ - INFO - Summary saved to results/20250528_1531/summary.json
+2025-05-28 15:31:38,372 - __main__ - INFO - Multi-model evaluation completed successfully
diff --git a/logs/evaluation_20250528_1535.log b/logs/evaluation_20250528_1535.log
new file mode 100644
index 0000000..54d651d
--- /dev/null
+++ b/logs/evaluation_20250528_1535.log
@@ -0,0 +1,44 @@
+2025-05-28 15:35:59,778 - __main__ - INFO - Starting multi-model evaluation framework
+2025-05-28 15:35:59,779 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
+2025-05-28 15:35:59,782 - __main__ - INFO - Output directory: results/20250528_1535
+2025-05-28 15:35:59,782 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:35:59,808 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
+2025-05-28 15:35:59,809 - src.data_loader - INFO - Validated 3023 out of 3023 items
+2025-05-28 15:35:59,809 - __main__ - INFO - Loaded 3023 valid data items
+2025-05-28 15:35:59,809 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
+2025-05-28 15:35:59,809 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
+2025-05-28 15:35:59,835 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:36:01,694 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,780 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,809 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,853 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,876 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:01,910 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:02,847 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:02,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:03,432 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:03,454 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:36:03,454 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:36:03,477 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1535/qwen-max-2025-01-25.json
+2025-05-28 15:36:03,480 - __main__ - INFO - Evaluating model 2/2: gpt-4o
+2025-05-28 15:36:03,481 - __main__ - INFO - Starting evaluation for model: gpt-4o
+2025-05-28 15:36:03,534 - src.evaluator - INFO - Starting evaluation with 8 workers
+2025-05-28 15:36:04,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,895 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,930 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:05,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:05,495 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
+2025-05-28 15:36:05,514 - src.metrics - INFO - Metrics computed successfully
+2025-05-28 15:36:05,515 - src.evaluator - INFO - Evaluation completed successfully
+2025-05-28 15:36:05,532 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1535/gpt-4o.json
+2025-05-28 15:36:05,564 - root - WARNING - openpyxl not installed, skipping Excel export
+2025-05-28 15:36:05,564 - root - INFO - Summary saved to results/20250528_1535/summary.json
+2025-05-28 15:36:05,564 - root - INFO - CSV summary saved to results/20250528_1535/summary.csv
+2025-05-28 15:36:05,568 - __main__ - INFO - Summary saved to results/20250528_1535/summary.json
+2025-05-28 15:36:05,568 - __main__ - INFO - Multi-model evaluation completed successfully
diff --git a/results/20250528_1530/gpt-4o.json b/results/20250528_1530/gpt-4o.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1530/gpt-4o.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1530/gpt-4o_metrics.json b/results/20250528_1530/gpt-4o_metrics.json
new file mode 100644
index 0000000..8d6d171
--- /dev/null
+++ b/results/20250528_1530/gpt-4o_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:30:42.329641",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1530/qwen-max-2025-01-25.json b/results/20250528_1530/qwen-max-2025-01-25.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1530/qwen-max-2025-01-25.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1530/qwen-max-2025-01-25_metrics.json b/results/20250528_1530/qwen-max-2025-01-25_metrics.json
new file mode 100644
index 0000000..00a04d8
--- /dev/null
+++ b/results/20250528_1530/qwen-max-2025-01-25_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:30:40.296801",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1531/gpt-4o.json b/results/20250528_1531/gpt-4o.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1531/gpt-4o.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1531/gpt-4o_metrics.json b/results/20250528_1531/gpt-4o_metrics.json
new file mode 100644
index 0000000..2d9eadb
--- /dev/null
+++ b/results/20250528_1531/gpt-4o_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:31:38.361064",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1531/qwen-max-2025-01-25.json b/results/20250528_1531/qwen-max-2025-01-25.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1531/qwen-max-2025-01-25.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1531/qwen-max-2025-01-25_metrics.json b/results/20250528_1531/qwen-max-2025-01-25_metrics.json
new file mode 100644
index 0000000..cc49ec9
--- /dev/null
+++ b/results/20250528_1531/qwen-max-2025-01-25_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:31:30.382105",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1531/summary.json b/results/20250528_1531/summary.json
new file mode 100644
index 0000000..c40d636
--- /dev/null
+++ b/results/20250528_1531/summary.json
@@ -0,0 +1,60 @@
+{
+  "timestamp": "2025-05-28T15:31:38.366535",
+  "models_count": 2,
+  "models": {
+    "qwen-max-2025-01-25": {
+      "metrics": {
+        "accuracy": 1.0,
+        "precision_micro": 1.0,
+        "recall_micro": 1.0,
+        "f1_micro": 1.0,
+        "precision_macro": 1.0,
+        "recall_macro": 1.0,
+        "f1_macro": 1.0
+      },
+      "data_count": 10
+    },
+    "gpt-4o": {
+      "metrics": {
+        "accuracy": 1.0,
+        "precision_micro": 1.0,
+        "recall_micro": 1.0,
+        "f1_micro": 1.0,
+        "precision_macro": 1.0,
+        "recall_macro": 1.0,
+        "f1_macro": 1.0
+      },
+      "data_count": 10
+    }
+  },
+  "comparison": {
+    "accuracy": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "precision_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "recall_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "f1_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "precision_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "recall_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "f1_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    }
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1535/gpt-4o.json b/results/20250528_1535/gpt-4o.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1535/gpt-4o.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1535/gpt-4o_metrics.json b/results/20250528_1535/gpt-4o_metrics.json
new file mode 100644
index 0000000..c21653f
--- /dev/null
+++ b/results/20250528_1535/gpt-4o_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:36:05.524328",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1535/qwen-max-2025-01-25.json b/results/20250528_1535/qwen-max-2025-01-25.json
new file mode 100644
index 0000000..b6d5d9a
--- /dev/null
+++ b/results/20250528_1535/qwen-max-2025-01-25.json
@@ -0,0 +1,202 @@
+[
+  {
+    "index": 0,
+    "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
+    "choices": {
+      "text": [
+        "the atom",
+        "the electron",
+        "the nucleus",
+        "the proton"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 1,
+    "question": "Which statement correctly describes a property of a type of matter?",
+    "choices": {
+      "text": [
+        "Air is a mixture of gases.",
+        "Ice is a mixture of gases.",
+        "Air is a liquid.",
+        "Ice is a liquid."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 2,
+    "question": "Which statement best explains why a tree branch floats on water?",
+    "choices": {
+      "text": [
+        "Wood is porous.",
+        "Wood is buoyant.",
+        "Wood is light.",
+        "Wood is magnetic."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 3,
+    "question": "The best way to separate salt from water is with the use of",
+    "choices": {
+      "text": [
+        "oil.",
+        "heat.",
+        "a magnet.",
+        "rubbing alcohol."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 4,
+    "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
+    "choices": {
+      "text": [
+        "the frequency of the wave",
+        "the wavelength of the wave",
+        "the source that created the sound",
+        "the distance between molecules in the medium"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]D[/ANSWER]",
+    "llm_answer": "[ANSWER]D[/ANSWER]"
+  },
+  {
+    "index": 5,
+    "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
+    "choices": {
+      "text": [
+        "W is the softest of the four substances tested.",
+        "W is the hardest of the four substances tested.",
+        "W can scratch Y.",
+        "W can scratch X."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]A[/ANSWER]",
+    "llm_answer": "[ANSWER]A[/ANSWER]"
+  },
+  {
+    "index": 6,
+    "question": "When the temperature of a sample of 25 water is -5°C, the water is",
+    "choices": {
+      "text": [
+        "a gas.",
+        "a liquid.",
+        "a solid.",
+        "a vapor."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 7,
+    "question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
+    "choices": {
+      "text": [
+        "a large funnel",
+        "a screen filter",
+        "a horseshoe magnet",
+        "a magnifying glass"
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]C[/ANSWER]",
+    "llm_answer": "[ANSWER]C[/ANSWER]"
+  },
+  {
+    "index": 8,
+    "question": "How are sedimentary rocks made?",
+    "choices": {
+      "text": [
+        "Magma or lava is cooled.",
+        "Materials are pressed together.",
+        "Chemical reactions change minerals.",
+        "Earthquakes cause small pieces to fall."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  },
+  {
+    "index": 9,
+    "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
+    "choices": {
+      "text": [
+        "The ball makes light.",
+        "The ball reflects light.",
+        "The ball absorbs light and then releases it.",
+        "The ball absorbs light and keeps it inside."
+      ],
+      "label": [
+        "A",
+        "B",
+        "C",
+        "D"
+      ]
+    },
+    "answer": "[ANSWER]B[/ANSWER]",
+    "llm_answer": "[ANSWER]B[/ANSWER]"
+  }
+]
\ No newline at end of file
diff --git a/results/20250528_1535/qwen-max-2025-01-25_metrics.json b/results/20250528_1535/qwen-max-2025-01-25_metrics.json
new file mode 100644
index 0000000..f706817
--- /dev/null
+++ b/results/20250528_1535/qwen-max-2025-01-25_metrics.json
@@ -0,0 +1,12 @@
+{
+  "timestamp": "2025-05-28T15:36:03.466534",
+  "metrics": {
+    "accuracy": 1.0,
+    "precision_micro": 1.0,
+    "recall_micro": 1.0,
+    "f1_micro": 1.0,
+    "precision_macro": 1.0,
+    "recall_macro": 1.0,
+    "f1_macro": 1.0
+  }
+}
\ No newline at end of file
diff --git a/results/20250528_1535/summary.csv b/results/20250528_1535/summary.csv
new file mode 100644
index 0000000..661edb4
--- /dev/null
+++ b/results/20250528_1535/summary.csv
@@ -0,0 +1,3 @@
+Model,accuracy,precision_micro,recall_micro,f1_micro,precision_macro,recall_macro,f1_macro,Data Count
+qwen-max-2025-01-25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10
+gpt-4o,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10
diff --git a/results/20250528_1535/summary.json b/results/20250528_1535/summary.json
new file mode 100644
index 0000000..5520f4a
--- /dev/null
+++ b/results/20250528_1535/summary.json
@@ -0,0 +1,60 @@
+{
+  "timestamp": "2025-05-28T15:36:05.540751",
+  "models_count": 2,
+  "models": {
+    "qwen-max-2025-01-25": {
+      "metrics": {
+        "accuracy": 1.0,
+        "precision_micro": 1.0,
+        "recall_micro": 1.0,
+        "f1_micro": 1.0,
+        "precision_macro": 1.0,
+        "recall_macro": 1.0,
+        "f1_macro": 1.0
+      },
+      "data_count": 10
+    },
+    "gpt-4o": {
+      "metrics": {
+        "accuracy": 1.0,
+        "precision_micro": 1.0,
+        "recall_micro": 1.0,
+        "f1_micro": 1.0,
+        "precision_macro": 1.0,
+        "recall_macro": 1.0,
+        "f1_macro": 1.0
+      },
+      "data_count": 10
+    }
+  },
+  "comparison": {
+    "accuracy": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "precision_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "recall_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "f1_micro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "precision_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "recall_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    },
+    "f1_macro": {
+      "qwen-max-2025-01-25": 1.0,
+      "gpt-4o": 1.0
+    }
+  }
+}
\ No newline at end of file