From 9abd8fc1c50391e3111feae3ae8b8d560facd140 Mon Sep 17 00:00:00 2001 From: lzy <949777411@qq.com> Date: Wed, 28 May 2025 15:43:50 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84eval=E4=BB=A3=E7=A0=81?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- eval_framework/config/config.yaml | 36 ++ eval_framework/main.py | 164 ++++++++ eval_framework/src/__init__.py | 26 ++ .../src/__pycache__/__init__.cpython-311.pyc | Bin 0 -> 741 bytes .../src/__pycache__/__init__.cpython-312.pyc | Bin 0 -> 736 bytes .../__pycache__/data_loader.cpython-311.pyc | Bin 0 -> 4253 bytes .../__pycache__/data_loader.cpython-312.pyc | Bin 0 -> 3708 bytes .../src/__pycache__/evaluator.cpython-311.pyc | Bin 0 -> 5717 bytes .../src/__pycache__/evaluator.cpython-312.pyc | Bin 0 -> 4719 bytes .../__pycache__/llm_client.cpython-311.pyc | Bin 0 -> 3002 bytes .../__pycache__/llm_client.cpython-312.pyc | Bin 0 -> 2900 bytes .../src/__pycache__/metrics.cpython-311.pyc | Bin 0 -> 5731 bytes .../src/__pycache__/metrics.cpython-312.pyc | Bin 0 -> 4623 bytes .../src/__pycache__/utils.cpython-311.pyc | Bin 0 -> 6339 bytes .../src/__pycache__/utils.cpython-312.pyc | Bin 0 -> 13397 bytes eval_framework/src/data_loader.py | 81 ++++ eval_framework/src/evaluator.py | 98 +++++ eval_framework/src/llm_client.py | 60 +++ eval_framework/src/metrics.py | 111 ++++++ eval_framework/src/utils.py | 360 ++++++++++++++++++ layer1/ALL-merge/eval.py | 166 -------- logs/evaluation_20250528_1530.log | 40 ++ logs/evaluation_20250528_1531.log | 41 ++ logs/evaluation_20250528_1535.log | 44 +++ results/20250528_1530/gpt-4o.json | 202 ++++++++++ results/20250528_1530/gpt-4o_metrics.json | 12 + .../20250528_1530/qwen-max-2025-01-25.json | 202 ++++++++++ .../qwen-max-2025-01-25_metrics.json | 12 + results/20250528_1531/gpt-4o.json | 202 ++++++++++ results/20250528_1531/gpt-4o_metrics.json | 12 + .../20250528_1531/qwen-max-2025-01-25.json | 202 ++++++++++ .../qwen-max-2025-01-25_metrics.json | 12 + results/20250528_1531/summary.json | 60 +++ results/20250528_1535/gpt-4o.json | 202 ++++++++++ results/20250528_1535/gpt-4o_metrics.json | 12 + .../20250528_1535/qwen-max-2025-01-25.json | 202 ++++++++++ .../qwen-max-2025-01-25_metrics.json | 12 + results/20250528_1535/summary.csv | 3 + results/20250528_1535/summary.json | 60 +++ 39 files changed, 2468 insertions(+), 166 deletions(-) create mode 100644 eval_framework/config/config.yaml create mode 100644 eval_framework/main.py create mode 100644 eval_framework/src/__init__.py create mode 100644 eval_framework/src/__pycache__/__init__.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/__init__.cpython-312.pyc create mode 100644 eval_framework/src/__pycache__/data_loader.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/data_loader.cpython-312.pyc create mode 100644 eval_framework/src/__pycache__/evaluator.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/evaluator.cpython-312.pyc create mode 100644 eval_framework/src/__pycache__/llm_client.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/llm_client.cpython-312.pyc create mode 100644 eval_framework/src/__pycache__/metrics.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/metrics.cpython-312.pyc create mode 100644 eval_framework/src/__pycache__/utils.cpython-311.pyc create mode 100644 eval_framework/src/__pycache__/utils.cpython-312.pyc create mode 100644 eval_framework/src/data_loader.py create mode 100644 eval_framework/src/evaluator.py create mode 100644 eval_framework/src/llm_client.py create mode 100644 eval_framework/src/metrics.py create mode 100644 eval_framework/src/utils.py create mode 100644 logs/evaluation_20250528_1530.log create mode 100644 logs/evaluation_20250528_1531.log create mode 100644 logs/evaluation_20250528_1535.log create mode 100644 results/20250528_1530/gpt-4o.json create mode 100644 results/20250528_1530/gpt-4o_metrics.json create mode 100644 results/20250528_1530/qwen-max-2025-01-25.json create mode 100644 results/20250528_1530/qwen-max-2025-01-25_metrics.json create mode 100644 results/20250528_1531/gpt-4o.json create mode 100644 results/20250528_1531/gpt-4o_metrics.json create mode 100644 results/20250528_1531/qwen-max-2025-01-25.json create mode 100644 results/20250528_1531/qwen-max-2025-01-25_metrics.json create mode 100644 results/20250528_1531/summary.json create mode 100644 results/20250528_1535/gpt-4o.json create mode 100644 results/20250528_1535/gpt-4o_metrics.json create mode 100644 results/20250528_1535/qwen-max-2025-01-25.json create mode 100644 results/20250528_1535/qwen-max-2025-01-25_metrics.json create mode 100644 results/20250528_1535/summary.csv create mode 100644 results/20250528_1535/summary.json diff --git a/eval_framework/config/config.yaml b/eval_framework/config/config.yaml new file mode 100644 index 0000000..8ef7f91 --- /dev/null +++ b/eval_framework/config/config.yaml @@ -0,0 +1,36 @@ +# API配置 +api: + key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d" + base_url: "https://vip.apiyi.com/v1" + temperature: 0 + max_retries: 10 + # 支持多个模型 + models: + - "qwen-max-2025-01-25" + - "gpt-4o" + # 或者使用单个模型(向后兼容) + # model: "qwen-max-2025-01-25" + +# 系统提示词 +system_prompt: "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications." + +# 评估配置 +evaluation: + max_workers: 8 + input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json" + # 输出配置 + output: + base_dir: "results" + auto_timestamp: true + filename_template: "{model}.json" + summary_filename: "summary.json" + # 输出格式选项 + export_formats: + - "json" # 详细JSON结果 + - "csv" # CSV表格 + - "excel" # Excel表格(需要openpyxl) + +# 日志配置 +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" diff --git a/eval_framework/main.py b/eval_framework/main.py new file mode 100644 index 0000000..6c7bf2b --- /dev/null +++ b/eval_framework/main.py @@ -0,0 +1,164 @@ +import argparse +import logging +from pathlib import Path +from typing import Dict, Any + +from src import ( + DataLoader, LLMClient, Evaluator, + load_config, save_results, save_metrics, save_summary, + setup_logging, print_metrics, print_summary, + get_models_from_config, generate_output_dir, generate_model_output_path +) + +logger = logging.getLogger(__name__) + +def evaluate_single_model( + model_name: str, + data: list, + config: Dict[str, Any], + output_dir: str +) -> Dict[str, Any]: + """ + 评估单个模型 + + Args: + model_name: 模型名称 + data: 评估数据 + config: 配置字典 + output_dir: 输出目录 + + Returns: + 包含指标和结果的字典 + """ + logger.info(f"Starting evaluation for model: {model_name}") + + # 初始化LLM客户端 + llm_client = LLMClient( + api_key=config['api']['key'], + base_url=config['api']['base_url'], + model=model_name, + temperature=config['api']['temperature'], + max_retries=config['api']['max_retries'] + ) + + # 初始化评估器 + evaluator = Evaluator( + llm_client=llm_client, + system_prompt=config['system_prompt'] + ) + + # 执行评估 + max_workers = config['evaluation']['max_workers'] + metrics, results = evaluator.evaluate(data, max_workers=max_workers) + + # 生成输出文件路径 + filename_template = config['evaluation']['output']['filename_template'] + output_file = generate_model_output_path(output_dir, model_name, filename_template) + + # 保存结果和指标 + save_results(results, output_file) + save_metrics(metrics, output_file) + + logger.info(f"Model {model_name} evaluation completed. Results saved to {output_file}") + + return { + "metrics": metrics, + "results": results, + "output_file": output_file + } + +def main(): + parser = argparse.ArgumentParser(description="材料科学LLM评估框架") + parser.add_argument("--config", default="eval_framework/config/config.yaml", help="配置文件路径") + parser.add_argument("--input", help="输入数据文件路径(覆盖配置文件)") + parser.add_argument("--output-dir", help="输出目录路径(覆盖配置文件)") + parser.add_argument("--workers", type=int, help="工作线程数(覆盖配置文件)") + parser.add_argument("--models", nargs="+", help="指定要评估的模型列表(覆盖配置文件)") + parser.add_argument("--no-timestamp", action="store_true", help="不使用时间戳文件夹") + + args = parser.parse_args() + + # 加载配置 + config = load_config(args.config) + + # 如果指定了不使用时间戳,修改配置 + if args.no_timestamp: + config['evaluation']['output']['auto_timestamp'] = False + + # 设置日志 + setup_logging( + level=config.get('logging', {}).get('level', 'INFO'), + format_str=config.get('logging', {}).get('format') + ) + + logger.info("Starting multi-model evaluation framework") + + # 处理输入路径和工作线程数 + input_file = args.input or config['evaluation']['input_file'] + if args.workers: + config['evaluation']['max_workers'] = args.workers + + # 获取模型列表 + if args.models: + models = args.models + logger.info(f"Using models from command line: {models}") + else: + models = get_models_from_config(config) + logger.info(f"Using models from config: {models}") + + # 生成输出目录 + if args.output_dir: + output_dir = args.output_dir + Path(output_dir).mkdir(parents=True, exist_ok=True) + else: + output_dir = generate_output_dir(config) + + logger.info(f"Output directory: {output_dir}") + + try: + # 加载数据 + logger.info(f"Loading data from {input_file}") + data = DataLoader.load_and_validate_data(input_file) + + if not data: + logger.error("No valid data found") + return + + logger.info(f"Loaded {len(data)} valid data items") + + # 存储所有模型的结果 + all_results = {} + + # 逐个评估模型 + for i, model_name in enumerate(models, 1): + logger.info(f"Evaluating model {i}/{len(models)}: {model_name}") + + try: + model_result = evaluate_single_model(model_name, data[:10], config, output_dir) + all_results[model_name] = model_result + + # 打印当前模型的结果 + print_metrics(model_result["metrics"], model_name) + + except Exception as e: + logger.error(f"Failed to evaluate model {model_name}: {e}") + continue + + # 保存汇总结果 + if all_results: + summary_filename = config['evaluation']['output']['summary_filename'] + save_summary(all_results, output_dir, summary_filename) + + # 打印汇总对比 + print_summary(all_results) + + logger.info(f"Summary saved to {Path(output_dir) / summary_filename}") + + logger.info("Multi-model evaluation completed successfully") + + except Exception as e: + logger.error(f"Evaluation failed: {e}") + raise + +if __name__ == "__main__": + main() diff --git a/eval_framework/src/__init__.py b/eval_framework/src/__init__.py new file mode 100644 index 0000000..d534827 --- /dev/null +++ b/eval_framework/src/__init__.py @@ -0,0 +1,26 @@ +from .data_loader import DataLoader +from .llm_client import LLMClient +from .evaluator import Evaluator +from .metrics import MetricsCalculator +from .utils import ( + load_config, save_results, save_metrics, save_summary, + setup_logging, print_metrics, print_summary, + get_models_from_config, generate_output_dir, generate_model_output_path +) + +__all__ = [ + 'DataLoader', + 'LLMClient', + 'Evaluator', + 'MetricsCalculator', + 'load_config', + 'save_results', + 'save_metrics', + 'save_summary', + 'setup_logging', + 'print_metrics', + 'print_summary', + 'get_models_from_config', + 'generate_output_dir', + 'generate_model_output_path' +] diff --git a/eval_framework/src/__pycache__/__init__.cpython-311.pyc b/eval_framework/src/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..430cb87e831b351d040f4800820f084f95f89f82 GIT binary patch literal 741 zcmb`EF>ll`6vyr4a=9dzQlyF*gjDI4h6SdIfx>pjLKi$>nZ(USJSUN37qRgzx_3cc z8Tk}9p1d+3wv)Pb!ZvrqEuHE-{q%pf{QmF7Kk__9c4fa_E&pJIe!FD1f~~myWW^P# zP=x^rOfYsjfIx&K^g4u*h)Cr12*x5NvDXnyL_!jgl2l|Q6H_u3ImyKVIl!n!)%Y5d zM^*d*%_e`XJDOpWzGaHdJ63UNvt&MBoI%4|<&t-o3^Y@nJ9)g|N;c(i#-P;TH^(Le zdy$r1d(qTpIwAR@s59lbgs_Fe!kbrJu@IGYQ`f<_nv6 z+lY5CkyZ^1_Zyj*QU*Zj4Z5|y=3xUcNXl<{!_C1ygT$TP_P{NG~K4 z{A(w_6oV`aN}IM(l%Dk8?k1gB}iPmf3nce2?8Hk8Ioj0I_t$N&o-= literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/__init__.cpython-312.pyc b/eval_framework/src/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58a17be226cba59257eb1e5ca815c26ba457282a GIT binary patch literal 736 zcmb7?y^ho{5XbFgv)O#@#~niGDi9KA8*&X)CkhU_U8-wvW0}O4jj)rIKMt)#!&A^- z$20HUGM=NH7pUM%a)Z=T2j(72L@)O#UOK8&9M3GfbYAUQsR;%?nYiXyPcymRVCaQOy#hNP7 zR`Q&%N;AKinvB^*T=m^$+n8LED~bh`nkjiu{BP1hNzFMCJ5xxi^cJ(eY1*za#a6VP z!b9vq4#D>uYJGi8nZ%c(=R>*UhIUjCMRBjyRx4b$!kqlu^cjO|OVnmL6`{Sd7&$}^ zV~5yb;*dDFer@KfNed?=H$UQLkuMZA#aKof_fv zV=q3GQdA|zZPzM{pKW({lQTOe&FD*UVh?jKLO!+l6+#I2;PDfa!Qd7Q O_?^$P;Bzb<*w{besLLJz literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/data_loader.cpython-311.pyc b/eval_framework/src/__pycache__/data_loader.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..07ff9cb200a29f24848e7966a9c8dec83d0b0f1c GIT binary patch literal 4253 zcmbsrZERE5^}a8E#l+4BgoKv7j6kqTal1l7S=TmcfGu&;(V$6Woh;+`oR~Uxc<-4I z%ULN)H8%<+tLT!hpqaJH!hn(=vzboATvlrcrpXDKn*Y=E6^e*FrrB`i=1c)(2a&< z5eu*aJHgHY<%&Tau!1aCDFwQ0z;CMHuM+Lw(U!SFc7tGnkTB0fQsqR+xq zRz6x-Is18bGP!!|R`%oh|1M3f-uT_>jZgCN>y zGwaRw3|#(k<`>K;HH!2-a@nFg;b?4pzTE%ngP#63`c{7PtNUMlsh_-l;0A|*bV`Hr zexrxw3s+)4U(~Vs=j++8Cra3i&YR@olqu1T{^z$7`TLldS-muGh+s2D=S^Qw7R$sB z!nN#+CjQ=#go2}Hz4_j#nFiLwp*6`ZmhyfmMZNY8PlpU>c!LbVoI#B z7+!Xll5o_QTj186M!BsvYwdE)#_6FXGe4--w5Mv?(>3imL~UqWY+Y{Ha(*mX|Lb?o zzcbC|7#Jf20v=+PZ!jp1t%p5+{CK_LtiASJ?_{sK`6t)pg*}(Yu8si;_4bh7rtF8) z_QR_EaHg&y(ffGW!RHWdtu2COP~e&LsEzIOKLP+XW$#GaJ5+l|#$L1Rs7pJx6}bs! zh`hS6S#5k3>f&L&P1!rs_D)21-&5mMoa#nD z-!V?d={Pz`NxTBoLP@R(*@ZC#c{ff&XsHNX6y(-^G}R`s5Z<|Bx;sh9HXJ@n$E2se zgQcQnV6VroBH&Kn+IR>W zZG_T>gGb)P5f?2vv5cSz!D{Va9S-mdk~yhB%bWW zdlGwkjXPJ8w}Y5`767>PCbZ=sr8XUans8(`w&oCHeVTy8fsCVmwp(@ZP?w#XW~x7| zR+~E)w*!Eha(1PiU8=Jy)9}=EbUK>jkh3LcLH63&`q|)xX0>W3)cHe;dv05oUQu`T zK*c`QH`%9d=}g(X()KRZ-j%7_JiYgxqb{L5f(ar2ntY0I2qbP0fXh9D;wXt(DoW*x zV-P|1#gh3_{zd>d>U#?;jkDt%XkBZwAcEpt>H9(26laoabQ+8moD`aRPBGpB@QT6i zTu}rn&O+{|FEXWeK>Cdm={4`6q7s2o9K~L|Ob6&?OXy_*0O;Jsvh~6H2ttC(6rBgX zhD+q1lQ+Ly>$L{eKq_4mCocTp{J&loy3y?P2Tyun#J2o6Ozt<2vlvO>yiWnw62wz{ z%(jb{-wuw9fJen_Kh1jyWO@r9i7I>~z{gB_&Uaa)CRnv*6~obCQ379*cHq9rHv(y0 zAkHamgAu6)gIWMClg9W%GD*`?17^07i4Z5$s337*8J`_EDiqyUH<8@x3k%*7uZ*>< z@z?cPUII<>*8qsmI_vH_TJAVnk~>q5XVQ*m5^QE;L&BDA;PnKYL#)+9>G1tSiOz(Q zan#;*Y`^2!o)mNu@bL{ED&uUr>ukB>Y)Q7JoNZ}mn@Vc7;O;$;^qPD0nsWB0oxLil zSxfb~s>v#~zV{Dx$$?AFpEUyt_13mU;ihZJrS|})ES|K*qgp%}N26+Kl-ghkeXeTF z;`N5%ed6_MHm`R$B1A(Nw|l)OqP|d`V)lB4h#wEjicbmphec&5B4~`PND}r$3BQLV z>|OftvBx5jkbZJi{*X_WbpdHNa9l>jG&u$dKgJ2bi2P3gr;RVecBpK}J+3B4Z{})~ zbvXpheEqzz@WLXqu7RcgIrQ1q9D?pb zH))E%kOE?E7czmtdah~Sk2#ta1c;l=?f%=ArI*v~J!Gs1uxQQ1&xk;AR9_%olFKGl zL!+7T8x(H8W>Utm7nJRqW*rn24|%3aZs6DNIoX^J+-<`E=a_FTFX>10vh)M!;-`lE zO91f7qo@q3I!(R|syn`ZZ=tgUHl>uFPt?V+fq99kbfB;(&a Ifl4O%KU0TCSpWb4 literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/data_loader.cpython-312.pyc b/eval_framework/src/__pycache__/data_loader.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5414c43dc9dbb93dcfd81190714b5b1097f72b26 GIT binary patch literal 3708 zcmbtWTTmO<89sZ{4S@v6T!b%+!ND5a$eH+7>LwK-I0ZtR;OUGt?WnSL1;Rp$cUQq^ zMB{kKRHP&(Z4+=4w{qR_C?yU&xT!pLT9Svp^hIjOm~1>#&lFMoR24Gg8Rw<{*_9T7 z-S(kJqqF}x=R4myXV3rr|Nf=C+=8I|`Nolfb7qA8jvK{77mW2EfiaCZ#1UcS)t-cx zfHoN>hbS*KM0@EW#>)_hL_9HQEfQ=zdE+dhwY(-0oj@EljyQUjEcBU22^Vu8H+o%^ zLidCuS)n^Z0a>9ukJfIjvUXF^9K;d<% zTQ)*X!GC=idZ&>Y$@*vI zl641ZMxt}msGz$2YZUcHjir{HDu#!LQ?#BZ7239i;F$elto z!aXO7g4o7thgnX)k-I*Vdv7z>pd>_E@oqc#fWYyE8S3_vw{y2I-v7m=)wwx+MTHP! zOjHgYIOZZ06CZ&&p-6vk%+VJO1b9gbM#JGzHjIVBu`w$flKCNt4T{1L8?)oBu@OOL zgLrdoY-}4|hvn3jSsrhe4Mo@(*~Wt8XknO-C^X)nLSsh75ElCTc~PN5k)WVZVHmD_ z5-GMVhjP&(4lW9fFG-;}(6d7D5|$8trw^ZT4+ulNJ9;`Ak)!Uz2T!_tPMvhW z;FsIslml-5Wq;Tg6#YZ|h#;PEOJcx{DSYZt#n$0br2?Biyj>q2iPdj}jMmLVI3`Qi zp*oN9J1yp_yOrCf29i{&f3fmN9ueg&>E^q&4b!7@7S8Mwx-RXgY$Lsmh|rQi|J=G_1|8!dI0)$o&lbR5{`QpDm*Nj zs_{6eYX`vk#KCxW7(b~uVw*kGewg~yvAz8u_31$dY`7@6{!Out0S-0R{R)g})DQ9P zH^dkbCpyp%S%k*OI2lJT6B03Lk^!NWK|SIXdk`e2I0@P=i5mnOrD)JM#7qWf1z<7a z*i~$aHUEQAPhyw}NB!tkpwV5wG@{?Xa&u*7E;n^8H~Cs_YBu+$*H*8+n!7%|^5O4w zyly-iC-7?lS*ke%sB|QDM7{CY;GnI#^4`SC+X)@%q7~EGC@;w&kcu%dAcR1QioqX| zMnElwVNl<&9IJaFBuOAvR^-n{Ln7!i7~;bq!%l^k`E#O5MjK@FWm!nU-nc@0tK9f{|bY>t_ZzuMTl_L}L6w<{Q1f&g4eEIsW3^J6fBO;SDcyJDy03z-dl32oaF5Yr z4Deu8Y$F3jj*L`ARzm?hT8q~A+#srRt(~u9z%T$pS<~&QIl3C>& z=%U3@vYuPx`b^-VOZxRwK|ga@-!*(CP<_Z{h^Bx|_te7Y(an!a#iWnw#-obCK^uQS z{a=|?TQoQI=6_1Av4-zNRJUVs1Xq(6>zBdfIsP0Qv+QT3Goj&OaP*kv`vvEPEN5bc zsLTpMHfGTBoy#OT;na$mk3@%f5nNn+0=EtRVMuA5su{5fdc;axRYB!4C{!RUsVPdV z#bArN5mg%s5mI#};qN!l4-`#8HELe*M>yXmvBVlnWb}B*abS|Bpn?=sgY1wC8nd>> z>|-)%Cf(J8#Cbzp^5;?PA$P+0@`~tCQZVyE5#P^L4kI(%daqx?!O+ z?z9E5&hOuq+`OZfD;lhZ~=i`I`?v`Y~ z910Ba@_@i8lq8EH)}2V>3ad-I5c;$rgtfaU3xxfWq)iZ80r5UkmsVe_h~=hM*pPk- z)j0ZsK6;O-%#(FYRkAvd;E{5qxcR5j)cmpcBU#tC)^OiCg$z|HD1S(`9E>CBNCr|* zB?qqT%p+*dcdX$@0SQ)qKQaKZgK?$;7~!1fATm)4eG8_WPiK#GVYm~+EB}Q-+CK5l zTo$nc9*Plv7=c?V26+^_=a$1O=6+u8QNNu;H{dk~RL^)6H(PLpWAa1OaqUe%Eq)W4 zH~>j_LY^WB;!9NaC93!v+MPwa@0p#6+B;@v*6f@)_L;f)yx{@eMG($4glm=X-#c5% Awg3PC literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/evaluator.cpython-311.pyc b/eval_framework/src/__pycache__/evaluator.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2ac9afd4c98d8fcdf945ad0642ef8892e659b09e GIT binary patch literal 5717 zcmbVQYitu&7QW+oc>IdvkU$8K7!n>qtpjbzLV(ahcvPZ5fnC7c$~B$|F&#gc8Iuq> z+pLO;OM{>-EGdP>t+K!_RYBTTO-olxTPk(`jnPP0vqI`>>jZwS5v+vxwdc;*<42&Y z-RsGj`#AT^z2}_!opW>1<+39vw$D5JHaQXcCvFrARjAw_fy!ybA&wA`-*^*#3$!f) z8KL}Cg!a=B#?M5oek*~;Nr8>9eirH!M+>%y-EY_HjNphk{Z0Z|5Rdk|iXY-@E)WP^ zhCieD-4?VFan{3#V=q`>1pW#mwH&bnwcGCE=9YF!qc(>nS)+QwAz33|h#u1Dor$=> zw-XvAAK)T)u}o;#HgA4uoe<`uGBi9d@v;~WN$Y|_C?N#pm>4po8x+)_m-{#1bs84x zCpZM#CHy33;VF*f>0XMX4pV-Hqj@V&^K7qWW?>RS9K+i<>(u^UhGSulovY#OykqKk znFNO}!Kw4O%4PrjymC(Xn)5L2ui;#vF1Kb|e=sN{@KzITu+lqM4*&G^)!YY{{`K`p z?%44=SKpjC_1RD3e=_Wp&D|m}B|^A; ze-n_?C`lwyz#L9mpk>-srp*C#0jihHCUBWDbqiY(@4%qlTPV5Ix!lEfawESpS8fiO zl^4W)QkU7n6H6QD@`3~8KK*zi^~L1nD-}!?JL~dJ{`uPEweu6lPfU(|b?3@4lNFpz z#11RgV1f^gRu?h+C=fCosh!j)i5GfHtPr7kvihX!Z(n27^ zC|`?a%}-Gy?(Ce!DN1FVZqIu*GjEwXZy9d2DEf0BSdm7B!K4};j&l5f#)!O}5Tg~Q zwE~xJ+TCD6Y+8;edFc=d?Q*{-%f_gxF{*5ghB3OV%|M$ewbOWQQ%a?~3|TpA+H|XO zW;kQ0+|817lB$G|o+{U?gq|TbNqz{>lVpL~lFWzb96e-9qWz|Ori-vx{#lg7Jf?eB zesRbS$Dm4ENo&&1+0W8&Ivt76K$Pidxv|5OC*H~(I|b));?&iNW1rsr=u30dtW5QG zY|Sq3)G7ER=r%4Joc!$L+}Ec}VVK2iY}Xy3>L?kjn!NV@#P83VoEm!|!Ao*D7S*hw zzF0WKO8~q4fUMC%a4#=tjDZO2mzy!eRlhRUtRz`m!!-+hcpr_nx?M6bGD0YDid zh9JBo39tc?w15q$cP%3|pexHGV$qxp>aRI*|3qzDj_& z5CX939$Z{HFR&a~qZGiZ@~skLaB!t=s1$jhg1OSuK!%Yrf4A{`xqSTD>!h-v^HM7i zc&<(}-X@1w4>JZWK5siMPKT<9omp(cq#1|-q=p9C8I6`8_{248e=Ho;$iZ-2#|q84 zkCy`?FU4a~iPu4<-649gj0Q|fa%bV@Y)s|=(cJt%oDa!77cjn{d3s}FBq-yaqD|}x z5DD-RPsk!h3CRHj+q_V8ooUW;?F>$Tyw>opq?-l&iZaO`EEA(GEGa=rX?wNo4#;?_FrW-?ov1Ig09T`-73)8d(_!`6n18|(W^8(kDu1(jLJA;yi<0N zG`;C$;8w%Zn+;2^E*|I9?roWdm(_-sjj>rUIXh$4=!+-c$WEW1ZEDLqm|A-t8N_DS z&3G4|VU6N~a!_+bf&&4J#=Iz1;35k=IxcPjdITM@W8WPjW#}%JC!mw=&79>NS&2GC z1!l<@cMBy+RC03UCsla13=L5-UQwZxP$-Sh*OZd9d_q^sT!xO?Q}z_gvel1bZ2T%}qy<|oOto~)KQj9SuKxgP>0sWf)+ zDTSNDIkL7D$&Dq(HYBbZVk<0Zrl}<^$nsbmA_%g))h8=m+c=r?iXn@>< z`UxA1;9q4FAyCxOik2dY>bDqt1lofL`2BC(phcW6OeviGF#5z%y`Q(ye$yY)wNoZR zN3zP7hinzvE600tI_IqZ4#ZEg*8&LdO8gDj%n(z^NR2r7;ag_ES(zFK@!2&)UD*^2 zc4a3TSk4aT#xCZ*_`^?sIXn5)Um@p(;AslR>}KP!F|n|=ks}l7Bf0lR?p(PrdF`#L z5IHz=ha42;aCDy+XLyj#LA3Pthvh!+AnPrF%V6y`BQRk$9Gj$ei$=>aIVb=W@lvRr z5T609R0K~95_<6529;Bm(S?!mU7-D!dA6__buBpXKQrRiN`UBgIUFI-K@*ZD((a|=SS zr(gvNJOp#ElnCM6r8gl6huS?FBgFRY<3&i~qY&`1Ge9oHM>TsW77Zmt5w5zLwKoAd zJtSpyJNraF$ZdzBedpZewi$IR=S| zegQE$#En=S!E4rnByk0F=o!mv;9f>~-pE+kqBoV_iHdz)c-nJd*N51hrlV;MDQk|c zIqo>>Q0muT#l(0r>;{$Hps*WqF3&k~wC4lcS(~z`OL48txK^sJm3dTaTR1M?_RLZ} zt+za#H$9!#S1O*)jAyIr*_uag*Ft5-uk%!e1D0Gj*ZS0@ogb|kTa%}O;~w_w2bi32 z|J#XL+STSIx0+YqY+gMc%QSCMo44eV)w571CpP3QOvA$L+=UnCd_3o)ma&#mCfnF@ zzUE9#`e3Hfr#AYOMqjpZ_Ibw{NBV^iYR=Y-!tdejS#5b_X#gQba=Wqlyz`7xnZI`Y z01$XGjqB9LbxPy9DbukCUChIW5A!rgz;6d$lLvOHZ&xSx@WvltQtEqL4Nrd@7YSO` zRjZSGSji7CDfKau*+3;xxy~bx~Gp?6*s;kORs_Vd_&A#?;^jM*WJ@ee>=B(otOEp zg@F2ZUVG01+jonf>6vT0F`I^-8*^#sxv_x6^~Fw@b)%E&>9XDEB5{2+f$QD4{=JJ@ z@3DSgPXOHlu)YMo1OK0a+kF(S{(*pI4+J7HE+OE$GY~kC2nvM}YaqbILVIW zYlMgqOQbN-(^fJZHK1C=fG*F+>>&0m)C zDU>fy+3C*oiadhX_zwM5O&XhNd%8Xyy7X+iZ>%|wpgB(Juj|zH9Y3^Q-+iM-Z&s7% ziAA(GkARHsOt+<_OY71DV?K4xQoZjw$bg|A*285$+2vPW2`amS>h`?~*QbOdO7wst z9aN$R)o{Pc4Jg|O)Xj(V2~Q-(zVVg;JgowNCxim6u7sAp0$7S`fk$KHLvdg*6lrQ$ zSE&xR#y@zlCBz+==>HJ1!5wb14$I;%apxLLbS+nE{t>&%uyMEe8nEC#V3o}mH+l+QG-(X{D|f%l}{EeP%57vZB3~es;y~sdB)bJ+S-PhJhh7; Rz%q}g_w@b;PZ-s;{2y|Tj6MJW literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/evaluator.cpython-312.pyc b/eval_framework/src/__pycache__/evaluator.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b5e2a0e8e4c1975dc2ecd23a5c811eb156211658 GIT binary patch literal 4719 zcmai1eQZB5^MI)&VA2* z4xv@A<@<5&x#ymH&pp3$-e2qsZd3PO8$$oagKVKPoy9nG&LST1goxaFOSnxi zHi=|_a#I1?O$Qh^6EM5Y1fC~FHo&@B=u0T-oG`~0$^QhWS9MI|2%_N(-fNLUnHgi0yL z_`ppp2?pH0efxKdej%v9z_DLYB)?DI?G=3y(W`_cpDtUk;R>{}_zE;7Cw*X1ncq{jo+g2+cbKf zF5_MLxJdbKd1 zzXgxacnmI5rDQ?u%(2S&RG`b!!wub`fY1=>hy;~LL*wQn4g1=UH0<{(yM&;xyFmc6 zcseC-KLYSw9i)D z7pJD#@~_r!om$@*r@mmT7J&n)lppM;(*7VX^r?&_C=n@mmmTrpG&{C|y|5j50zbJJ z2|fL>lG>Ou#*{XuVN7d126_zc^%xe1L8fFa9y9ai9>eC@+2wB|N=K=z@9sfE>=5T8 zyO7WHEEwV$cx^E@#_?>FJ&$-U%AH3Ssne!GJB55?jEk{jnZ09{CHuvEfLl|Z; z28T4WukaULtA(qt&j0Fy!Kt#xB7&^=LqXN->kj#Sf(-a5^eHMWdOHMBWppS|8E;VT z6{ID?xkPLnFu9xqrh2D~mP+B^5`GL425bpGQyo!seix(CGGJ<0rFugCph^z-!ONP-*=1!Y0=J6E3Mz%oj3m6N*$57*+V0xH!m^o0eVBJdu42bB~%M3q!PWk+X7 z3V0PfmpzLm0>Z(!!VyJU2g_yb<@m0QA{An(HaI4nUG`uT43y@aa5){HQ2qicaKutn zTsC4!6;;RCS$om&<1alway(_<5T|Afiig`z+Y{zg!Nxc}>nIuSK3y4SX3Hx_`p(tQ zl-H-q>!->$$L(nbRX8v0J-7E0`SZr)p`T1Wc66rpsrR={Y@H-WiP7rOlF^RwqH+3f z8=9tSpPE|Ro@7_gmN|z95{_i`1IhJ`Ih9UPueHL{AdMH5B%XfJ_Pee`NpQelCSjKOOWl=&g zXV*rF>^x7tWuQYNict!i<@qje?ZIYUQKF`|=&Ur5Kr8kvLv9zq9cIYvBmEe6sx!F5 z#LQ8;2S3FX1&{(imK6msff>snU6gs%G+0Yv>;fr2pfi5wD@xSBwLR$BM!-OhLs710 z)e1Hk8aNC1*Mt3UMTuInXD3jU>cLL5qVPLc4xFHw!QAerceh|rD*_zhnC9{OdS z4)Nm8po=AoL78++j}+%$Fz80--bjJ{(+CF3mSB;~ycSq4oS1v#&AIn}ef^IY7C!hB zBzzEF48fS)7<^~c$mlls;(X%8xmN~nT=~ty)fZL-k%85R6|bcDgI!LX=s;Ej@xip`cJzT&t&$XyoEh}ZE@^fi z@MZ8rvn9?+i{QVAGjqsg;2Q+H!eQhx9nd1kZ*huG^vQUUQ$zYCc8BNdhN*U(|Cs1Nb;^^h(6G$qCA;=}46Z!|%WrW}p`ygo+1qeS* zIpV|7q%$Il{jMUF5kp;Ff&|G~5JD72JBV6BP__6%L0?3Y;Oj#*cSayrgYa8;q+1fa z{J~I2Z0!?#5lH4$P6$Q2H?S3_|1xSWt_)MI~|bbvA#< z`jRzS+AwY%jZN%L7Pn5bZP)W07s!!4=egv@=IOkL(x`x|n^3-TteJ6CrySKYj;55O zY4V|@qiM?VL>k%i>XL__zRjYVO`}I%-+6~7s|){CRy9J5Jdh?)NyXVGPCt>@GgH%; zs%acoCZ3!s-Z|+=74J%uz&LEbg`l}@LshPs%FU_D%`=q`rz#(w2u)QU_y(CBb;E6G z6H`(*yRPodipv$RSB)^UWmT8*&*dkMPnXrtmaV;HJ!efkdNKbN2i$iYpw1lBk%X?+VaS2MfNs=zP5hjo*kBtEVa=8X!HH8Cg$VyCg^`$%L30QI~eGH zYT{a3xKFLF*6rM<573yhoyL?F68fJJbeo0yj4=cMXL)p6sri75xQRa*U3UB-6ojuN zk4Lq5Jb@4&5pmz<@f?eI#mtP^WEaAjP zr8~utSCI~2!u`0yUaUcagwaE*a9$(-0;-ef?^J!7vd~S5?P&zf#NkEUq{}L4OQJa8 z8{L}dzFe6`Fq|N@W|Epb{CV}{Q`hnrG4<9X#75dVawJickVkhX`YzX}Dz>B%&?bQ& zn$KGy=Sv=SC!g^qk9t#wI+A>M(jQ0$k0s^f$>8ynzcBwE7>&HHBPXqmnta?rXH+pImu-)eKjj Y;>t(1O>s3Rm^;)_f^dF=aIKx~|FSZv;Q#;t literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/llm_client.cpython-311.pyc b/eval_framework/src/__pycache__/llm_client.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ddffac6aaffd28ace520018325f36fb672712ed8 GIT binary patch literal 3002 zcmZt|{cjY0g3bBt19B!La0PRz8c6G;7$+rBB8P;;QN^`owYeSJXZAia zyH}gjxwR^qS`EZ5A+;kQn3Mn!sJJR>kg3|F${%n>t9G?gq)6rb@k@Iti4^fu-|XJ4 zeTKQ&_c8Cyn>TOXyt&JPfFD7-b7Sw&EiXd zV~i!foYPa;jI07Jyqv?C-WQTpT^lu2nY+&cF%7FypAsI0Nw7YoG%FnD6dpbSJ}>4K z5eox;#dn-fcojd4HNe-1xdA~59On|E(gdpqEni>XOV6q)oYC{0U>CUZA3t22_+;to z_5c3$#PW^v%QyaT=jJ=Tue@+){^FhaQ-p3{g+|fx%DI$u6d$w1H)IVl8G4bWQ;8?TZg((>R`gBM)g4-}88Xyzg*-{3Tg`24|8(ar|>x zl?I5M#v@sBG_H|kTvgMOv+nL3B#b1bGAUh>3LD(?+g%%@E@^uJjH7SDsBQD?kLN~9 zq5TV?{YGg21b3pr+}HugZ$=)Q;0!Tb!zD2c{Vq&ImOrmy0&C>*x)solX%JQ^N3!Xh zZV8Zzd6G%G`2zYhK6WPl2Qmz&7l>~_?KR-3%Odt{*Y(6&Z>LJs@@O#WnsC*t+W48{>-HFqA5%DNK8fM8ouG?KVvPx7!$B zayhZj5QpB|p0Nz&=&Y*KbeSu@p z10&U@M^?=aH`QpxUt|m5_j((Lk`*J7n{ zo%exr?D|#h64;#oC(Ls%;qp75lqc=N@b1a-{AqXGCGM{ZtGU)4T{?XQis$m*-Yx&S zzBbe)A?W@2*3zx>i<47J7w4C+pLDI$3+6Jf+L^kp2GN>jy9VXCeDP1qH$GaN{>$>; zPnXYpuoV-MC7oMc19q{t1ZVA-$Kps<#g;dj&FE0@EslB~W9^G>iD|59@*vjiny^BX zywDDHE}E28H98=tRIEgI^y;)i>Ct;kyuik~qbu|Z94u^p&XMcc=pfdk1Z%l$M#E8g zK*uCn@I_^3o>mYXA7eqg>JXY47MC27b*mu>_fy664b-4W5iG-8w@884L)lakYnHd_ zD4|!u@*NmWV*BB=1T{N2h>7JLkx3?%8MHV(mBtpYsTk)h-d;H7B{W?~3kCF&Sl;T> zWQjGeEkVv`BVdTZ8ohZ=(TkEGRd0m-!E5I1uMk=-Uu(xVfu^<>LO6~pA=J`3;We8! z&TvL}$L*#arKTMfPcYDBMjk%eyeq+x8z)oKf4;kX2l6c5$Zkrg@=*dp@KGfTZKI`;9%dwyB z>}`Ighy6zn56nS$)o9icdNwg1eM~a&)JT%$mn3*!^D3nqCF#w)tX3snNm8;&7-lt{ zSp0yRg@j{M8J#>tE$s&IF(Ml%O;4H7a%=-yW3{&cjJrRR+gajvRs{6Jo(kK{Z?7N# zv#gD-Xg0d}JsX)@9YFar>s0q9c)+4O^e<0ADg=C`~w-yzS* zcHk{RKbE5meK{;43;!SGl=I;GAT|5!t3l!*B#%;H?{F`r++E)jpLW8xk30neS^>0Q z0;sSI!^Ze2K zo<~4tLQ}Rel{BPD2T__r*0f1Qv@k7N5t+8A+JE~8cWm}vTdC4wKK_~)A}z{4+qutQ zK)PPZ_uO;Ox#ygF-tV5{zm%1gAQ*RU?(e^4Md+WD$qz2KSW{pzffS@LD)MD1<70rg zsFo1xV;Q=~s$7Wo@eHycjDn85ks`&`879m5Y!>taQn*t{;b$zlJ^LUd3agaq5*fqx za#RmQ!m9JAIAiwW>Zd)`Ql?lir?Bwj;KyTL5wOr( zqF7JyKC5DbJv;nLFxM+6rKdQbZO|c>8}`o5m!4AtIIPDSz$$R*-~4)M^poY88~^&l z*vieRm75>jnR~nKl^5^KU%4~?j$xIf0qF!DG;D9k8kS;2HTY0O!KzWJ<4_b6S&tEH zl!oM!62UqNV9hV&0;hx~n1$ckF_@EmOJh_n`y_CNCR(kAL>u*+&78Y%!Pirl3^la{$tsb~~`o_;!-}!KZ zIDh%9d}@CA^2qUHof}llth9QTu3b*Q|5keZ^zzl&4f1vA0Gr-P>(An_zu$|0EUQv4kwbVOLQZ%! z;`gd*NXq(sDhe=>q(C^JOVZGm0@R+$@6l*95vCDz&xva5r@udYawY3gN zNy`2BD7R>Hn)Gir=Nc#@qa>zb0(PS@-6+!rHAtltC6Q26Hv|A(jD-CKgn&N1zKOX` zKoLA@W|)lwrh?Sgrf~u^h;Bz9Eg-I_5-cEX-QIe1=eC2?qITtkWl3V2+vZU{h72RnQC#ihKIXZ&z-9xHR$ml|P+NUwpq2 z6A~c}jRgaQJ^(|=`W7t)Mu!Zx`$vb8VPIIBlqf{hg48?2iy<3V)lMRHN}FsexKr2YlIYzpiB*_svfnb%C}B(NoP~S z+4Na?Q^rzR)|_(hm^ygjVA8!i;ohBeA4s?lB;AJ-?!&3N?Nhrh?E0!foDC(M?Wu+* zW{)PEM>0ISy(VKr4UZ)~dlH^KnKHEP$QrV4+cIv;RHC}Zq)SY=#A^)+*ON(COTyKX zbR9~#4vh<`+WPUbdzHvuamMjW$3kt(YHs;IqxuL4!q z-$fk6R#`jw^jXKa^^3}yiSo1M3k|Jv2j;}vPv7oNb{tQ19A9XEZQ-?^1>X-AdVaXD z<422?ucxZ&CcBgM&58Qv&#IcgsCG^qJ$rQW&FL2|#22fd{O|pY9qc~HxKTw-J`;=O zP4_igQO>n}>)BoC(_QxVN7zsIwpF(~*grdXV7j0uQiQG2x+~ggD+)uSk|d)(|ozTKcnqm4Fkw@s7eo;b;zd6yqt98M2 z3jKrIoe|J?kNlgh;GdXgGYH5v%_MVsO_JLFXodq&J{YA8?F)twf&YtgAlvrs0H;n9&cq8svXf3`r2FZr5d8z& zp==rslY=0juU4Zs%&-i@e2&ULN2OmP_m{}^mA!7XHfgU**y|?0{a5?W5#c{v3*)?t IU^31958dg zzzw`HY6_a7=Ab!h30k7opp_!?M&1^+2klWu&=GY8o$46H(?QyRIQn!$c>&IKg+k~W ze6<#I8_-e2nx+wJzG8qG_?BioEH#7zmM;l97%(ZOfrua~rX!KCs2C5&#ueL(2{96n zF?_(NxD%KQM+87aLO6~&g$A7A`4VOu*{AiwCF$yqwS+wU8*~;Bhk_J~V3$I1)N}>#Yx-Ew3bf{0I^eU_@mbdJ**OPm1v)2Z=I9YaWtB<~RHgiwnmnVwS%o3%BB}M5l6zpa||C&@{AbA$*rL2gWpfz`)ql2_S|av+=CmJ@84J` zyq{4Tn3(V;heHCe#6}eh=ENk9h4r%mM&b`f_&z)jU;(9&0M;2oVinD4z0Vp_sP+se zsT75uNKtA1R6&g1SYxd(q{&KaTCKH*z$0O&rsQoPbofi{-k*P2`1ObPug@2*FWg_c za&P66dn?yh-<>PG`zsh+y)5aJdgm}66%OebkyB%hL;k|-+l9Gn8vFeZrgh0RJUa4{ zYP&j#B@I59)};LNv)RIxpRay+Y4wxKtCwb0XFe}T%MboK52_EwW1KE)aH3`S#C(73J3!LrckjFdA1c1WpB!ReRDkp&*pi1w&kgn zZ8^_T*>hAf=bauh-JbV0E%aXOU3}w0|9t;#?-RGYPvpD-*&E2xfxO!*i4Q>@QY@Md z>wD=jpdem4Wf(=^DTZn2rwQAdR_}X9P`#glkG&6U`l&p)`n%N+=d_c&xALcgwD90^ zMxWI?8~N2CzsCB%_)&?ocE!OpCh&r~4-WuXK%=nnQ;p>HO9Q^P)o%8JVb(WE8r zxeI{mjPz!yKkwZt{VcC8m_Bf|@1wqDV{T`Uyt4Yr7FR_Hu#*F(?S_!ong;K`>m0Q0jY8bgEYQ3rpr7ERDNMtH1A?ZzQD;<@;bU>_M zNQS|KgpnlAj|V)sk?3k+5(zHG;>QSLoQNb8qri!ZA(l`~QI3fzMuugfppRpbV-*uA zn-nvHF*A-y_{TN^NJ>;Je0+42!-^>q8;J)tDaJ4_;2mU!E;|%l>IA5fO6FHFQ$*yV ziVLpBa4!}^;W*p_1iX*1Imk6x^>zX8Cd;VaNsotAanU3%MijtQaT?}SXU>A4I98nM zLwJ3Ru}8Z-5h=r~`VIIv+%1|~Bs%YFmgu4d zZEY?hv%OCR(viHc<+d+y%NNM(%DlDImGd2xeFvpu)x#;dy)WncvF!V?bnGtObenFw zMYpAmxBYu>`S&h2<@|?a|Dhb+E7QGMx;Otw3LpYCT3RL7ogKT={PI9<$6Q(^#bfHvb#La?kGCyt`^(Oiu|IZx)_E&%G#faY_iVrI*?r5i zJI&-gdt}d^toqztf4Jb?I{(ySf6m)3d)p=3*A~>+eBt=~amkV2(gds6A6EgGrLkr4 z;CwtiDmQjXNAt~oxO_FhB!D?{-qW(+y69R=<~+M)&+e@H6pb)P1h)R>NV+B0+99`g zvGbVru% zsGLA$42BE7=J~hMbk5f)`#L51PAyurq2|Z#v~^`tE5q5LVD|KxY#X0zvHOgSF$XZZ5_$Aj>xSecbfd^)=cW=E7{>w*{0WWO|QvKuYF(sqTA+m6cGT)`t=rM zcfadA@62xbQO?#Q+j_FLp1iwJa;af$FyK~fp->F&;-Qe@2!*0?Hp!Dd9SXgXWcbpI zH56jwVKOX;j2H>Sn>Q9`A*_fPle$!mJD8*vE$BGIJR=Ai6(%JNHUm)1BYd0@@gNx> zDN5Br%S5Eo#RQPJBm5n}l>W(^+Onp$yXH+rgU7rj9WEmH$&4)vOHs930bd!BW%Kl~ zN>|(k{i1<3A6R@f9lms`h@hJhm!Df2SGyIUa{)oI633GFR%%F;72jXlF`IC?j?PTIG%t% z#Wefr)=~|txP>!3{P7a&DhIo|2yG=1n*5_?ALYaWHCp2rK`1^^~&b~{w Z@0zj{O(PTq5o&$-l{)^f4Q5rV{u@d&u*U!Z literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/metrics.cpython-312.pyc b/eval_framework/src/__pycache__/metrics.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..fd3282b66c1911c39ef22ab3fe6e17692fe10856 GIT binary patch literal 4623 zcmbUkZEO_R@$KF2eXe~6XMFa>;7fqOrN$0T2no0}6jG>4h!he9F3{7myZ&PDBX9Qr zqdf^#j&ezzVmC?8q$oA5lpHlsD^et@iBf*FQlHjev|*f>EMSUliB zdyqg6A%U52n0vuND(3o{Ap2vqM(^vF6^-8A&nud9XKGOMJeN`W(^6}*Zvaq0B8pQ2f)i44r$`G9krAAtOJrJs+CzhF z^n@u8cLSZVXu##Mxm&s?veRJffM!pfOyi(8=F?U@D=N63 zmv?azpOrWzjkBvj^46b+WXUsGcoq4O~h-A>xD#BB@@@g+(Qc zQ@rU}-DDC|Cp>pf!c;&ilBq#Tu{j1zkL$eVBsf%LMIuM36mD0k0j8eAfCZ-&!vV{m zbxfAW4j_mnot*)aFKUk2kAGeL-Fr7LybO`lr?W7>||J(^7^*{&^* zzR`G{6yMh+9_OS)59X5Mi8MaeC1buTY2;03W>9MsPb!$>m4xAn6?(RlsC>}U@ z$B~r?kt`e_T9yne$3`8)O*HW4r~@!lI&=cX9NDuVV8`j!2Y;My){NMd`5l$es{GF?YhxdDpYNV@me)T1d0@*JH$oNMqwaUSp9Qu| zd$xQ7i%d$zSonufZv{@_`1;B~5=4*UQD7>dUX+8_6up)IBSz zIp)56dX2RCT&iQac!TsZbx36_Icuem>iiz^E=$f@DKsq6jmA>3^GE#)F}&Zqj|oRF6}zv>+bEKv)n>bEq5t*dA> zybf$xVeu&>qxGSBEk0|_dBL|VJPr5t@1Z*XdZ8Nl&1wKW3)tgVnUj; zrYM1iDiEYmu94WgjD+jH5&@84my3pKcPWrj4a;%_tTTZRUS1T7ChPnQtIAs3Ikqlp z%W*c?oDHf+30pc;R&5a60)5dlK|WU)3JI1zmZ3fH9kj(dz2Bj8F>SHVMKr;8&YcS? zZPr@{NHtnAgu1LzgLb|4pRg)7KruG^DTw*Zb|GqXJoP=0O|pS|%QmTEc3*Jr_tnd! z#LIu2<9=PKh|35c-entu5&e z2!mVma6F%d%YCrE@ztg6#@E0|<7+^59kQ-vfsyFcbVdT5T5Vn}Q9hl_WED|}%2^() zlzXz0G#CrwW@5XG%VflqfL9a5d8$97Ib~7N9I1>(Cq*u$IXOXqUMGz~gGNiFcV{?^ zxj{^dDfWoyqF_2v25v=6Y95P2 z^H@}^5&C~<{*?ruhPFw@8;B|&`4`Z=EaL~stD)Ax3kDOqf@D@w`U$2hF)+VQISz`_ zSo7-y6LFhOmQ1Rgm7Ml!+|~ipc?pyE)Pwh4*h^SCL7RLK5<|bUTOxV35^l}2H5Xdd zI*%A{cYb#z9GMBnO5s>>UGe0F&T{yv{NDOKwe(1LIsBvi-WzPo4BK8}+sBQ+*$A`Yg7m5?nJATwe;VALp(GH{Mu;)x02kmqks@g{`Ao$99xMkK{df zU8uSBtrzpYN>j@`a(W+zLp4Xno*GS$_m-MF^Lr|-(RoBQJUqfwf{{YwXyaJ+T5$bs zCv4Ri;J@ob&28s)k4MUF9pzP<^1d6yNxG+I7n!}uG`q6@;OCJ;S6&=viW~pbIeD}c zed_GysaHx5JUtybl>a$Q{{x5ZHUCw%W5I|1YitKO<|a61OQdje^yD~O4sXh{*Tb!K zSj~f4AH3e)SyZPEPalX+4;;JNE?sZiSZteA|MAkaAYN_jx!w{TZ!4--UYb69O(b5)a*|1LClW%MC)={ZDg8X;bYEJ44_?98 z1v>SLfJtX)lmkC6ak6Z1VRAuX(ywVukCf&V{8NB0AWSYE{aeALanUDI4&*a14WTdS z_8ZKKnj^?Gjq^1GOR;ZKzL1Y^8_?G z0jjkj7wCJMV#4E(7wO`G$+g9U7aB`zx7HBZoZ5XQbVa-pDRu9g*Efjhircz?VJk5+ zwgLxX(U?>=nHj{J3G7k^Gw|PSoZmjJ?iei~ACus3FV$(5fu*(2O==(cY4`Su%09gg z;ys`gZzYrNnCA#&JL_9dw-KZr_(dR)|0MmFt>&aC>PzJP60Q6Sb(B%ZS7=uW?fMcm h-tb29;TdnFC54W-r3ur>S}ZqrNgVu<-JMID z+MQd`VJfPam%Bc=YM+h_}P?LVtDX|ki!5}m|^G@Q`Cnu;QIOq%w*?>Wxd z3EisgweNYqp7(h_?)g2>^WJ+lo0)+0kB>GD{@G3te@B(s#1#v-{{@9p1W)i}km%KN zvX{hlL$3jHLy!tly%dT1s309;dKp}%gT|1l*A!xVS*SAt#~THczzJ->p>}o_qMznX zhv{B3zl3M!_wpPb#q}F`^I@jf!dnC@Z$)i932inQY2)p9oLw(f$KwDV2VcWi!Ki9@ zT*c8W?*@sB795SF)BIu>>Ey|7!c+StTCdkbsx0r91vwBBpiDpMmj^utm3}nPC#%$s z@C&M?FB0w#4ETnjM>UFq92LWTWNEVmVT3XT&+S)$oFZghUkpx|H;hr|q5mS{!RbLF zPRgYkK@1sYdt&5y7=5uMA#UJl^g9n67fW0u5w@%%S_z3hWO$z1PaGmiVn0E`FJ^$9 zG0~Smp1B@U;kERwt5?5#_4Vnim!^JoO}5wT2i*F@2Zd{g zOMJNS6g~vqXl+2@{ZoZkFJlgs6z?)oxxZ=i1vp?K+!x^k;Q_CQQt8OB5LW3I{Gp)A zN&bGp7mWCM4<(vl(JI-m8nvZJ=zw5hcek_*MnXbM^x0@wj`+Hit{PM#f zcCbZw&L8yki~f*sC?Xzgk;J~1s2m7N&BO5OkZ7MKPjrDi79Q#6KoZ0q7h$$152iL| zmgKnBJlCpltuqA4tjg}a?sSi?NRxTzO2xS{X`G>;6?b9sHDdY(k-4^oO5c9_?Ly5r zEY3uiWQ8@Hbhl9-(wj}6kef|n6*Q=}zNjQeLcT~;9*)X=r56r;G(6};ZvO?8;}j7i zMMDhK7%A8s@OT{vGWvkt1CzGq(~Yr%tLRPFw#9PuIJE6M)QaF6Qj4+MoUpxGx2q2VA{<`L_Q zp$IPou)iVH!223@HFRAtsHS1RD1>DRjLh=^N%ln!s+4~?pfY&OU9v?b zeI8Q09}d{VsnmcVi%8K#q=hOKjvP{1Nf!Ij3Si7Zv?h}%z{GumYV41QAwReQ;P6;5 zL`63mXVdmEYgv#IiAxc=5A||*ci3HUrJbqf;uti8886O+t2ylK7hMYG@p-kcmWlp6$+kagxDM)DL-k1_K+ zzCoUizChAY$z;XAXG7#;8e`9cUS2F2L5Qw6^sAhU=emi7eLw2)n4=L8kD+?=({qLQ zQ`4WE1aKFU*~hz|@P2m)P&~!<`q?q3Zv7@xg8%H%w%2Qdb#|#oTn>|}O(Cd$ap;?AAQHRbjV60Yw5G zm81NUZ?&=#-M?pR5Ewvyhl#`Kwo1ge@sV6p;z(!wg;hNFS9 zkn^4@{heGTbw2|U#W@4^um-(i50zB2!f%&U8J1s4V)GTp zVvpuAh)u8*@rQ_PLZpNu@gb;oikpFYJ)Gvawn9avCGd{ehlvfS&IP21=BzM^M1%x! z8xRkp$we4at;K1ztBiBDb|@eZq80fykfHa9U63ATMP6|ybUqG`bO#8AnX4wnp4dLR zU8!FCKA+uuAv_jVns?;ZKdJrm)lVk&++m5uD@LC_abWbogu7|n-IRB?DDIZY>ZLOb zRco9ffFOrsMj?4H^+={M$8E@S8x(E>LQyA~^2nteRHb)&JD*VLU}Qj2 znV|5T5LC_DEAc`2pqiiqMLd0_@E61&kAOYXXfdcy73az7X&%jPI~*77lBJf#`IjIfm#g8{>sx!hxe|3zO3EYV-UQm~o0b?F+qMW1@N z@aZY-zKfeNDG$o=+XuUVS*JgKuaLZmxv*WpAth3YS%#_fmBN)bN(@CkneClEbE$Ck zP2B4pX=_*u{u6(ykm{T28`h!-(xVmeq`Epp)RKPydNuDXE`xojmbqZuH$peVTyw?- z-0d#l5j}{J9rTC!pdd;R_aeiud1$1o*x-n35EIhr0l3zy7KnHQeUISpB&rEwY`+XD z&qRlZ1yQwi%c9^9?a~CQmQIM#iWRUyGN7F`&`Yk{ z-ER)0qHn)=`o*{6XXAN$lVWe0bkr8C)r$50ymgggU3DFm>(XrYp`3MV-nvz>ZUt*s zWu7sa7~|IjkZ%x~DJS9v*h|^xrBe7xU?ZUiB46R_Ar#fYl&>%y6}=TVb#ymg*tnE~ z>QUmE2cCv#Ax_2(`q^US!Xb0{XfPA-ZGmUb(C6mOf2>I_kDu4i8v&4}C{h-ThpD3{ z3$JJOej7JqQVz(X_gcVJk8Ct=``&4!YUhu4?Af=YXZI7{`lohw@7vSUJ+k3@i`h0} zXs;ikn%nx(cQIyI3_zUB1i-UP;$GM!020LnszqDAc3ou9ZCWUggl%}}Y)~Qo6b6mJ zBfSa)e6*cl%*m!aU8B%7IlAUL&E?E1@^qa-*FiW8)#MrklSJ2z6%B8PPKVx#oQ)v& z_#h^Ec4^XbeBX7}cKp5x=gM*C%Di*6;#{4xZ-jq_(Q=@9woPH%a`<1E6Pf_QSgAcE zY*XxnHF?FyfO@L5Faq8AZlUeJYJ9t;MaEnX1h7ly&Yh{zC#Mnu*jg1+V9OE5TG zxb@@cUuF8?bDxYu6l@ysL0lm4{L>Ze z3H+NRmgma~cG!4Zno93Wo0a?4&k#`0 L(m1;(!SCwdz~L1q literal 0 HcmV?d00001 diff --git a/eval_framework/src/__pycache__/utils.cpython-312.pyc b/eval_framework/src/__pycache__/utils.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..487e507d12e013733986882228641bce43f08dd7 GIT binary patch literal 13397 zcmcgSX>c3YdAqa2QX-pvtg#nRHWpY{(z3wWA#tJ z?=6lc1-t2Z>PO=3JHK~*_j~_RP{2_T-u~d;Bj=ha>fbSw3Ho&6=4UiTou_z;r(M*b z{G|tJl2;9?$X7kcz*psB$Js%a#x`n~W?Va{C27W`8`lr&$GJgn+%RY$Wvt6MZX7g1 zTH`3-wGO?bz`>2EDq+wOhS!~B22H$PSqg0oj)HVO2leJV>IQ*I7Uk*DbbJN0sNgH}Lx9zN733=DSGfu@&Y$6{p{9zj=WAe$TKKKWj#YP(9jxXX zR;>+pL|(ZS>T6EWw#LtK5&bq=((!h$!|NP(K$>~f?mc2tOU!QPuvb!dxKBt-uT%75 zGq3%y&jn2-!?4Fa;v5~CfVPrGaCm)!dl)b!8#Ito!|&#Ch|W{qY+V2_iC0}z&tNTt zZPX;~O{XbpjFC%~ae$sl_nuVo4F1iiAp8%2OaKV5ASr3_TvtFCLC^wIbk1n zNqW&f;uvyy?7U4aAn{4`h@_FHBI5SI#BSfe<%nn8vBh`T=l1%xY`g2gmfnL0w)EM( zI|1yGEskgGuAvdZKJGZ?5sq#Vh2bqeuhS)No;V>H5cH4?PbkApfuGn0QINV-N^z#p z(Qs?Dd``bLMbT_y?6Jin%lS2D*F@-KQA0@cg&GPk8;XK`U$B|fZBYl^&y;OZzr)_I z6-ptq0tIl^_`A6uqVp6UJQDCE9Djg*h7z~{q_9L_GpPY!9i&bJC2Fi2r3Vzq14o}m zh416xo`MEskFUJ{qP*`{X5YH@;=4;{e!3jJu=KO%mqO>SU5Jt-vhj+L{i6viuQ@_V5msrIK$FOt6>ENwScN)KZg5B%% zxPhv;C+setqf-z(f=xxXQQ`nx`mjXAb*ojdz(*<@b$Ewlm~haZ@yvnNLS+T~#Jvy& zsZ=RdToGo^)&}>aG?cDzx-FrtjckZM`ANgQ3+BpjbHdyh*EL?&7f$a<=+`V1R)x1G z3LE3*rns)@im4)A`B2jIa9sEBO#rv@P?0{#q5cOXa)&A(@slC%?c!Qu9bz=w_lOCRzawP__aAeDgUD&WCNYPaW@q!Ycu2*LqmK8ldl zP+*gZTq=;q%^+U`c5TG2dNL6r6ZF|hNL;!(fDyIO45_$ktn@8*tcM=2@fXB&B5*kOT)ZnUBa?1zJB{`Y0|PI*ndky z6!r6}mYJLq$z_50^+=vOzc#B9dkJhj@Z8q5S(SV= z+u)AW#Z%-}%;qve%Z}wd#-PU07qiF8DEufE6TgKv)ZdO7t-S<%M8^=yTkBSj#F8{J7ZzHuUm-*iGbVaK6Xbp46QKS*h~+ylDuF1Y{mJ~& zDB7QK45cS6bVL20VVfr)0!6NvV)dblAN0>Nl?kSDo~cVPbx|hfjWcy~O#2mmLFhSzJyyS{GoI0()=#ewZ;d<_d;C+~1E}e>e}T2vF}f|H#QeLejtcefD%g&C zt)v|Xap4>m^W#+mi19xiuM}oM4*${^0K;a(D}ZgUsfZ|7zf%=gX^r|~BmhqZ1pvUlxR)v5%z7IfLVW!CQa9P`dAC#5zzpx4=|{9*iEwD2vWQ2V}d1oe=li^syO(8jCWyV@~f8oRM9|_Enrz#`knvVIRP8 z!cRN}5yAJ;im>k7j`@=2L`m~SK6c-Xdwz3AVsl4))6U;+inlzPD0wv4drL=^tvP?_ z?4ff{&s#PnESr*+Eej=8*I9L?CeB!HacKXz!n>mD=k#|YWpvRodzv)j#f_22qJJ^# z`&8G7w9!Q)adgp09bNQo(Fim7T|?DQR{b%{?lfs7gWcuI5aM4*TsM7Py(d~c{^prlJIjfp7GSs9N%GrRHXGUp1^e0v)z#D*G zH2fwt$58Qk7pe#aF2#Phd^))N>glxN^YTeq7F_%A;_?eW2eoLo-D~ecyP7m7(Mknh zapXXd`C$D?*5%UFsio-|Wq+BtL8DnY1&QrLQ&T1`6c@KSY+&8x0ndOZ_6hJlKr<0cN)%VeCGD`sdYAt`;#U zQ)5WIpfjA|PIJ@k5q+X$!Vv8FC!j2=u_ws^NIS!6!}P9CbrmT+ zfVpL&IOCa`(>2q4QePe13B#rHqe=Z5l3&;S#^8m)&n%CGhhwd?8i2z=raS&+Crv2C1~mNlog3Euyz-RRGq)DOd{^;COwOwQyXiSBzSk9kmEXEh3{9>nZQr4Ail8nN;o@DCR8}%Q8K|Ph0}?Kz(CX z2$b;Uup1S*d3??J*Lhnq-E+cgvNTZo6_6`eLdIQI?z4izHfP}GzI=6neT8~0>)TQq zNn`oi*96Qf+n-RSMHYAL4nOzQP1*ceZ5B=K&6Io5m?fc!5-L`Vo=0`&t7fta!9i-u zeJtZeul4;3fDo}KTR}<*1+r=R7tbvR-cfx9B3WAr+BkM9Cv+&n~ z7d9<-tX_{5%_nrirh9`Mo~XZR0VjeG0|_c1b>@UCsK}_rN*u~9MT^gIyMzH!HS8G& zs}A&F_g5>?ewJw5erEHq_>4^jVj0D*fQ%HXj=RL;k`Au~CQcl8fnQkTaF06O4oN#8 ziw)?(iANkDuF>fvpdKvg_S@Y@;W|O|Z_4zvx;7?EsH8n+7u=A@ z*#bf@v=g4h2yIdUty4)0PDvhalZdgKvkrv&u+jjhlT|@9&=NLBI6!L1jK3p*_9DRD zlV`|*E0D62-8xhg*Pjso(kBUzBTzv?TSLK~K@@ zpF95juTzXy0 zlxl*#DI>M^t`(XYm)6WKmp4Xw66Kp0t7^lp=>3T`+mcoHg2Ats7KIvP8cv^*5-Unr^xKbt7p1Q#-lk?of8;t;-DT>3GU_06iDSD< zw3o`*?mF$IYAu#Dtm#p!|5-&tEFdkeu$K)`KG2m|`ie)}`ItX}ls000A06R4Ar8GbSpD?Ji>bw@V22?{T zpMMIJH%5L=kW;h4R@I=ynMsRZo_$qmS8rpesY9UDX6@yCc5y(R={>0pXtS0s4VJFP z_W}SX(r_vC%F_Gq$!6~Ii$4Ot`fd2D7hhj`<9S)Yuk3(4i9o*77cbkg zWRN@TS6OJP$DQsjkZQ<6k9fco1ve>}q##dhjDVNMlBSRN8YPxokxC3oJFrfPP+^cq z5-BJg#I|aeXIN4@L3*R`5FUpJnj%&awk_DPkul3y(X@RQi&!iYWj8;%mF~1(g<}W= zqib0IONii7s*L(N(-2u{S+MtVvGoSUvF4EGa$)iGy=S@e=CkIAZH&7nOD#tNDK7b+VUs;mo@)vuSmR2Jc)U2*H(iON=Fz@3o|iJDExip|&c zY`y6}DYn3TT@AfcdKd=i9=KD@*_yB`S=0jj$zTDV^yQ7U#2`h-`x$O$1@kfk(F$5a z{|}+HJ%IO0<5OuQ=Dby)0;h2ml*)7pZUnM82&Mof8WlAPd_KV{u6?sOk5|X}9)Ac* zDB0^+Fj(0f^qdkNKq&cCTP?5S^%uDr^ua+;%pSQ62{}`hHxN^CIyVAuRP@NLw;*q+ zDOZ|*Ho&t$tx7_9;*P1e-QlRKR;IzM}x0 zLpHH|`rOhBvHx$=!SCQ6{1$@0Zs760zK;C|t(&YJy}j1HeY-n*2dumH^*!3LzXvXz z{QS2tiwFIdmDUaK6yY}WM8Cfp%Hg4alULe4=~{^y6`cHS_2zp<4!j9Bphp~b-XXw^ zA-wK#IY!3eLBR;T7V=_BFCOt6lg*S-I7rytP2WQBSMBT^IADFWbAQ*q{e7ML2dp~} zTKhT=?C;st=I6hK32Y()%CBy}`!2t#%?gQDNLZV1(>9x4VkZO?GiVVB9*mIV5TQg2 zAGrPFkB{4r3umBMVtwFe#itYKQe7pEWC6q-hSwVMs~$;C;}Z|Zoo^)y$hbopUH<|A zgeR7ih<9$F>zgQ+3vHfbDi;}joNJk5HbNq_{$&3j4P}eEg1BkzoNgUx`FRZisBW3l zZIr87ZmM}f2MT8KdG0K?P*lE9RJKrDcFUkKXl_v&Rtq|zQe_9|A@`iFmW)E2uTU^L zWx8x@d8g{ls?RF-MjB({tSh;F&&!SR%Do{IM3>8&Zcu8_GeM27tPXd*R5f3@AyK&@ zS-BA;;cC$r+OudZntnpINFux9@D@ZBZHo=ZH?++*%nn{s$J=`^b;OJNKQ-=4X%Q^k z>cD=iKh^)`4Kr2MoA$@Ha=Wd}$2%B^CD!G52JA^rzPcKM7cRuI#g{ZIJhNHhcmy^Y zUvdP&2L!JcAt#62>!rL3rCO;_g1}4*A@CdiU*9&d8PNis;5?)AppL>-srEHB@vt#D z6*HeFH+RA5&0fia()u-GA)^!rXa(fr%r4}G1%UwK zowgZ#Xv`APaqaEj;H%B^f4lV2dHJ0mDJIcMHxoHg@C1>jSKoVMDKtY`p#aJqbEd&^ zaR|C&JQ!BshBDMYgd)79Cc1oVeioo;NHly`#lapjnMIPJ&qJ0!<f2%3ZXA6YA&CMz|jv>WY) ztbWT<0Nxqj1UyY8E+647XuJ(@PkadypaugcD2{6@7c6^29B>wRe^CB|zImoR!IVds zD7-u<2g?&GiYm_=&l=B}<_p&+3fD&ql7;sMdlttfsIj1Qz(busr} zS}m&~k6~84P{$mRj9C*CH(Q6NtKmpF-RZ)bJ`6Wo^csFb5;F2dnS_n8bRWULZ;|Cq)J#WGJ!&E-jVQ zn`Ut+`lk%Sau6_Ly1Jo+o=zd$R&;`>Fkf+N$1Ku9F zdAlgD7nusa3H2UvIS;4bCV5Ywvsge>6Hq(}crPh$3f?A=x8p)BrnX{)msv0zJSHWN zktCL=j^v>RF~Mam2|2lJMPI-a@({97Z&LExO7wBw*0lqQ`CN`2!mlBVm@U2uG5DEi z`uCK9{F{DHmHdIK{R36;Z&X8qYJfC2)ac^yK#GD-+=N57M;pl})`TOsr)rAnE;`&479*Wuf1+wbio$}}u>76n zZ<2hfxsEnPo1$WDcl5-YTT&EcXAghm`nBsOCR01!#c$MHs7X-UwzF5eSTM;?!GiRzXV1^HR%0iR3E JE&Pz9{2ys?;=KR> literal 0 HcmV?d00001 diff --git a/eval_framework/src/data_loader.py b/eval_framework/src/data_loader.py new file mode 100644 index 0000000..9a97abe --- /dev/null +++ b/eval_framework/src/data_loader.py @@ -0,0 +1,81 @@ +import json +import logging +from typing import List, Dict, Any + +logger = logging.getLogger(__name__) + +class DataLoader: + """数据加载器,负责加载和验证数据""" + + @staticmethod + def load_json_data(filepath: str) -> List[Dict[str, Any]]: + """ + 从JSON文件加载数据 + + Args: + filepath: JSON文件路径 + + Returns: + 加载的数据列表 + + Raises: + FileNotFoundError: 文件不存在 + json.JSONDecodeError: JSON格式错误 + """ + try: + with open(filepath, 'r', encoding='utf-8') as file: + data = json.load(file) + logger.info(f"Successfully loaded {len(data)} items from {filepath}") + return data + except FileNotFoundError: + logger.error(f"File not found: {filepath}") + raise + except json.JSONDecodeError as e: + logger.error(f"JSON decode error in {filepath}: {e}") + raise + + @staticmethod + def validate_data_item(item: Dict[str, Any]) -> bool: + """ + 验证数据项是否包含必要字段 + + Args: + item: 数据项 + + Returns: + 是否有效 + """ + required_fields = ['question', 'choices', 'answer', 'prompt'] + for field in required_fields: + if field not in item: + logger.warning(f"Missing required field: {field}") + return False + + if 'text' not in item['choices'] or 'label' not in item['choices']: + logger.warning("Missing 'text' or 'label' in choices") + return False + + return True + + @classmethod + def load_and_validate_data(cls, filepath: str) -> List[Dict[str, Any]]: + """ + 加载并验证数据 + + Args: + filepath: JSON文件路径 + + Returns: + 验证后的数据列表 + """ + data = cls.load_json_data(filepath) + valid_data = [] + + for i, item in enumerate(data): + if cls.validate_data_item(item): + valid_data.append(item) + else: + logger.warning(f"Invalid data item at index {i}, skipping") + + logger.info(f"Validated {len(valid_data)} out of {len(data)} items") + return valid_data diff --git a/eval_framework/src/evaluator.py b/eval_framework/src/evaluator.py new file mode 100644 index 0000000..2d4254b --- /dev/null +++ b/eval_framework/src/evaluator.py @@ -0,0 +1,98 @@ +import logging +import concurrent.futures +from typing import List, Dict, Any, Tuple +from tqdm import tqdm + +from .llm_client import LLMClient +from .metrics import MetricsCalculator + +logger = logging.getLogger(__name__) + +class Evaluator: + """评估器,协调整个评估流程""" + + def __init__(self, llm_client: LLMClient, system_prompt: str): + """ + 初始化评估器 + + Args: + llm_client: LLM客户端 + system_prompt: 系统提示词 + """ + self.llm_client = llm_client + self.system_prompt = system_prompt + self.metrics_calculator = MetricsCalculator() + + def process_item(self, item: Dict[str, Any], index: int) -> Dict[str, Any]: + """ + 处理单个数据项 + + Args: + item: 数据项 + index: 数据项索引 + + Returns: + 处理结果 + """ + question = item['question'] + text = item['choices']['text'] + label = item['choices']['label'] + prompt = item['prompt'] + expected_answer = item['answer'].strip() + + # 格式化选择项 + formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)]) + user_input = f"{question} {formatted_choices}. {prompt}" + + # 获取LLM响应 + llm_answer = self.llm_client.get_response(user_input, self.system_prompt) + + return { + 'index': index, + 'question': question, + 'choices': item['choices'], + 'answer': expected_answer, + 'llm_answer': llm_answer + } + + def evaluate(self, data: List[Dict[str, Any]], max_workers: int = 5) -> Tuple[Dict[str, float], List[Dict[str, Any]]]: + """ + 评估数据集 + + Args: + data: 数据集 + max_workers: 最大工作线程数 + + Returns: + 评估指标和详细结果 + """ + results = [] + + logger.info(f"Starting evaluation with {max_workers} workers") + + with tqdm(total=len(data), desc="Processing items") as pbar: + with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor: + # 提交所有任务 + future_to_index = { + executor.submit(self.process_item, item, i): i + for i, item in enumerate(data) + } + + # 收集结果 + for future in concurrent.futures.as_completed(future_to_index): + try: + result = future.result() + results.append(result) + pbar.update(1) + except Exception as e: + logger.error(f"Error processing item: {e}") + pbar.update(1) + + # 按索引排序结果 + results.sort(key=lambda x: x['index']) + + # 计算指标 + metrics = self.metrics_calculator.compute_metrics(results) + + logger.info("Evaluation completed successfully") + return metrics, results diff --git a/eval_framework/src/llm_client.py b/eval_framework/src/llm_client.py new file mode 100644 index 0000000..6ce60ef --- /dev/null +++ b/eval_framework/src/llm_client.py @@ -0,0 +1,60 @@ +import logging +import time +from typing import Optional +from openai import OpenAI + +logger = logging.getLogger(__name__) + +class LLMClient: + """LLM客户端,负责与API交互""" + + def __init__(self, api_key: str, base_url: str, model: str, + temperature: float = 0, max_retries: int = 10): + """ + 初始化LLM客户端 + + Args: + api_key: API密钥 + base_url: API基础URL + model: 模型名称 + temperature: 温度参数 + max_retries: 最大重试次数 + """ + self.client = OpenAI(api_key=api_key, base_url=base_url) + self.model = model + self.temperature = temperature + self.max_retries = max_retries + + def get_response(self, user_input: str, system_prompt: str) -> str: + """ + 获取LLM响应 + + Args: + user_input: 用户输入 + system_prompt: 系统提示词 + + Returns: + LLM响应,失败时返回"error!" + """ + retries = 0 + while retries < self.max_retries: + try: + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_input} + ], + temperature=self.temperature + ) + answer = response.choices[0].message.content + return answer + + except Exception as e: + retries += 1 + logger.warning(f"API call failed (Attempt {retries}/{self.max_retries}): {e}") + if retries < self.max_retries: + time.sleep(2 ** retries) # 指数退避 + + logger.error(f"Failed to get response after {self.max_retries} attempts") + return "error!" diff --git a/eval_framework/src/metrics.py b/eval_framework/src/metrics.py new file mode 100644 index 0000000..dcfb93d --- /dev/null +++ b/eval_framework/src/metrics.py @@ -0,0 +1,111 @@ +import re +import numpy as np +from typing import List, Dict, Any, Optional +from sklearn.metrics import precision_score, recall_score, f1_score +import logging + +logger = logging.getLogger(__name__) + +class MetricsCalculator: + """评估指标计算器""" + + @staticmethod + def extract_answer(answer_string: str) -> Optional[str]: + """ + 从回答字符串中提取答案 + + Args: + answer_string: 包含答案的字符串 + + Returns: + 提取的答案,如果没有找到返回None + """ + if not answer_string: + return None + + match = re.search(r'\[ANSWER\](.*?)\[/ANSWER\]', answer_string) + if match: + return match.group(1).strip() + return None + + @staticmethod + def parse_answer(answer: Optional[str]) -> List[str]: + """ + 解析答案为列表 + + Args: + answer: 答案字符串 + + Returns: + 答案列表 + """ + if answer is None: + return [] + return [a.strip() for a in answer.split(',')] + + @classmethod + def compute_metrics(cls, data: List[Dict[str, Any]]) -> Dict[str, float]: + """ + 计算评估指标 + + Args: + data: 包含真实答案和预测答案的数据 + + Returns: + 各种评估指标的字典 + """ + true_answers = [] + pred_answers = [] + + # 提取和解析答案 + for item in data: + true_ans = cls.extract_answer(item["answer"]) + pred_ans = cls.extract_answer(item["llm_answer"]) + + true_answers.append(cls.parse_answer(true_ans)) + pred_answers.append(cls.parse_answer(pred_ans)) + + # 计算准确率 + correct_counts = [] + for true_ans, pred_ans in zip(true_answers, pred_answers): + if true_ans and pred_ans and set(true_ans) == set(pred_ans): + correct_counts.append(1) + else: + correct_counts.append(0) + + accuracy = np.mean(correct_counts) + + # 构建多标签向量 + all_labels = set() + for item in data: + choices = item["choices"]["label"] + for label in choices: + all_labels.add(label) + + all_labels = sorted(list(all_labels)) + + y_true_multi = [] + y_pred_multi = [] + + for true_ans, pred_ans in zip(true_answers, pred_answers): + true_vector = [1 if label in (true_ans or []) else 0 for label in all_labels] + pred_vector = [1 if label in (pred_ans or []) else 0 for label in all_labels] + y_true_multi.append(true_vector) + y_pred_multi.append(pred_vector) + + y_true_multi = np.array(y_true_multi) + y_pred_multi = np.array(y_pred_multi) + + # 计算各种指标 + metrics = { + "accuracy": accuracy, + "precision_micro": precision_score(y_true_multi, y_pred_multi, average='micro', zero_division=0), + "recall_micro": recall_score(y_true_multi, y_pred_multi, average='micro', zero_division=0), + "f1_micro": f1_score(y_true_multi, y_pred_multi, average='micro', zero_division=0), + "precision_macro": precision_score(y_true_multi, y_pred_multi, average='macro', zero_division=0), + "recall_macro": recall_score(y_true_multi, y_pred_multi, average='macro', zero_division=0), + "f1_macro": f1_score(y_true_multi, y_pred_multi, average='macro', zero_division=0) + } + + logger.info("Metrics computed successfully") + return metrics diff --git a/eval_framework/src/utils.py b/eval_framework/src/utils.py new file mode 100644 index 0000000..652f4a2 --- /dev/null +++ b/eval_framework/src/utils.py @@ -0,0 +1,360 @@ +import json +import yaml +import logging +import pandas as pd +from datetime import datetime +from pathlib import Path +from typing import Dict, Any, List +from tabulate import tabulate + +def load_config(config_path: str) -> Dict[str, Any]: + """ + 加载配置文件 + + Args: + config_path: 配置文件路径 + + Returns: + 配置字典 + """ + with open(config_path, 'r', encoding='utf-8') as f: + config = yaml.safe_load(f) + return config + +def get_models_from_config(config: Dict[str, Any]) -> List[str]: + """ + 从配置中获取模型列表 + + Args: + config: 配置字典 + + Returns: + 模型名称列表 + """ + api_config = config['api'] + + # 优先使用models列表 + if 'models' in api_config and api_config['models']: + return api_config['models'] + # 向后兼容:如果没有models,使用单个model + elif 'model' in api_config: + return [api_config['model']] + else: + raise ValueError("No models specified in configuration") + +def generate_output_dir(config: Dict[str, Any]) -> str: + """ + 生成输出目录路径 + + Args: + config: 配置字典 + + Returns: + 输出目录路径 + """ + output_config = config['evaluation']['output'] + base_dir = output_config['base_dir'] + auto_timestamp = output_config.get('auto_timestamp', True) + + # 创建基础目录 + base_path = Path(base_dir) + + if auto_timestamp: + # 创建时间戳文件夹 (年月日时分) + timestamp = datetime.now().strftime("%Y%m%d_%H%M") + output_dir = base_path / timestamp + else: + output_dir = base_path + + # 确保目录存在 + output_dir.mkdir(parents=True, exist_ok=True) + + return str(output_dir) + +def generate_model_output_path(output_dir: str, model_name: str, filename_template: str) -> str: + """ + 为特定模型生成输出文件路径 + + Args: + output_dir: 输出目录 + model_name: 模型名称 + filename_template: 文件名模板 + + Returns: + 完整的输出文件路径 + """ + # 处理模型名中的特殊字符 + safe_model_name = model_name.replace('/', '_').replace(':', '_') + filename = filename_template.format(model=safe_model_name) + return str(Path(output_dir) / filename) + +def save_results(results: list, filepath: str) -> None: + """ + 保存结果到JSON文件 + + Args: + results: 结果列表 + filepath: 保存路径 + """ + # 确保目录存在 + Path(filepath).parent.mkdir(parents=True, exist_ok=True) + + with open(filepath, 'w', encoding='utf-8') as f: + json.dump(results, f, indent=2, ensure_ascii=False) + +def save_metrics(metrics: Dict[str, float], filepath: str) -> None: + """ + 保存评估指标到JSON文件 + + Args: + metrics: 指标字典 + filepath: 保存路径 + """ + # 生成指标文件路径(在同一目录下) + metrics_path = Path(filepath).parent / f"{Path(filepath).stem}_metrics.json" + + # 添加时间戳和其他元信息 + metrics_with_meta = { + "timestamp": datetime.now().isoformat(), + "metrics": metrics + } + + with open(metrics_path, 'w', encoding='utf-8') as f: + json.dump(metrics_with_meta, f, indent=2, ensure_ascii=False) + +def create_results_dataframe(all_results: Dict[str, Dict]) -> pd.DataFrame: + """ + 将所有模型的结果转换为DataFrame + + Args: + all_results: 所有模型的结果字典 + + Returns: + 包含所有模型指标的DataFrame + """ + if not all_results: + return pd.DataFrame() + + # 收集所有模型的指标数据 + data = [] + for model_name, model_result in all_results.items(): + row = {"Model": model_name} + row.update(model_result["metrics"]) + row["Data Count"] = len(model_result["results"]) + data.append(row) + + # 创建DataFrame + df = pd.DataFrame(data) + + # 将Model列设为索引 + df = df.set_index("Model") + + # 对列进行排序(将Data Count放在最后) + metric_columns = [col for col in df.columns if col != "Data Count"] + df = df[metric_columns + ["Data Count"]] + + return df + +def save_summary(all_results: Dict[str, Dict], output_dir: str, summary_filename: str) -> None: + """ + 保存所有模型的汇总结果 + + Args: + all_results: 所有模型的结果字典 + output_dir: 输出目录 + summary_filename: 汇总文件名 + """ + output_path = Path(output_dir) + + # 创建DataFrame + df = create_results_dataframe(all_results) + + if df.empty: + logging.warning("No results to save in summary") + return + + # 保存JSON格式的详细汇总 + summary_path = output_path / summary_filename + summary_data = { + "timestamp": datetime.now().isoformat(), + "models_count": len(all_results), + "models": {} + } + + for model_name, model_result in all_results.items(): + summary_data["models"][model_name] = { + "metrics": model_result["metrics"], + "data_count": len(model_result["results"]) + } + + # 添加模型对比表 + if len(all_results) > 1: + comparison = {} + metric_names = [col for col in df.columns if col != "Data Count"] + + for metric in metric_names: + comparison[metric] = df[metric].to_dict() + + summary_data["comparison"] = comparison + + with open(summary_path, 'w', encoding='utf-8') as f: + json.dump(summary_data, f, indent=2, ensure_ascii=False) + + # 保存CSV格式的汇总表格 + csv_filename = summary_filename.replace('.json', '.csv') + csv_path = output_path / csv_filename + + # 重置索引以便模型名称也作为列保存 + df_for_csv = df.reset_index() + df_for_csv.to_csv(csv_path, index=False, encoding='utf-8') + + # 保存Excel格式(如果需要) + excel_filename = summary_filename.replace('.json', '.xlsx') + excel_path = output_path / excel_filename + + try: + # 创建Excel文件,包含多个工作表 + with pd.ExcelWriter(excel_path, engine='openpyxl') as writer: + # 主要结果表 + df_for_csv.to_excel(writer, sheet_name='Summary', index=False) + + # 如果有多个模型,创建排名表 + if len(all_results) > 1: + ranking_df = create_ranking_dataframe(df) + ranking_df.to_excel(writer, sheet_name='Rankings', index=False) + + except ImportError: + logging.warning("openpyxl not installed, skipping Excel export") + + logging.info(f"Summary saved to {summary_path}") + logging.info(f"CSV summary saved to {csv_path}") + +def create_ranking_dataframe(df: pd.DataFrame) -> pd.DataFrame: + """ + 创建模型排名DataFrame + + Args: + df: 原始结果DataFrame + + Returns: + 包含排名的DataFrame + """ + # 排除非指标列 + metric_columns = [col for col in df.columns if col != "Data Count"] + + # 为每个指标创建排名(假设数值越大越好,可以根据需要调整) + ranking_data = [] + + for metric in metric_columns: + # 创建排名(降序,数值越大排名越前) + ranks = df[metric].rank(method='min', ascending=False) + + for model_name in df.index: + ranking_data.append({ + 'Model': model_name, + 'Metric': metric, + 'Value': df.loc[model_name, metric], + 'Rank': int(ranks[model_name]) + }) + + ranking_df = pd.DataFrame(ranking_data) + return ranking_df + +def print_summary(all_results: Dict[str, Dict]) -> None: + """ + 打印所有模型的汇总结果 + + Args: + all_results: 所有模型的结果字典 + """ + print("\n" + "="*100) + print("SUMMARY - ALL MODELS COMPARISON") + print("="*100) + + if not all_results: + print("No results to display") + return + + # 创建DataFrame + df = create_results_dataframe(all_results) + + if df.empty: + print("No valid results to display") + return + + # 使用tabulate打印美观的表格 + print(tabulate( + df, + headers=df.columns, + tablefmt='grid', + floatfmt='.4f', + showindex=True + )) + + # 如果有多个模型,显示最佳模型 + if len(all_results) > 1: + print("\n" + "-"*100) + print("BEST PERFORMERS BY METRIC:") + print("-"*100) + + metric_columns = [col for col in df.columns if col != "Data Count"] + + for metric in metric_columns: + best_model = df[metric].idxmax() + best_value = df.loc[best_model, metric] + print(f"{metric.upper():<20}: {best_model:<30} ({best_value:.4f})") + + print("="*100) + +def setup_logging(level: str = "INFO", format_str: str = None, log_dir: str = "logs") -> None: + """ + 设置日志配置 + + Args: + level: 日志级别 + format_str: 日志格式 + log_dir: 日志目录 + """ + if format_str is None: + format_str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s" + + # 创建日志目录 + Path(log_dir).mkdir(parents=True, exist_ok=True) + + # 生成日志文件名(包含时间戳) + timestamp = datetime.now().strftime("%Y%m%d_%H%M") + log_file = Path(log_dir) / f"evaluation_{timestamp}.log" + + logging.basicConfig( + level=getattr(logging, level.upper()), + format=format_str, + handlers=[ + logging.StreamHandler(), + logging.FileHandler(log_file, encoding='utf-8') + ] + ) + +def print_metrics(metrics: Dict[str, float], model_name: str = None) -> None: + """ + 打印评估指标 + + Args: + metrics: 指标字典 + model_name: 模型名称 + """ + title = f"EVALUATION RESULTS - {model_name}" if model_name else "EVALUATION RESULTS" + print("\n" + "="*60) + print(title) + print("="*60) + + # 创建单行DataFrame用于美观显示 + df = pd.DataFrame([metrics]) + print(tabulate( + df, + headers=df.columns, + tablefmt='grid', + floatfmt='.4f', + showindex=False + )) + + print("="*60) diff --git a/layer1/ALL-merge/eval.py b/layer1/ALL-merge/eval.py index f914e59..e69de29 100644 --- a/layer1/ALL-merge/eval.py +++ b/layer1/ALL-merge/eval.py @@ -1,166 +0,0 @@ -import json -import threading -from tqdm import tqdm -import concurrent.futures -from openai import OpenAI -import numpy as np -from sklearn.metrics import precision_score, recall_score, f1_score -import re - -client = OpenAI( - api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d", - base_url="https://vip.apiyi.com/v1" -) - -thread_lock = threading.Lock() - -def load_json_data(filepath): - with open(filepath, 'r') as file: - data = json.load(file) - return data - -def get_response(input,max_retries=10): - retries = 0 - while retries