Files
MatBench/results/20250602_1706/summary.json

98 lines
3.0 KiB
JSON

{
"timestamp": "2025-06-02T17:15:23.737185",
"models_count": 4,
"models": {
"qwen-max-2025-01-25": {
"metrics": {
"accuracy": 0.6446700507614214,
"precision_micro": 0.6336633663366337,
"recall_micro": 0.649746192893401,
"f1_micro": 0.6416040100250626,
"precision_macro": 0.6388760049474336,
"recall_macro": 0.6501020408163265,
"f1_macro": 0.64232342205538
},
"data_count": 197
},
"gpt-4o": {
"metrics": {
"accuracy": 0.5482233502538071,
"precision_micro": 0.5618556701030928,
"recall_micro": 0.5532994923857868,
"f1_micro": 0.5575447570332481,
"precision_macro": 0.5779088050314465,
"recall_macro": 0.5536734693877551,
"f1_macro": 0.5600088997453159
},
"data_count": 197
},
"deepseek-chat": {
"metrics": {
"accuracy": 0.6700507614213198,
"precision_micro": 0.676923076923077,
"recall_micro": 0.6700507614213198,
"f1_micro": 0.673469387755102,
"precision_macro": 0.6899114693446089,
"recall_macro": 0.6705102040816326,
"f1_macro": 0.6754210676562946
},
"data_count": 197
},
"claude-sonnet-4-20250514": {
"metrics": {
"accuracy": 0.700507614213198,
"precision_micro": 0.6934673366834171,
"recall_micro": 0.700507614213198,
"f1_micro": 0.696969696969697,
"precision_macro": 0.7072180484244438,
"recall_macro": 0.7009183673469388,
"f1_macro": 0.69833034513671
},
"data_count": 197
}
},
"comparison": {
"accuracy": {
"qwen-max-2025-01-25": 0.6446700507614214,
"gpt-4o": 0.5482233502538071,
"deepseek-chat": 0.6700507614213198,
"claude-sonnet-4-20250514": 0.700507614213198
},
"precision_micro": {
"qwen-max-2025-01-25": 0.6336633663366337,
"gpt-4o": 0.5618556701030928,
"deepseek-chat": 0.676923076923077,
"claude-sonnet-4-20250514": 0.6934673366834171
},
"recall_micro": {
"qwen-max-2025-01-25": 0.649746192893401,
"gpt-4o": 0.5532994923857868,
"deepseek-chat": 0.6700507614213198,
"claude-sonnet-4-20250514": 0.700507614213198
},
"f1_micro": {
"qwen-max-2025-01-25": 0.6416040100250626,
"gpt-4o": 0.5575447570332481,
"deepseek-chat": 0.673469387755102,
"claude-sonnet-4-20250514": 0.696969696969697
},
"precision_macro": {
"qwen-max-2025-01-25": 0.6388760049474336,
"gpt-4o": 0.5779088050314465,
"deepseek-chat": 0.6899114693446089,
"claude-sonnet-4-20250514": 0.7072180484244438
},
"recall_macro": {
"qwen-max-2025-01-25": 0.6501020408163265,
"gpt-4o": 0.5536734693877551,
"deepseek-chat": 0.6705102040816326,
"claude-sonnet-4-20250514": 0.7009183673469388
},
"f1_macro": {
"qwen-max-2025-01-25": 0.64232342205538,
"gpt-4o": 0.5600088997453159,
"deepseek-chat": 0.6754210676562946,
"claude-sonnet-4-20250514": 0.69833034513671
}
}
}