diff --git a/logs/evaluation_20250528_1530.log b/logs/evaluation_20250528_1530.log deleted file mode 100644 index 33b6c49..0000000 --- a/logs/evaluation_20250528_1530.log +++ /dev/null @@ -1,40 +0,0 @@ -2025-05-28 15:30:36,536 - __main__ - INFO - Starting multi-model evaluation framework -2025-05-28 15:30:36,536 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o'] -2025-05-28 15:30:36,543 - __main__ - INFO - Output directory: results/20250528_1530 -2025-05-28 15:30:36,543 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:30:36,568 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:30:36,569 - src.data_loader - INFO - Validated 3023 out of 3023 items -2025-05-28 15:30:36,569 - __main__ - INFO - Loaded 3023 valid data items -2025-05-28 15:30:36,569 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25 -2025-05-28 15:30:36,569 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25 -2025-05-28 15:30:36,595 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:30:38,447 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,461 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,485 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,499 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,549 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,613 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:38,630 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:39,998 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:40,267 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:40,287 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:30:40,288 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:30:40,302 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1530/qwen-max-2025-01-25.json -2025-05-28 15:30:40,302 - __main__ - INFO - Evaluating model 2/2: gpt-4o -2025-05-28 15:30:40,302 - __main__ - INFO - Starting evaluation for model: gpt-4o -2025-05-28 15:30:40,352 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:30:41,778 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:41,794 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:41,826 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,016 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,026 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,040 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,041 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,076 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,295 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,313 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:30:42,323 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:30:42,323 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:30:42,333 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1530/gpt-4o.json -2025-05-28 15:30:42,333 - __main__ - ERROR - Evaluation failed: 'summary_filename' diff --git a/logs/evaluation_20250528_1531.log b/logs/evaluation_20250528_1531.log deleted file mode 100644 index 2e34b71..0000000 --- a/logs/evaluation_20250528_1531.log +++ /dev/null @@ -1,41 +0,0 @@ -2025-05-28 15:31:25,896 - __main__ - INFO - Starting multi-model evaluation framework -2025-05-28 15:31:25,896 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o'] -2025-05-28 15:31:25,899 - __main__ - INFO - Output directory: results/20250528_1531 -2025-05-28 15:31:25,899 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:31:25,925 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:31:25,927 - src.data_loader - INFO - Validated 3023 out of 3023 items -2025-05-28 15:31:25,927 - __main__ - INFO - Loaded 3023 valid data items -2025-05-28 15:31:25,927 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25 -2025-05-28 15:31:25,927 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25 -2025-05-28 15:31:25,952 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:31:28,342 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,434 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,444 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,459 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,538 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:28,703 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:30,085 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:30,353 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:30,374 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:31:30,374 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:31:30,387 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1531/qwen-max-2025-01-25.json -2025-05-28 15:31:30,387 - __main__ - INFO - Evaluating model 2/2: gpt-4o -2025-05-28 15:31:30,387 - __main__ - INFO - Starting evaluation for model: gpt-4o -2025-05-28 15:31:30,436 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:31:31,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:31,886 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,119 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,139 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,140 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,162 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,449 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:32,539 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:38,330 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:31:38,351 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:31:38,351 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:31:38,366 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1531/gpt-4o.json -2025-05-28 15:31:38,372 - __main__ - INFO - Summary saved to results/20250528_1531/summary.json -2025-05-28 15:31:38,372 - __main__ - INFO - Multi-model evaluation completed successfully diff --git a/logs/evaluation_20250528_1535.log b/logs/evaluation_20250528_1535.log deleted file mode 100644 index 54d651d..0000000 --- a/logs/evaluation_20250528_1535.log +++ /dev/null @@ -1,44 +0,0 @@ -2025-05-28 15:35:59,778 - __main__ - INFO - Starting multi-model evaluation framework -2025-05-28 15:35:59,779 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o'] -2025-05-28 15:35:59,782 - __main__ - INFO - Output directory: results/20250528_1535 -2025-05-28 15:35:59,782 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:35:59,808 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json -2025-05-28 15:35:59,809 - src.data_loader - INFO - Validated 3023 out of 3023 items -2025-05-28 15:35:59,809 - __main__ - INFO - Loaded 3023 valid data items -2025-05-28 15:35:59,809 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25 -2025-05-28 15:35:59,809 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25 -2025-05-28 15:35:59,835 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:36:01,694 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,780 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,809 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,853 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,876 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:01,910 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:02,847 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:02,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:03,432 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:03,454 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:36:03,454 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:36:03,477 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1535/qwen-max-2025-01-25.json -2025-05-28 15:36:03,480 - __main__ - INFO - Evaluating model 2/2: gpt-4o -2025-05-28 15:36:03,481 - __main__ - INFO - Starting evaluation for model: gpt-4o -2025-05-28 15:36:03,534 - src.evaluator - INFO - Starting evaluation with 8 workers -2025-05-28 15:36:04,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,895 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,930 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:05,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:05,495 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK" -2025-05-28 15:36:05,514 - src.metrics - INFO - Metrics computed successfully -2025-05-28 15:36:05,515 - src.evaluator - INFO - Evaluation completed successfully -2025-05-28 15:36:05,532 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1535/gpt-4o.json -2025-05-28 15:36:05,564 - root - WARNING - openpyxl not installed, skipping Excel export -2025-05-28 15:36:05,564 - root - INFO - Summary saved to results/20250528_1535/summary.json -2025-05-28 15:36:05,564 - root - INFO - CSV summary saved to results/20250528_1535/summary.csv -2025-05-28 15:36:05,568 - __main__ - INFO - Summary saved to results/20250528_1535/summary.json -2025-05-28 15:36:05,568 - __main__ - INFO - Multi-model evaluation completed successfully diff --git a/results/20250528_1530/gpt-4o.json b/results/20250528_1530/gpt-4o.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1530/gpt-4o.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1530/gpt-4o_metrics.json b/results/20250528_1530/gpt-4o_metrics.json deleted file mode 100644 index 8d6d171..0000000 --- a/results/20250528_1530/gpt-4o_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:30:42.329641", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1530/qwen-max-2025-01-25.json b/results/20250528_1530/qwen-max-2025-01-25.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1530/qwen-max-2025-01-25.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1530/qwen-max-2025-01-25_metrics.json b/results/20250528_1530/qwen-max-2025-01-25_metrics.json deleted file mode 100644 index 00a04d8..0000000 --- a/results/20250528_1530/qwen-max-2025-01-25_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:30:40.296801", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1531/gpt-4o.json b/results/20250528_1531/gpt-4o.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1531/gpt-4o.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1531/gpt-4o_metrics.json b/results/20250528_1531/gpt-4o_metrics.json deleted file mode 100644 index 2d9eadb..0000000 --- a/results/20250528_1531/gpt-4o_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:31:38.361064", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1531/qwen-max-2025-01-25.json b/results/20250528_1531/qwen-max-2025-01-25.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1531/qwen-max-2025-01-25.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1531/qwen-max-2025-01-25_metrics.json b/results/20250528_1531/qwen-max-2025-01-25_metrics.json deleted file mode 100644 index cc49ec9..0000000 --- a/results/20250528_1531/qwen-max-2025-01-25_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:31:30.382105", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1531/summary.json b/results/20250528_1531/summary.json deleted file mode 100644 index c40d636..0000000 --- a/results/20250528_1531/summary.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "timestamp": "2025-05-28T15:31:38.366535", - "models_count": 2, - "models": { - "qwen-max-2025-01-25": { - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - }, - "data_count": 10 - }, - "gpt-4o": { - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - }, - "data_count": 10 - } - }, - "comparison": { - "accuracy": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "precision_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "recall_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "f1_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "precision_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "recall_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "f1_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - } - } -} \ No newline at end of file diff --git a/results/20250528_1535/gpt-4o.json b/results/20250528_1535/gpt-4o.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1535/gpt-4o.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1535/gpt-4o_metrics.json b/results/20250528_1535/gpt-4o_metrics.json deleted file mode 100644 index c21653f..0000000 --- a/results/20250528_1535/gpt-4o_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:36:05.524328", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1535/qwen-max-2025-01-25.json b/results/20250528_1535/qwen-max-2025-01-25.json deleted file mode 100644 index b6d5d9a..0000000 --- a/results/20250528_1535/qwen-max-2025-01-25.json +++ /dev/null @@ -1,202 +0,0 @@ -[ - { - "index": 0, - "question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?", - "choices": { - "text": [ - "the atom", - "the electron", - "the nucleus", - "the proton" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 1, - "question": "Which statement correctly describes a property of a type of matter?", - "choices": { - "text": [ - "Air is a mixture of gases.", - "Ice is a mixture of gases.", - "Air is a liquid.", - "Ice is a liquid." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 2, - "question": "Which statement best explains why a tree branch floats on water?", - "choices": { - "text": [ - "Wood is porous.", - "Wood is buoyant.", - "Wood is light.", - "Wood is magnetic." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 3, - "question": "The best way to separate salt from water is with the use of", - "choices": { - "text": [ - "oil.", - "heat.", - "a magnet.", - "rubbing alcohol." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 4, - "question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?", - "choices": { - "text": [ - "the frequency of the wave", - "the wavelength of the wave", - "the source that created the sound", - "the distance between molecules in the medium" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]D[/ANSWER]", - "llm_answer": "[ANSWER]D[/ANSWER]" - }, - { - "index": 5, - "question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?", - "choices": { - "text": [ - "W is the softest of the four substances tested.", - "W is the hardest of the four substances tested.", - "W can scratch Y.", - "W can scratch X." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]A[/ANSWER]", - "llm_answer": "[ANSWER]A[/ANSWER]" - }, - { - "index": 6, - "question": "When the temperature of a sample of 25 water is -5°C, the water is", - "choices": { - "text": [ - "a gas.", - "a liquid.", - "a solid.", - "a vapor." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 7, - "question": "Which is most useful to a student who is separating aluminum screws from steel screws?", - "choices": { - "text": [ - "a large funnel", - "a screen filter", - "a horseshoe magnet", - "a magnifying glass" - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]C[/ANSWER]", - "llm_answer": "[ANSWER]C[/ANSWER]" - }, - { - "index": 8, - "question": "How are sedimentary rocks made?", - "choices": { - "text": [ - "Magma or lava is cooled.", - "Materials are pressed together.", - "Chemical reactions change minerals.", - "Earthquakes cause small pieces to fall." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - }, - { - "index": 9, - "question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?", - "choices": { - "text": [ - "The ball makes light.", - "The ball reflects light.", - "The ball absorbs light and then releases it.", - "The ball absorbs light and keeps it inside." - ], - "label": [ - "A", - "B", - "C", - "D" - ] - }, - "answer": "[ANSWER]B[/ANSWER]", - "llm_answer": "[ANSWER]B[/ANSWER]" - } -] \ No newline at end of file diff --git a/results/20250528_1535/qwen-max-2025-01-25_metrics.json b/results/20250528_1535/qwen-max-2025-01-25_metrics.json deleted file mode 100644 index f706817..0000000 --- a/results/20250528_1535/qwen-max-2025-01-25_metrics.json +++ /dev/null @@ -1,12 +0,0 @@ -{ - "timestamp": "2025-05-28T15:36:03.466534", - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - } -} \ No newline at end of file diff --git a/results/20250528_1535/summary.csv b/results/20250528_1535/summary.csv deleted file mode 100644 index 661edb4..0000000 --- a/results/20250528_1535/summary.csv +++ /dev/null @@ -1,3 +0,0 @@ -Model,accuracy,precision_micro,recall_micro,f1_micro,precision_macro,recall_macro,f1_macro,Data Count -qwen-max-2025-01-25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 -gpt-4o,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10 diff --git a/results/20250528_1535/summary.json b/results/20250528_1535/summary.json deleted file mode 100644 index 5520f4a..0000000 --- a/results/20250528_1535/summary.json +++ /dev/null @@ -1,60 +0,0 @@ -{ - "timestamp": "2025-05-28T15:36:05.540751", - "models_count": 2, - "models": { - "qwen-max-2025-01-25": { - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - }, - "data_count": 10 - }, - "gpt-4o": { - "metrics": { - "accuracy": 1.0, - "precision_micro": 1.0, - "recall_micro": 1.0, - "f1_micro": 1.0, - "precision_macro": 1.0, - "recall_macro": 1.0, - "f1_macro": 1.0 - }, - "data_count": 10 - } - }, - "comparison": { - "accuracy": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "precision_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "recall_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "f1_micro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "precision_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "recall_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - }, - "f1_macro": { - "qwen-max-2025-01-25": 1.0, - "gpt-4o": 1.0 - } - } -} \ No newline at end of file