Compare commits

...

7 Commits

48 changed files with 309870 additions and 1545 deletions

View File

@@ -2,22 +2,25 @@
api: api:
key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d" key: "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d"
base_url: "https://vip.apiyi.com/v1" base_url: "https://vip.apiyi.com/v1"
temperature: 0 temperature: 0 # 默认使用模型的温度设置
max_retries: 10 max_retries: 10
# 支持多个模型 # 支持多个模型
models: models:
- "qwen-max-2025-01-25" - "qwen-max-2025-01-25"
- "gpt-4o" - "gpt-4o"
- "deepseek-chat"
- "claude-sonnet-4-20250514"
# 或者使用单个模型(向后兼容) # 或者使用单个模型(向后兼容)
# model: "qwen-max-2025-01-25" # model: "qwen-max-2025-01-25"
# 系统提示词 system_prompt: None
system_prompt: "You are an expert in the field of materials science, adept at answering questions related to fundamental aspects of materials science, including material structure, properties, processing, and applications."
# 评估配置 # 评估配置
evaluation: evaluation:
max_workers: 8 max_workers: 20
input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json" # input_file: "/home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json"
# input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions.json"
input_file: "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json"
# 输出配置 # 输出配置
output: output:
base_dir: "results" base_dir: "results"

View File

@@ -144,7 +144,7 @@ def main():
logger.info(f"Evaluating model {i}/{len(models)}: {model_name}") logger.info(f"Evaluating model {i}/{len(models)}: {model_name}")
try: try:
model_result = evaluate_single_model(model_name, data[:10], config, output_dir) model_result = evaluate_single_model(model_name, data, config, output_dir)
all_results[model_name] = model_result all_results[model_name] = model_result
# 打印当前模型的结果 # 打印当前模型的结果

View File

@@ -51,7 +51,7 @@ class Evaluator:
# 格式化选择项 # 格式化选择项
formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)]) formatted_choices = " ".join([f"({lbl}) {txt}" for lbl, txt in zip(label, text)])
user_input = f"{question} {formatted_choices}. {prompt}" user_input = f"{prompt} \n {question} {formatted_choices}"
# 获取LLM响应 # 获取LLM响应
llm_answer = self.llm_client.get_response(user_input, self.system_prompt) llm_answer = self.llm_client.get_response(user_input, self.system_prompt)

View File

@@ -48,14 +48,27 @@ class LLMClient:
retries = 0 retries = 0
while retries < self.max_retries: while retries < self.max_retries:
try: try:
response = self.client.chat.completions.create( if system_prompt == 'None':
model=self.model, messages = [
messages=[ {"role": "user", "content": user_input}
]
else:
messages = [
{"role": "system", "content": system_prompt}, {"role": "system", "content": system_prompt},
{"role": "user", "content": user_input} {"role": "user", "content": user_input}
], ]
temperature=self.temperature
) if self.temperature == -1:
response = self.client.chat.completions.create(
model=self.model,
messages=messages,
)
else:
response = self.client.chat.completions.create(
model=self.model,
messages=messages,
temperature=self.temperature
)
answer = response.choices[0].message.content answer = response.choices[0].message.content
return answer return answer

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,976 @@
import json
import openai
from typing import Dict, Any, List, Tuple, Optional
import time
import logging
from concurrent.futures import ThreadPoolExecutor, as_completed
import threading
from tqdm import tqdm
import random
import re
# Configure root logging: INFO level, timestamped "time - level - message" lines.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
class ChoiceOptionsGenerator:
    """Generate answer options for questions with an LLM and grade their difficulty.

    For each question the generator repeatedly asks the model for a candidate
    (multiple-choice options or a true/false statement), validates the
    candidate's structure, and then asks the same model to answer it.  A
    candidate the model answers incorrectly is kept immediately ("early
    stop"); otherwise sampling continues up to ``max_sampling_attempts``.

    One OpenAI client is created lazily per worker thread.
    """

    # Single-token answers are matched exactly; a substring test on "t"/"f"
    # would match almost any sentence.
    _TRUE_EXACT = frozenset({"t", "√", "对", "是"})
    _FALSE_EXACT = frozenset({"f", "×", "错", "否"})
    # ASCII words must not be embedded in a longer word ("no" in "know").
    _TRUE_WORD = re.compile(r"(?<![a-z])(?:true|yes)(?![a-z])|正确")
    _FALSE_WORD = re.compile(r"(?<![a-z])(?:false|no)(?![a-z])|错误")
    # A choice letter that is not part of a longer ASCII word.
    _CHOICE_LETTER = re.compile(r"(?<![A-Z])([ABCD])(?![A-Z])")

    def __init__(self, api_key: str, base_url: str, model_name: str, max_workers: int = 20):
        """Store connection settings and sampling limits.

        Args:
            api_key: Credential passed to ``openai.OpenAI``.
            base_url: API endpoint passed to ``openai.OpenAI``.
            model_name: Model used both to generate and to answer questions.
            max_workers: Stored for callers; not read inside this class.
        """
        self.api_key = api_key
        self.base_url = base_url
        self.model_name = model_name
        self.max_workers = max_workers
        self.thread_local = threading.local()
        # Kept for backward compatibility; logging calls no longer take it.
        self.lock = threading.Lock()
        self.max_retries = 5
        self.max_sampling_attempts = 6

    def get_client(self):
        """Return the calling thread's OpenAI client, creating it on first use."""
        if not hasattr(self.thread_local, 'client'):
            self.thread_local.client = openai.OpenAI(
                api_key=self.api_key,
                base_url=self.base_url
            )
        return self.thread_local.client

    def generate_options_with_sampling(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """Sample candidates until the model answers one incorrectly.

        Returns the finalized candidate (with ``performance_test`` and
        ``sampling_summary`` added), or the rule-based fallback when no
        attempt produced a valid candidate.
        """
        attempts_results = []
        for attempt in range(self.max_sampling_attempts):
            try:
                candidate = self._attempt_generate_options(question_data)
                if not self._validate_options_quality(candidate, question_data):
                    logging.warning(f"{attempt+1}次采样 - 选项质量验证失败")
                    continue
                is_model_correct = self._test_model_performance(candidate, question_data)
            except Exception as e:
                logging.warning(f"{attempt+1}次采样失败: {e}")
                continue

            candidate["performance_test"] = {
                "model_answered_correctly": is_model_correct,
                "sampling_attempt": attempt + 1,
            }
            attempts_results.append(candidate)
            logging.info(f"{attempt+1}次采样 - 模型{'答对' if is_model_correct else '答错'}")

            # A wrong answer marks a hard question: keep it and stop sampling.
            if not is_model_correct:
                return self._finalize_result(candidate, attempts_results, "early_stop_incorrect")

        if attempts_results:
            incorrect_results = [
                r for r in attempts_results
                if not r.get("performance_test", {}).get("model_answered_correctly", True)
            ]
            if incorrect_results:
                selected = random.choice(incorrect_results)
                return self._finalize_result(selected, attempts_results, "mixed_results")
            selected = random.choice(attempts_results)
            return self._finalize_result(selected, attempts_results, "all_samples_correct")

        logging.error("所有采样都失败,使用备用选项")
        return self._create_fallback_options(question_data)

    def _finalize_result(self, selected_result: Dict[str, Any], all_results: List[Dict], result_type: str) -> Dict[str, Any]:
        """Attach a ``sampling_summary`` to ``selected_result`` and return it.

        ``result_type`` is one of ``early_stop_incorrect``,
        ``all_samples_correct`` or ``mixed_results``; it determines the
        ``difficulty_label`` (``hard_early_stop``, ``easy_all_correct`` or
        ``mixed``).
        """
        total_attempts = len(all_results)
        correct_count = sum(
            1 for r in all_results
            if r.get("performance_test", {}).get("model_answered_correctly", True)
        )
        performance = selected_result.get("performance_test", {})

        if result_type == "early_stop_incorrect":
            difficulty_label = "hard_early_stop"
        elif result_type == "all_samples_correct":
            difficulty_label = "easy_all_correct"
        else:
            difficulty_label = "mixed"

        selected_result["sampling_summary"] = {
            "result_type": result_type,
            "total_sampling_attempts": total_attempts,
            "correct_answers": correct_count,
            "incorrect_answers": total_attempts - correct_count,
            "is_early_stop": result_type == "early_stop_incorrect",
            "is_all_correct": result_type == "all_samples_correct",
            "selected_attempt": performance.get("sampling_attempt", 1),
            "selected_was_correct": performance.get("model_answered_correctly", True),
            "difficulty_label": difficulty_label,
        }
        logging.info(f"题目标记: {difficulty_label} (正确{correct_count}/{total_attempts}次)")
        return selected_result

    def _test_model_performance(self, generated_question: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
        """Return whether the model answers ``generated_question`` correctly.

        Unknown question types and any exception are reported as ``True``
        (treated as "answered correctly").
        """
        try:
            question_type = generated_question.get("question_type", "")
            if question_type == "true_false":
                return self._test_true_false_question(generated_question)
            if question_type == "multiple_choice":
                return self._test_multiple_choice_question(generated_question, original_data)
            logging.warning(f"未知题目类型: {question_type}")
            return True
        except Exception as e:
            logging.error(f"测试模型性能时出错: {e}")
            return True

    @staticmethod
    def _parse_true_false_answer(answer_text: str) -> Optional[str]:
        """Map a model reply to "True"/"False" case-insensitively, else ``None``."""
        lowered = answer_text.lower()
        if "true" in lowered:
            return "True"
        if "false" in lowered:
            return "False"
        return None

    @classmethod
    def _parse_choice_letter(cls, answer_text: str) -> str:
        """Return the first standalone A-D letter in the reply, or ``""``.

        Letters inside ASCII words are ignored, so "Answer: B" yields "B"
        rather than the "A" of "Answer".
        """
        match = cls._CHOICE_LETTER.search(answer_text.upper())
        return match.group(1) if match else ""

    def _test_true_false_question(self, question_data: Dict[str, Any]) -> bool:
        """Ask the model to judge the statement; ``True`` if it matches the key.

        Incomplete data, an unparseable reply, or an API error all return
        ``True``.
        """
        statement = question_data.get("statement", "")
        correct_answer = question_data.get("correct_answer", "")
        if not statement or not correct_answer:
            logging.warning("判断题数据不完整")
            return True
        test_prompt = f"""
请判断以下陈述的正误。请仔细分析每个细节,考虑所有可能的条件和例外情况。
陈述:{statement}
请只输出 "True" 或 "False",不要解释:
"""
        try:
            client = self.get_client()
            response = client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "你是一个材料科学专家。请仔细分析陈述考虑所有技术细节和特殊情况只输出True或False。"},
                    {"role": "user", "content": test_prompt}
                ],
                temperature=0.1,
                max_tokens=10
            )
            raw_answer = response.choices[0].message.content.strip()
            model_answer = self._parse_true_false_answer(raw_answer)
            if model_answer is None:
                logging.warning(f"模型回答格式异常: {raw_answer}")
                return True
            is_correct = model_answer == correct_answer
            logging.debug(f"判断题测试 - 正确答案: {correct_answer}, 模型答案: {model_answer}, 结果: {'正确' if is_correct else '错误'}")
            return is_correct
        except Exception as e:
            logging.error(f"测试判断题时出错: {e}")
            return True

    def _test_multiple_choice_question(self, question_data: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
        """Ask the model to pick an option; ``True`` if it matches the key.

        The question stem comes from ``original_data["choice_question"]``.
        Incomplete data, an unparseable reply, or an API error all return
        ``True``.
        """
        options = question_data.get("options", {})
        correct_answer = question_data.get("correct_answer", "")
        original_question = original_data.get("choice_question", "")
        if not options or not correct_answer or not original_question:
            logging.warning("选择题数据不完整")
            return True
        options_text = "".join(f"{key}. {options[key]}\n" for key in sorted(options.keys()))
        test_prompt = f"""
以下是一道材料科学专业题目,请仔细分析每个选项,考虑所有技术细节和约束条件。
题目:{original_question}
选项:
{options_text}
请选择最准确的答案只输出选项字母A、B、C或D
"""
        try:
            client = self.get_client()
            response = client.chat.completions.create(
                model=self.model_name,
                messages=[
                    {"role": "system", "content": "你是一个材料科学专家。请深入分析题目,仔细比较各选项的技术准确性,只输出选项字母。"},
                    {"role": "user", "content": test_prompt}
                ],
                temperature=0.1,
                max_tokens=10
            )
            model_answer = response.choices[0].message.content.strip().upper()
            model_choice = self._parse_choice_letter(model_answer)
            if not model_choice:
                logging.warning(f"模型回答格式异常: {model_answer}")
                return True
            is_correct = model_choice == correct_answer.upper()
            logging.debug(f"选择题测试 - 正确答案: {correct_answer}, 模型答案: {model_choice}, 结果: {'正确' if is_correct else '错误'}")
            return is_correct
        except Exception as e:
            logging.error(f"测试选择题时出错: {e}")
            return True

    @staticmethod
    def _fallback_summary() -> Dict[str, Any]:
        """Return the ``sampling_summary`` used for rule-based fallback results."""
        return {
            "result_type": "fallback",
            "difficulty_label": "unknown_fallback",
            "total_sampling_attempts": 0,
            "is_early_stop": False,
            "is_all_correct": False
        }

    def _create_fallback_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """Build options without the model, for when generation failed.

        True/false questions reuse ``choice_question`` as the statement.
        Other questions get the correct option shuffled among three
        placeholder distractors.
        """
        question_type = question_data.get("question_type", "")
        correct_option = question_data.get("correct_option", "")
        if question_type == "true_false":
            return {
                "question_type": "true_false",
                "statement": question_data.get("choice_question", ""),
                "options": ["True", "False"],
                "correct_answer": self._determine_true_false_answer(correct_option),
                "explanation": "基于题目分析的判断结果",
                "sampling_summary": self._fallback_summary(),
            }

        labels = ["A", "B", "C", "D"]
        all_options = [correct_option, "选项B", "选项C", "选项D"]
        random.shuffle(all_options)
        return {
            "question_type": "multiple_choice",
            "options": dict(zip(labels, all_options)),
            "correct_answer": labels[all_options.index(correct_option)],
            "explanation": "基于规则生成的备用选项",
            "sampling_summary": self._fallback_summary(),
        }

    def _determine_true_false_answer(self, correct_option: str) -> str:
        """Normalize a free-form answer key to "True" or "False".

        Single-token answers (e.g. "T", "F", "×") are matched exactly;
        "true"/"yes"/"正确" and "false"/"no"/"错误" are matched as words
        inside longer text, with a "true" match taking priority.  Anything
        unrecognized defaults to "True".
        """
        text = str(correct_option).strip().lower()
        if text in self._TRUE_EXACT:
            return "True"
        if text in self._FALSE_EXACT:
            return "False"
        # "不正确" contains "正确", so the negated form must be checked first.
        if "不正确" in text:
            return "False"
        if self._TRUE_WORD.search(text):
            return "True"
        if self._FALSE_WORD.search(text):
            return "False"
        return "True"

    # Prompt builders
    def _create_enhanced_multiple_choice_prompt(self, question: str, correct_option: str, original_question: str, question_type: str) -> str:
        """Return the generation prompt asking for four deliberately tricky options."""
        return f"""
你是材料科学领域的顶级专家专门设计能够挑战AI系统的高难度考试题目。你的目标是创建连GPT-4、Claude等先进AI都可能答错的题目。
题目: {question}
正确答案: {correct_option}
原始题目: {original_question}
题目类型: {question_type}
**核心策略让AI模型产生错误的直觉判断**
**超高难度干扰项设计原则:**
1. **认知偏差利用型干扰项**
- 利用常见的材料科学概念混淆如弹性模量vs剪切模量
- 设计基于"第一印象"正确但深入分析错误的选项
- 使用数值接近但单位或条件不同的陷阱
2. **专业直觉陷阱型干扰项**
- 基于材料科学中"看似显而易见"但实际错误的推理
- 利用不同材料体系间的类比误区
- 设计需要区分相似工艺或现象的细微差别
3. **多层次验证陷阱**
- 创建需要同时考虑多个材料参数的复杂选项
- 设计表面符合某个理论但忽略关键约束条件的选项
- 使用真实存在但在当前情境下不适用的材料数据
输出格式严格JSON无注释
{{
"question_type": "multiple_choice",
"options": {{
"A": "选项A内容",
"B": "选项B内容",
"C": "选项C内容",
"D": "选项D内容"
}},
"correct_answer": "A",
"explanation": "详细解释为什么正确答案正确,以及每个干扰项的具体迷惑策略"
}}
现在请基于以上原则设计一道能让先进AI模型都可能答错的高难度选择题。
"""

    def _create_enhanced_true_false_prompt(self, question: str, correct_option: str, original_question: str) -> str:
        """Return the generation prompt asking for a deliberately tricky true/false statement."""
        return f"""
你是材料科学专家需要设计能够挑战AI判断能力的高难度判断题。
题目: {question}
正确答案: {correct_option}
原始题目: {original_question}
**设计高难度判断题的策略:**
1. **微妙条件陷阱**:设计在特定条件下成立但一般情况下错误(或相反)的陈述
2. **精确性陷阱**:使用"总是""从不""所有"等绝对词汇的微妙误用
3. **概念边界模糊**:涉及材料科学中定义边界模糊的概念
4. **数值精度陷阱**:涉及需要精确数值判断的陈述
输出格式严格JSON无注释
{{
"question_type": "true_false",
"statement": "需要判断的复杂陈述句",
"options": ["True", "False"],
"correct_answer": "True或False",
"explanation": "详细解释判断理由和可能的误解点"
}}
"""

    def create_options_prompt(self, question_data: Dict[str, Any]) -> str:
        """Pick the true/false or multiple-choice prompt from ``question_type``."""
        choice_question = question_data.get("choice_question", "")
        correct_option = question_data.get("correct_option", "")
        original_question = question_data.get("question", "")
        question_type = question_data.get("question_type", "")
        if question_type == "true_false":
            return self._create_enhanced_true_false_prompt(choice_question, correct_option, original_question)
        return self._create_enhanced_multiple_choice_prompt(choice_question, correct_option, original_question, question_type)

    def _attempt_generate_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """Make one generation request and return the parsed JSON reply.

        Raises:
            ValueError: If no JSON object can be recovered from the reply.
        """
        client = self.get_client()
        prompt = self.create_options_prompt(question_data)
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {
                    "role": "system",
                    "content": "你是一个材料科学专业的教育评估专家。请严格按照要求的JSON格式输出不要添加任何额外的文本、注释或代码块标记。确保输出的JSON语法完全正确。"
                },
                {"role": "user", "content": prompt}
            ],
            temperature=0.9,
            max_tokens=2000,
            top_p=0.95
        )
        result_text = response.choices[0].message.content.strip()
        logging.debug(f"AI响应: {result_text}")
        return self._extract_and_fix_json(result_text)

    def _extract_and_fix_json(self, response_text: str) -> Dict[str, Any]:
        """Extract the outermost ``{...}`` span from a reply and parse it.

        The span is parsed as-is first; syntax repairs are applied only if
        that fails, because the repair regexes can corrupt valid JSON (for
        example a ``//`` inside a URL is taken for a comment).

        Raises:
            ValueError: If no object is found or it cannot be repaired.
        """
        response_text = re.sub(r'```json\s*', '', response_text)
        response_text = re.sub(r'```\s*$', '', response_text)
        json_start = response_text.find('{')
        json_end = response_text.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            raise ValueError("无法在响应中找到JSON格式内容")
        raw_json = response_text[json_start:json_end]
        try:
            return json.loads(raw_json)
        except json.JSONDecodeError:
            pass  # fall through to the repair passes

        json_str = self._fix_json_syntax(raw_json)
        try:
            return json.loads(json_str)
        except json.JSONDecodeError as e:
            logging.error(f"JSON解析失败: {e}")
            json_str = self._aggressive_json_fix(json_str)
            return json.loads(json_str)

    def _fix_json_syntax(self, json_str: str) -> str:
        """Strip comments and trailing commas, and convert single-quoted keys/values."""
        json_str = re.sub(r'//.*?(?=\n|$)', '', json_str)
        json_str = re.sub(r'/\*.*?\*/', '', json_str, flags=re.DOTALL)
        json_str = re.sub(r',\s*}', '}', json_str)
        json_str = re.sub(r',\s*]', ']', json_str)
        json_str = re.sub(r"'([^']*)':", r'"\1":', json_str)
        json_str = re.sub(r":\s*'([^']*)'", r': "\1"', json_str)
        return json_str

    def _aggressive_json_fix(self, json_str: str) -> str:
        """Rebuild a JSON document by regex-extracting the known fields.

        Succeeds only when ``question_type`` and at least two other fields
        are recovered.

        Raises:
            ValueError: If too few fields can be recovered.
        """
        patterns = {
            'question_type': r'"question_type"\s*:\s*"([^"]*)"',
            'statement': r'"statement"\s*:\s*"([^"]*)"',
            'correct_answer': r'"correct_answer"\s*:\s*"([^"]*)"',
            'explanation': r'"explanation"\s*:\s*"([^"]*)"'
        }
        extracted = {}
        for key, pattern in patterns.items():
            match = re.search(pattern, json_str)
            if match:
                extracted[key] = match.group(1)
        options_match = re.search(r'"options"\s*:\s*{([^}]*)}', json_str)
        if options_match:
            extracted['options'] = {
                m.group(1): m.group(2)
                for m in re.finditer(r'"([ABCD])"\s*:\s*"([^"]*)"', options_match.group(1))
            }
        if 'question_type' in extracted and len(extracted) >= 3:
            return json.dumps(extracted, ensure_ascii=False)
        raise ValueError("无法修复JSON格式")

    def _validate_options_quality(self, result: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
        """Dispatch structural validation by ``question_type``; unknown types fail."""
        if not result:
            return False
        question_type = result.get("question_type", "")
        if question_type == "true_false":
            return self._validate_true_false_quality(result)
        if question_type == "multiple_choice":
            return self._validate_multiple_choice_quality(result, original_data)
        return False

    def _validate_true_false_quality(self, result: Dict[str, Any]) -> bool:
        """Require all fields, options of exactly True/False, and a True/False key."""
        required_fields = ["statement", "options", "correct_answer", "explanation"]
        if not all(field in result for field in required_fields):
            return False
        options = result.get("options", [])
        if not (len(options) == 2 and "True" in options and "False" in options):
            return False
        return result.get("correct_answer", "") in ["True", "False"]

    def _validate_multiple_choice_quality(self, result: Dict[str, Any], original_data: Dict[str, Any]) -> bool:
        """Require options A-D, an A-D key, and four distinct non-trivial option texts."""
        if not all(key in result for key in ["options", "correct_answer", "explanation"]):
            return False
        options = result.get("options", {})
        # The model may return a list here; only a mapping has labelled options.
        if not isinstance(options, dict):
            return False
        if len(options) != 4 or not all(label in options for label in ["A", "B", "C", "D"]):
            return False
        if result.get("correct_answer", "") not in ["A", "B", "C", "D"]:
            return False
        option_values = [str(option).strip() for option in options.values()]
        if any(len(value) < 2 for value in option_values):
            return False
        # Options must differ even when compared case-insensitively.
        return len({value.lower() for value in option_values}) == 4

    def generate_options(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generate options for one question, preferring the sampling strategy."""
        result = self.generate_options_with_sampling(question_data)
        if result:
            return result
        logging.warning("采样生成失败,回退到原始生成方法")
        return self._generate_with_basic_retry(question_data)

    def _generate_with_basic_retry(self, question_data: Dict[str, Any]) -> Dict[str, Any]:
        """Generate without a performance test, retrying up to ``max_retries`` times.

        Sleeps 1 s after an invalid candidate and 2 s after an exception
        (except on the last attempt), then falls back to rule-based options.
        """
        last_attempt = self.max_retries - 1
        for attempt in range(self.max_retries):
            try:
                result = self._attempt_generate_options(question_data)
                if self._validate_options_quality(result, question_data):
                    result["sampling_summary"] = {
                        "result_type": "basic_retry",
                        "difficulty_label": "unknown_retry",
                        "total_sampling_attempts": 1,
                        "is_early_stop": False,
                        "is_all_correct": False
                    }
                    return result
                if attempt < last_attempt:
                    logging.warning(f"{attempt+1}次生成的选项质量不佳,重试中...")
                    time.sleep(1)
            except Exception as e:
                logging.error(f"{attempt+1}次生成选项失败: {e}")
                if attempt < last_attempt:
                    time.sleep(2)
        logging.error("所有重试都失败,使用备用选项生成")
        return self._create_fallback_options(question_data)
def process_single_question(generator, question, question_index):
    """Generate options for one question and return an annotated copy of it.

    Args:
        generator: Object providing ``generate_options(question)`` and
            ``_create_fallback_options(question)``.
        question: The question dict; it is copied, not modified.
        question_index: Position of the question in the input, stored on the
            result as ``question_index``.

    Returns:
        A copy of ``question`` with ``generated_options``,
        ``generation_status`` and ``question_index`` added.
        ``generation_status`` is ``"success"`` for model-generated options,
        ``"fallback"`` when the generator returned its rule-based fallback,
        and ``"failed"`` (with ``error_message``) when generation raised.
    """
    try:
        options_data = generator.generate_options(question)
        sampling_info = options_data.get("sampling_summary", {})
        difficulty_label = sampling_info.get("difficulty_label", "unknown")
        attempts = sampling_info.get("total_sampling_attempts", 1)
        is_early_stop = sampling_info.get("is_early_stop", False)
        # The generator returns fallback options without raising, so the
        # summary is the only way to tell them apart from real successes.
        used_fallback = sampling_info.get("result_type") == "fallback"

        complete_question = question.copy()
        complete_question["generated_options"] = options_data
        complete_question["generation_status"] = "fallback" if used_fallback else "success"
        complete_question["question_index"] = question_index
        logging.info(f"{question_index+1}题完成 - {difficulty_label} - 采样{attempts}次 - {'早停' if is_early_stop else '全采样'}")
        return complete_question
    except Exception as e:
        logging.error(f"{question_index+1}题处理失败: {e}")
        failed_question = question.copy()
        failed_question["generated_options"] = generator._create_fallback_options(question)
        failed_question["generation_status"] = "failed"
        failed_question["error_message"] = str(e)
        failed_question["question_index"] = question_index
        return failed_question
def main():
    """Generate options for every question in the input file and report on them.

    Loads and shuffles the questions, processes them concurrently with
    ``process_single_question``, prints label/API-usage/per-type statistics
    and tuning advice, and writes questions plus statistics to the output
    JSON file.

    The API key is read from the ``OPENAI_API_KEY`` environment variable
    instead of being embedded in the source; ``OPENAI_BASE_URL`` optionally
    overrides the endpoint.

    Raises:
        SystemExit: If ``OPENAI_API_KEY`` is not set.
    """
    import os
    import random

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise SystemExit("OPENAI_API_KEY environment variable is not set")
    base_url = os.environ.get("OPENAI_BASE_URL", "https://vip.apiyi.com/v1")
    model_name = "deepseek-chat"
    max_workers = 20
    input_file = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/step7_no_perp_convertible.json"
    output_file = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions_with_sampling.json"

    def summary_of(q):
        # Sampling summary attached by the generator; {} when absent.
        return q.get("generated_options", {}).get("sampling_summary", {})

    # Load and shuffle the questions.
    print("正在加载题目数据...")
    with open(input_file, 'r', encoding='utf-8') as f:
        questions = json.load(f)
    random.shuffle(questions)
    print(f"加载了 {len(questions)} 道题目")
    if not questions:
        # Every rate below divides by the number of questions.
        print("没有可处理的题目")
        return

    # Question-type distribution of the input.
    type_counts = {}
    for q in questions:
        qtype = q.get("question_type", "unknown")
        type_counts[qtype] = type_counts.get(qtype, 0) + 1
    print("题目类型分布:")
    for qtype, count in type_counts.items():
        print(f" {qtype}: {count}")

    generator = ChoiceOptionsGenerator(api_key, base_url, model_name, max_workers)
    print(f"\n开始生成选项,每题最多采样{generator.max_sampling_attempts}次...")
    print("策略:答错题目会早停,答对题目会继续采样直到上限")

    status_emoji = {
        "hard_early_stop": "🔥",
        "easy_all_correct": "",
        "mixed": "",
        "unknown_fallback": "",
        "unknown_retry": "🔄"
    }
    temp_results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_question = {
            executor.submit(process_single_question, generator, question, i): (question, i)
            for i, question in enumerate(questions)
        }
        with tqdm(total=len(questions), desc="生成选项") as pbar:
            for future in as_completed(future_to_question):
                try:
                    result = future.result()
                except Exception as e:
                    logging.error(f"处理结果时发生错误: {e}")
                    original_question, question_index = future_to_question[future]
                    result = original_question.copy()
                    result["generated_options"] = generator._create_fallback_options(original_question)
                    result["generation_status"] = "processing_failed"
                    result["error_message"] = str(e)
                    result["question_index"] = question_index
                    temp_results.append(result)
                    pbar.update(1)
                    continue
                temp_results.append(result)
                pbar.update(1)
                label = summary_of(result).get("difficulty_label", "unknown")
                pbar.set_description(f"生成选项 {status_emoji.get(label, '')}")

    # Restore input order, then drop the temporary index field.
    complete_questions = sorted(temp_results, key=lambda x: x.get("question_index", 0))
    for question in complete_questions:
        question.pop("question_index", None)
    total = len(complete_questions)

    # Label distribution and API-call accounting.
    print("\n=== 采样结果统计 ===")
    sampling_stats = {
        "hard_early_stop": 0,
        "easy_all_correct": 0,
        "mixed": 0,
        "unknown_fallback": 0,
        "unknown_retry": 0,
        "total": total
    }
    early_stop_questions = []
    all_correct_questions = []
    total_generation_calls = 0
    total_validation_calls = 0
    for q in complete_questions:
        sampling_info = summary_of(q)
        difficulty_label = sampling_info.get("difficulty_label", "unknown_fallback")
        attempts = sampling_info.get("total_sampling_attempts", 0)
        if difficulty_label in sampling_stats:
            sampling_stats[difficulty_label] += 1
        if sampling_info.get("is_early_stop", False):
            early_stop_questions.append(q)
        if sampling_info.get("is_all_correct", False):
            all_correct_questions.append(q)
        total_generation_calls += attempts
        # Fallback and basic-retry results never ran the answer check.
        if difficulty_label not in ["unknown_fallback", "unknown_retry"]:
            total_validation_calls += attempts
    total_api_calls = total_generation_calls + total_validation_calls
    early_stop_rate = len(early_stop_questions) / total
    all_correct_rate = len(all_correct_questions) / total
    avg_api_calls = total_api_calls / total
    success_count = sum(1 for q in complete_questions if q.get("generation_status") == "success")
    failed_count = total - success_count

    print("题目标记分布:")
    for label, count in sampling_stats.items():
        if label != "total" and count > 0:
            percentage = (count / sampling_stats["total"]) * 100
            print(f" {label}: {count} 道 ({percentage:.1f}%)")
    print(f"\n关键指标:")
    print(f" 早停困难题(答错后早停): {len(early_stop_questions)}")
    print(f" 全正确简单题(所有采样都答对): {len(all_correct_questions)}")
    print(f" 早停率: {early_stop_rate*100:.1f}%")
    print(f" 全正确率: {all_correct_rate*100:.1f}%")

    print(f"\n=== API调用统计 ===")
    print(f"总生成调用: {total_generation_calls}")
    print(f"总验证调用: {total_validation_calls}")
    print(f"总API调用: {total_api_calls}")
    print(f"平均每题调用: {avg_api_calls:.1f}")

    if early_stop_questions:
        early_stop_attempts = [summary_of(q).get("total_sampling_attempts", 0) for q in early_stop_questions]
        avg_early_stop_attempts = sum(early_stop_attempts) / len(early_stop_attempts)
        print(f"早停题目平均采样次数: {avg_early_stop_attempts:.1f}")
    if all_correct_questions:
        all_correct_attempts = [summary_of(q).get("total_sampling_attempts", 0) for q in all_correct_questions]
        avg_all_correct_attempts = sum(all_correct_attempts) / len(all_correct_attempts)
        print(f"全正确题目平均采样次数: {avg_all_correct_attempts:.1f}")

    # Per-question-type breakdown.
    print(f"\n=== 各题型采样效果 ===")
    type_sampling_analysis = {}
    for q in complete_questions:
        qtype = q.get("question_type", "unknown")
        difficulty_label = summary_of(q).get("difficulty_label", "unknown")
        stats = type_sampling_analysis.setdefault(qtype, {
            "hard_early_stop": 0,
            "easy_all_correct": 0,
            "mixed": 0,
            "unknown": 0,
            "total": 0
        })
        stats["total"] += 1
        if difficulty_label in ("hard_early_stop", "easy_all_correct", "mixed"):
            stats[difficulty_label] += 1
        else:
            stats["unknown"] += 1
    for qtype, stats in type_sampling_analysis.items():
        if stats["total"] > 0:
            print(f"{qtype}:")
            type_early_pct = (stats["hard_early_stop"] / stats["total"]) * 100
            type_correct_pct = (stats["easy_all_correct"] / stats["total"]) * 100
            print(f" 早停率: {type_early_pct:.1f}% ({stats['hard_early_stop']}/{stats['total']})")
            print(f" 全正确率: {type_correct_pct:.1f}% ({stats['easy_all_correct']}/{stats['total']})")

    # Persist questions together with the statistics.
    final_output = {
        "questions": complete_questions,
        "sampling_statistics": {
            "label_distribution": {k: v for k, v in sampling_stats.items() if k != "total"},
            "early_stop_count": len(early_stop_questions),
            "all_correct_count": len(all_correct_questions),
            "early_stop_rate": early_stop_rate,
            "all_correct_rate": all_correct_rate,
            "total_questions": total
        },
        "api_usage": {
            "total_generation_calls": total_generation_calls,
            "total_validation_calls": total_validation_calls,
            "total_api_calls": total_api_calls,
            "average_calls_per_question": avg_api_calls
        },
        "generation_metadata": {
            "generation_timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
            "model_used": model_name,
            "max_sampling_attempts": generator.max_sampling_attempts,
            "success_rate": success_count / total
        }
    }
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(final_output, f, ensure_ascii=False, indent=2)

    print(f"\n=== 生成成功率统计 ===")
    print(f"总共处理: {total} 道题目")
    print(f"成功生成: {success_count}")
    print(f"使用备用方案: {failed_count}")
    print(f"成功率: {success_count/total*100:.2f}%")

    # Strategy assessment, with up to three examples per category.
    print(f"\n=== 策略效果评估 ===")
    if len(early_stop_questions) > 0:
        print("✅ 早停策略有效:成功识别出困难题目")
        print(f" 困难题目数量: {len(early_stop_questions)}")
        print(" 早停题目示例:")
        for i, q in enumerate(early_stop_questions[:3]):
            qtype = q.get("question_type", "unknown")
            attempts = summary_of(q).get("total_sampling_attempts", 0)
            print(f" {i+1}. {qtype}题,第{attempts}次采样后早停")
    else:
        print("⚠️ 没有题目触发早停,可能需要调整难度")
    if len(all_correct_questions) > 0:
        print("✅ 全采样策略有效:识别出简单题目")
        print(f" 简单题目数量: {len(all_correct_questions)}")
        print(" 全正确题目示例:")
        for i, q in enumerate(all_correct_questions[:3]):
            qtype = q.get("question_type", "unknown")
            attempts = summary_of(q).get("total_sampling_attempts", 0)
            print(f" {i+1}. {qtype}题,{attempts}次采样全部答对")
    else:
        print("⚠️ 没有题目全部答对,生成的题目可能都比较困难")

    # Tuning advice derived from the rates above.
    print(f"\n=== 优化建议 ===")
    if early_stop_rate < 0.2:
        print("• 早停率偏低,建议:")
        print(" - 增强提示词的迷惑性设计")
        print(" - 提高选项生成的创造性增加temperature")
        print(" - 添加更多AI容易犯错的陷阱类型")
    if all_correct_rate > 0.6:
        print("• 全正确率过高,建议:")
        print(" - 检查题目是否过于简单")
        print(" - 提升干扰选项的质量")
        print(" - 增加专业深度和复杂性")
    if early_stop_rate > 0.8:
        print("• 早停率过高,建议:")
        print(" - 适当降低题目难度")
        print(" - 平衡难易程度分布")
        print(" - 检查是否过度设计陷阱")
    if avg_api_calls > 8:
        print("• API调用次数偏高建议:")
        print(" - 优化提示词提高首次生成质量")
        print(" - 考虑减少最大采样次数")
        print(" - 改进验证逻辑减少失败率")
    print(f"\n结果已保存到: {output_file}")
    print("包含完整的题目数据、采样统计和API使用情况")
def export_analysis_report(questions: List[Dict], output_path: str):
    """Write a JSON report grouping questions by their difficulty label.

    Args:
        questions: Processed questions; the label is read from
            ``generated_options.sampling_summary.difficulty_label``.
        output_path: Destination file, written as UTF-8 JSON.

    The report holds a ``summary`` (counts and rates) plus up to 10
    early-stop examples, 10 all-correct examples and 5 mixed examples.
    Rates are 0.0 when ``questions`` is empty.
    """
    by_label = {"hard_early_stop": [], "easy_all_correct": [], "mixed": []}
    for q in questions:
        sampling_info = q.get("generated_options", {}).get("sampling_summary", {})
        label = sampling_info.get("difficulty_label", "unknown")
        if label in by_label:
            by_label[label].append(q)
    early_stop_questions = by_label["hard_early_stop"]
    all_correct_questions = by_label["easy_all_correct"]
    mixed_questions = by_label["mixed"]

    total = len(questions)
    report = {
        "summary": {
            "total_questions": total,
            "early_stop_questions": len(early_stop_questions),
            "all_correct_questions": len(all_correct_questions),
            "mixed_questions": len(mixed_questions),
            # Guard the empty-input case instead of dividing by zero.
            "early_stop_rate": len(early_stop_questions) / total if total else 0.0,
            "all_correct_rate": len(all_correct_questions) / total if total else 0.0
        },
        "early_stop_examples": early_stop_questions[:10],
        "all_correct_examples": all_correct_questions[:10],
        "mixed_examples": mixed_questions[:5]
    }
    with open(output_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"分析报告已保存到: {output_path}")
# Run the generation pipeline only when this file is executed as a script.
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,59 @@
=== 采样结果统计 ===
题目标记分布:
hard_early_stop: 1494 道 (44.7%)
easy_all_correct: 1807 道 (54.1%)
unknown_fallback: 42 道 (1.3%)
关键指标:
早停困难题(答错后早停): 1494 道
全正确简单题(所有采样都答对): 1807 道
早停率: 44.7%
全正确率: 54.1%
=== API调用统计 ===
总生成调用: 13850
总验证调用: 13850
总API调用: 27700
平均每题调用: 8.3
早停题目平均采样次数: 2.0
全正确题目平均采样次数: 6.0
=== 各题型采样效果 ===
short_answer:
早停率: 36.9% (721/1954)
全正确率: 62.4% (1219/1954)
multiple_choice:
早停率: 58.8% (154/262)
全正确率: 39.3% (103/262)
calculation:
早停率: 66.0% (578/876)
全正确率: 31.4% (275/876)
true_false:
早停率: 16.3% (41/251)
全正确率: 83.7% (210/251)
=== 生成成功率统计 ===
总共处理: 3343 道题目
成功生成: 3343 道
使用备用方案: 0 道
成功率: 100.00%
=== 策略效果评估 ===
✅ 早停策略有效:成功识别出困难题目
困难题目数量: 1494 道
早停题目示例:
1. short_answer题第1次采样后早停
2. short_answer题第1次采样后早停
3. short_answer题第3次采样后早停
✅ 全采样策略有效:识别出简单题目
简单题目数量: 1807 道
全正确题目示例:
1. short_answer题6次采样全部答对
2. short_answer题6次采样全部答对
3. true_false题6次采样全部答对
=== 优化建议 ===
• API调用次数偏高建议:
- 优化提示词提高首次生成质量
- 考虑减少最大采样次数
- 改进验证逻辑减少失败率

View File

@@ -0,0 +1,895 @@
[
{
"question": "What are the close-packed directions of an ideal hexagonal close-packed crystal structure?",
"choices": {
"text": [
"(11-20)",
"(0001)",
"(10-10)",
"(1-210)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "For many polymer materials, their tensile strength σi is a function of the number-average relative molecular mass Mn̅: the formula is given by σi = σ0 - A / Mn̅, where σ0 is the tensile strength at infinite molecular weight, and A is a constant. Given two types of poly(methyl methacrylate) with number-average relative molecular masses of 4×10^4 and 6×10^4, the corresponding tensile strengths are 107 MPa and 170 MPa, respectively. Determine the tensile strength σb when the number-average relative molecular mass is 3×10^4.",
"choices": {
"text": [
"44 MPa",
"68 MPa",
"89 MPa",
"125 MPa"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Suppose that solid nickel was able to nucleate homogeneously with an undercooling of only 22°C. How many atoms would have to group together spontaneously for this to occur? Assume that the lattice parameter of the solid FCC nickel is 0.356nm.",
"choices": {
"text": [
"1.136 × 10^{6}",
"5.68 × 10^{5}",
"2.272 × 10^{6}",
"3.408 × 10^{6}"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which of the following best describes the habit plane and its invariance in the characteristics of martensitic transformation?",
"choices": {
"text": [
"Martensite forms on certain crystallographic planes of the parent phase, and these planes are called habit planes. The habit plane is an undistorted and non-rotating plane.",
"The habit plane is defined as the interface between austenite and martensite that exhibits minimum elastic strain energy, often approximated as {111}γ in FCC to BCC transformations.",
"Habit planes correspond to the planes of maximum shear stress during transformation, typically {110} in BCC metals, where dislocation slip is easiest.",
"In martensitic transformations, the habit plane is always parallel to the twinning plane of the product phase, maintaining strict crystallographic symmetry."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "The bonding forces between adhesive and adherend surfaces are thought to be",
"choices": {
"text": [
"Electrostatic",
"Van der Waals forces",
"Hydrogen bonding",
"Mechanical interlocking"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Can all parts of a dislocation loop be edge dislocations? Why?",
"choices": {
"text": [
"Yes, if the Burgers vector is perpendicular to the plane of the loop, creating a prismatic dislocation",
"No, because dislocation loops must always contain both edge and screw components to maintain continuity",
"Yes, but only in FCC metals where the Schmid factor favors edge dislocation formation",
"No, because the Burgers vector must rotate with the dislocation line direction to satisfy conservation laws"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "What is the graphite content in Fe-3.6%C alloy?",
"choices": {
"text": [
"2.94% (calculated from the lever rule considering only graphite formation)",
"3.6% (total carbon content assuming all carbon forms graphite)",
"1.8% (based on metastable Fe-Fe3C system calculation)",
"4.2% (considering carbon solubility in austenite at eutectic temperature)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "In the ionic compound $\\mathbf{MgO}$, the cation most likely to replace $\\mathbf{Mg}^{2+}$ in the compound (given the radii (nm) of each cation: (${\\bf Mg}^{2+}$) 0.066, ($\\mathbb{C}a^{2+}$) 0.099, ($\\mathrm{Li^{+}}$) 0.066, ($\\mathbf{Fe}^{\\mathbf{2+}}$) 0.074) is",
"choices": {
"text": [
"Ca²⁺, due to its similar charge and higher polarizability compensating for the larger ionic radius",
"Li⁺, because its identical ionic radius and lower charge density would minimize lattice distortion",
"Fe²⁺, owing to its comparable ionic radius and matching charge state with Mg²⁺",
"Al³⁺, as its smaller radius (0.054 nm) and higher charge would enhance electrostatic stabilization"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "For a simple cubic crystal, pure bending of the (110) plane around the [001] axis will form what type of dislocations (specify the direction of the dislocation line and the Burgers vector)?",
"choices": {
"text": [
"Edge type, dislocation line direction=[001], Burgers vector=a[100] or a[010]",
"Screw type, dislocation line direction=[110], Burgers vector=a/2[110]",
"Mixed type, dislocation line direction=[111], Burgers vector=a/2[110]",
"Edge type, dislocation line direction=[110], Burgers vector=a[001]"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Why is interstitial diffusion normally more rapid than vacancy diffusion?",
"choices": {
"text": [
"Interstitial atoms have higher mobility due to their smaller size and the greater probability of finding adjacent empty interstitial sites",
"Vacancy diffusion requires overcoming a higher activation energy barrier due to lattice distortion effects",
"Interstitial diffusion benefits from lower coordination number of interstitial sites compared to substitutional sites",
"The concentration of interstitial defects is typically orders of magnitude higher than vacancy concentrations in most materials"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "At room temperature the electrical conductivity and the electron mobility for aluminum are 3.8 x 10^7 (Ω·m)^-1 and 0.0012 m^2/V·s, respectively. The number of free electrons per cubic meter for aluminum at room temperature is:",
"choices": {
"text": [
"1.98 x 10^29 m^-3",
"3.16 x 10^28 m^-3",
"2.65 x 10^29 m^-3",
"4.74 x 10^27 m^-3"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "What is the coordination number of the cation in the compound Cr2O3, given r(Cr3+)=0.064nm, r(O2-)=0.132nm?",
"choices": {
"text": [
"6",
"12.00",
"9.00",
"3.00"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which of the following best describes a hybrid composite?",
"choices": {
"text": [
"A composite reinforced with two or more different fiber materials in a single matrix",
"A composite combining ceramic and metallic phases to achieve graded properties",
"A laminate structure where different layers contain distinct reinforcement materials",
"A nanocomposite incorporating carbon nanotubes and graphene in a polymer matrix"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "When the oxygen partial pressure is increased, what changes will occur in the density of Zn1+xO?",
"choices": {
"text": [
"The density decreases due to reduced zinc interstitial concentration as x in Zn1+xO decreases",
"The density increases because higher oxygen partial pressure leads to more oxygen interstitials",
"The density remains unchanged as the Schottky defect equilibrium compensates for the change",
"The density first increases then decreases due to the formation of zinc vacancy-oxygen interstitial complexes"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "How do the plasticity and toughness of a metal with finer grains compare to the same metal with coarser grains?",
"choices": {
"text": [
"Better, due to increased grain boundary strengthening and dislocation accumulation capacity",
"Worse, because finer grains lead to higher stress concentration at triple junctions",
"Similar, as grain size primarily affects hardness rather than plasticity and toughness",
"Dependent on strain rate, with finer grains showing better toughness only at high strain rates"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Below the critical temperature Tc, superconductors possess complete which property?",
"choices": {
"text": [
"Electrical conductivity",
"Magnetic susceptibility",
"Thermal conductivity",
"Meissner effect"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "For an ASTM grain size number of 8, the number of grains per square inch with no magnification is:",
"choices": {
"text": [
"1.28 × 10^6 grains/in.^2",
"6.4 × 10^5 grains/in.^2",
"2.56 × 10^6 grains/in.^2",
"3.2 × 10^5 grains/in.^2"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "An aluminum bar 125mm (5.0 in.) long and having a square cross section 16.5mm (0.65 in.) on an edge is pulled in tension with a load of 66,700N(15,000 lb) and experiences an elongation of 0.43 mm(1.7 × 10^{-2} in.). Assuming that the deformation is entirely elastic, the modulus of elasticity of the aluminum is:",
"choices": {
"text": [
"71.2 GPa (10.4 × 10^{6} psi)",
"69.0 GPa (10.0 × 10^{6} psi)",
"73.5 GPa (10.7 × 10^{6} psi)",
"67.8 GPa (9.8 × 10^{6} psi)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "After metal undergoes cold plastic deformation, this phenomenon is called deformation strengthening or:",
"choices": {
"text": [
"work hardening",
"strain hardening",
"dislocation strengthening",
"precipitation hardening"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "When n spheres form a cubic close packing, what is the number of octahedral voids?",
"choices": {
"text": [
"Alternative to n",
"Opposite of n",
"n",
"Different from n"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "A plate of an alloy steel has a plane-strain fracture toughness of 50 MPa·{m}^{1 / 2}. If it is known that the largest surface crack is 0.5mm long, and that the value of Y is 1.1 , which of the following can be said about this plate when a tensile stress of 1200 MPa is applied?",
"choices": {
"text": [
"The plate will definitely fracture due to the stress intensity factor exceeding the fracture toughness",
"The plate will not fracture because the applied stress is below the yield strength of alloy steel",
"The plate may not fracture if the crack tip plasticity zone size exceeds the critical crack length",
"The plate will undergo stable crack growth but not catastrophic fracture at this stress level"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "What is the ratio of the number of tetrahedral voids to the number of O2- ions?",
"choices": {
"text": [
"2:1",
"1:1",
"1:2",
"4:1"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Given the melting point of Cu tm=1083°C, latent heat of fusion Lm=1.88×10^3 J/cm^3, and specific surface energy σ=1.44×10^5 J/cm^2. The critical nucleus radius for homogeneous nucleation of Cu at 853°C is:",
"choices": {
"text": [
"9.03×10^-10 m",
"1.44×10^-9 m",
"5.67×10^-10 m",
"2.88×10^-9 m"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "When water-based substances melt into a liquid state, their volume undergoes the phenomenon of .",
"choices": {
"text": [
"anomalous expansion due to hydrogen bonding rearrangement",
"contraction caused by increased molecular packing efficiency",
"volume invariance governed by the Clausius-Clapeyron relation",
"density fluctuation following the Maxwell-Boltzmann distribution"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Based on microstructural analysis, the volume of graphite in a gray cast iron accounts for 12%, and the volume of ferrite accounts for 88%. Determine the value of ωC (given that the density of graphite ρG=2.2 g/cm³, and the density of ferrite ρα=7.8 g/cm³).",
"choices": {
"text": [
"ωC=0.037",
"ωC=0.042",
"ωC=0.028",
"ωC=0.051"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "The density of Al2O3 is 3.8g/cm3. How many atoms are contained in 1g of Al2O3?",
"choices": {
"text": [
"2.95×10^22",
"1.18×10^22",
"3.54×10^22",
"5.90×10^22"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "A 2 in. × 8 in. × 10 in. iron casting is produced and, after cooling to room temperature, is found to weigh 43.9 lb. If all of the shrinkage occurs as pores with a diameter of 0.05 in., the number of shrinkage pores in the casting is:",
"choices": {
"text": [
"83,354 pores",
"76,218 pores",
"92,487 pores",
"65,932 pores"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "In the symmetric tilt grain boundary of face-centered cubic metal $\\mathrm{Cu}$, the spacing between two positive edge dislocations is $D=1000\\mathrm{nm}$. Assuming the extra half-plane of the edge dislocation is the (110) plane and $d_{110}=0.1278\\mathrm{~nm}$, what is the tilt angle $\\theta$ of the tilt grain boundary?",
"choices": {
"text": [
"0.0146°",
"0.0292°",
"0.0073°",
"0.0219°"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "For some viscoelastic polymers that are subjected to stress relaxation tests, the stress decays with time according to the equation σ(t) = σ(0) exp(-t/τ). A specimen of this polymer was pulled in tension to a strain of 0.6, with an initial stress level of 2.76 MPa (400 psi) that dropped to 1.72 MPa (250 psi) after 60s. Determine E_τ(10) for this material:",
"choices": {
"text": [
"4.25 MPa (616 psi)",
"3.82 MPa (554 psi)",
"2.89 MPa (419 psi)",
"5.17 MPa (750 psi)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Why are residual thermal stresses introduced into a glass piece when it is cooled?",
"choices": {
"text": [
"Differential cooling rates between surface and interior regions cause uneven contraction, establishing stresses due to limited deformation",
"Phase transformation from liquid to glassy state creates volume mismatch between amorphous and crystalline regions",
"Thermal expansion coefficient anisotropy in the glass structure leads to directional stress buildup",
"Viscous flow cessation at the glass transition temperature locks in molecular orientation stresses"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Why is the compressive strength of ceramics always higher than the tensile strength?",
"choices": {
"text": [
"Under tension, cracks propagate rapidly when reaching critical size, while under compression, cracks either close or propagate parallel to the compression axis",
"Ceramics have higher dislocation mobility under compression, allowing plastic deformation that increases strength",
"The ionic/covalent bonding in ceramics creates higher resistance to shear stresses than to normal stresses",
"The Weibull modulus for compressive loading is typically 3-5 times higher than for tensile loading in ceramics"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "What is the critical temperature Tc of a superconductor?",
"choices": {
"text": [
"The temperature at which the resistance abruptly drops to zero",
"The temperature at which the Meissner effect becomes fully observable",
"The temperature corresponding to the peak in specific heat capacity",
"The temperature where Cooper pairs begin to form but resistance remains finite"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Why is the diffusion coefficient of anions generally smaller than that of cations in ionic crystals?",
"choices": {
"text": [
"Anions typically occupy close-packed positions in the crystal lattice, requiring significant structural rearrangement for diffusion, while cations diffuse through interstitial sites with lower energy barriers",
"Anions have larger ionic radii than cations, leading to stronger electrostatic repulsion between neighboring anions that hinders their mobility",
"The higher electronegativity of anions creates stronger covalent bonding with surrounding cations, effectively trapping the anions in their lattice positions",
"Cations experience a lower activation energy for diffusion due to their smaller mass and higher vibrational frequency compared to anions"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "What is one major limitation of the iron-iron carbide phase diagram related to time-temperature relationships in terms of heat treatment and the development of microstructure?",
"choices": {
"text": [
"The diagram provides no indication as to the time-temperature relationships for the formation of pearlite, bainite, and spheroidite, all of which are composed of the equilibrium ferrite and cementite phases",
"The diagram fails to account for the kinetic effects of carbon diffusion rates during austenite decomposition, which critically influence phase transformation times",
"The diagram does not specify the exact cooling rates required to achieve martensitic transformation, which is a non-equilibrium phase",
"The diagram omits the critical temperature ranges for recrystallization and grain growth processes in ferrous alloys"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Given the diffusion coefficients for iron in nickel at two temperatures (1273 K with D = 9.4 x 10^-16 m^2/s and 1473 K with D = 2.4 x 10^-14 m^2/s), determine the values of D0 and the activation energy Qd.",
"choices": {
"text": [
"D0 = 2.2 x 10^-5 m^2/s and Qd = 252,400 J/mol",
"D0 = 1.8 x 10^-5 m^2/s and Qd = 245,000 J/mol",
"D0 = 3.5 x 10^-5 m^2/s and Qd = 260,000 J/mol",
"D0 = 2.2 x 10^-5 m^2/s and Qd = 235,000 J/mol"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "The activation energy for self-diffusion in copper is 49,300 cal / mol. A copper specimen creeps at 0.002 N / in·h when a stress of 15,000 psi is applied at 600°C. If the creep rate of copper is dependent on self-diffusion, determine the creep rate if the temperature is 800°C.",
"choices": {
"text": [
"0.4 N/in·h",
"0.08 N/in·h",
"0.02 N/in·h",
"0.004 N/in·h"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "For an edge dislocation line, the direction of its slip motion is (18) to the Burgers vector",
"choices": {
"text": [
"parallel",
"perpendicular",
"at 45° to the Burgers vector and dislocation line",
"anti-parallel with a 5° deviation due to Peierls stress"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Is any of the alloying elements expected to have unlimited solid solubility in copper? For copper: r_Cu=1.278 Å",
"choices": {
"text": [
"Ni: r=1.246 Å, φr=-2.5% (same crystal structure, similar electronegativity)",
"Ag: r=1.444 Å, φr=+13.0% (same column in periodic table, similar valence electron configuration)",
"Zn: r=1.332 Å, φr=+4.2% (common alloying element in brass, similar atomic size)",
"Pd: r=1.376 Å, φr=+7.7% (similar d-electron configuration, forms continuous solid solution at high temperatures)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which valence state of cations must be present in the exchange of originally adsorbed cations in clay for mud peptization?",
"choices": {
"text": [
"The presence of divalent cations is essential for maintaining the Stern layer stability during peptization",
"Trivalent cations are required to overcome the critical coagulation concentration in clay suspensions",
"Monovalent cations must dominate the exchange process to achieve effective double layer expansion",
"A balanced mixture of mono- and divalent cations is necessary for optimal zeta potential adjustment"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "In the ilmenite crystal structure of FeTiO3, which consists of an HCP arrangement of O2- ions, what fraction of the total tetrahedral sites will be occupied?",
"choices": {
"text": [
"No tetrahedral sites will be occupied",
"1/3 of the tetrahedral sites will be occupied by Fe2+ ions",
"1/2 of the tetrahedral sites will be occupied by Ti4+ ions",
"1/4 of the tetrahedral sites will be occupied by Fe2+ and Ti4+ ions in an ordered arrangement"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which of the following best describes constitutional supercooling?",
"choices": {
"text": [
"During the solidification of a solid solution alloy, the distribution of solute in the liquid phase changes, which alters the alloy's melting point. Even if the actual temperature distribution remains unchanged, the degree of supercooling at the solid-liquid interface front will vary. Therefore, the supercooling of a solid solution alloy is determined by both the changing alloy melting point and the actual temperature distribution. This type of supercooling caused by changes in liquid phase composition is called constitutional supercooling.",
"Constitutional supercooling occurs when the cooling rate exceeds the critical cooling rate for a given alloy, leading to a metastable undercooled liquid state that persists below the equilibrium solidification temperature due to kinetic limitations.",
"In constitutional supercooling, the solute partitioning coefficient causes a local increase in melting temperature ahead of the solidification front, resulting in a thermal gradient that opposes the direction of heat flow during solidification.",
"Constitutional supercooling is a phenomenon where the latent heat of fusion released during solidification creates a thermal barrier that prevents further crystal growth, requiring additional undercooling to overcome this energy barrier."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Many properties of clay are related to the types of adsorbed cations. Which of the following correctly represents the variation trend of the thixotropy of clay slurry when adsorbing the following different cations (use arrows to represent: small—large) H+ Al3+ Ba2+ Sr2+ Ca2+ Mg2+ NH4+ K+ Na+ Li+?",
"choices": {
"text": [
"H+ < Li+ < Na+ < K+ < NH4+ < Mg2+ < Ca2+ < Sr2+ < Ba2+ < Al3+",
"Li+ < Na+ < K+ < NH4+ < H+ < Mg2+ < Ca2+ < Sr2+ < Ba2+ < Al3+",
"Al3+ < Ba2+ < Sr2+ < Ca2+ < Mg2+ < NH4+ < K+ < Na+ < Li+ < H+",
"H+ < Al3+ < Ba2+ < Sr2+ < Ca2+ < Mg2+ < NH4+ < K+ < Na+ < Li+"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which of the following statements about ceramic materials and 'work hardening' is correct?",
"choices": {
"text": [
"Ceramic materials will not exhibit 'work hardening' after deformation, because ceramic materials cannot undergo plastic deformation",
"Ceramic materials can exhibit 'work hardening' through dislocation pile-up mechanisms similar to metals, but only at extremely high temperatures (>1500°C)",
"Ceramic materials show 'work hardening' only when they contain secondary phases that allow limited dislocation movement",
"Ceramic materials demonstrate 'work hardening' through twinning deformation mechanisms rather than dislocation motion"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Calculate the number of Fe3C particles per unit volume N_v, given the volume fraction of Fe3C phase φ_Fe3C=0.06 and the radius of spherical cementite particles r=10×10^-6 m.",
"choices": {
"text": [
"1.43×10^13 (1/m^3)",
"2.86×10^13 (1/m^3)",
"7.16×10^12 (1/m^3)",
"3.58×10^12 (1/m^3)"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "A single crystal test bar of an FCC metal with a cross-sectional area of 10cm² is subjected to a compression test along the axial direction. Given that the critical resolved shear stress is 0.1kgf/mm² and the initial orientation of the bar axis is [215], determine the axial pressure P at the onset of double slip (without considering physical hardening). The axial pressure P is:",
"choices": {
"text": [
"2450N",
"1960N",
"2940N",
"3430N"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Which plane among (100), (110), and (111) in a face-centered cubic crystal is the close-packed plane?",
"choices": {
"text": [
"(111) plane due to its highest planar density and hexagonal symmetry",
"(110) plane because of its diagonal atomic arrangement and intermediate packing efficiency",
"(100) plane when considering the slip systems in FCC crystals at elevated temperatures",
"(111) plane but only in the presence of stacking faults which modify the ideal packing sequence"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
},
{
"question": "Germanium to which 10^24 m^-3 As atoms have been added is an extrinsic semiconductor at room temperature, and virtually all the As atoms may be thought of as being ionized (i.e., one charge carrier exists for each As atom). Is this material:",
"choices": {
"text": [
"n-type, because As acts as a donor impurity introducing extra electrons",
"p-type, because the high doping concentration induces band inversion",
"compensated semiconductor, because the high doping level creates equal numbers of electrons and holes",
"intrinsic semiconductor, because at room temperature the thermal energy dominates the doping effects"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
}
]

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,593 @@
import json
from typing import Dict, Any, List, Optional, Tuple
import random
from collections import Counter
def convert_to_target_format(source_data: Dict[str, Any]) -> Optional[Dict[str, Any]]:
    """
    Convert one source question record into the benchmark target format.

    Only four-option multiple-choice records are converted. A record is
    rejected (``None`` is returned) when any of the following holds:

    * ``generated_options`` is missing or is not a dict;
    * ``question_type`` is not ``"multiple_choice"``;
    * ``choice_question`` is missing or empty;
    * ``options`` is not a dict whose keys are exactly A, B, C and D;
    * ``correct_answer`` is not one of A, B, C, D.

    Args:
        source_data: Source record. Reads ``choice_question`` and the
            ``generated_options`` sub-dict (``question_type``, ``options``,
            ``correct_answer``).

    Returns:
        A dict with the keys ``question``, ``choices`` (``text`` and
        ``label`` lists), ``answer`` (``[ANSWER]X[/ANSWER]``) and ``prompt``,
        or ``None`` when the record cannot be converted.
    """
    labels = ["A", "B", "C", "D"]

    generated_options = source_data.get("generated_options")
    if not isinstance(generated_options, dict):
        return None

    # Only single-answer multiple-choice questions are handled.
    if generated_options.get("question_type") != "multiple_choice":
        return None

    question = source_data.get("choice_question", "")
    if not question:
        return None

    options = generated_options.get("options", {})
    # Checking only the size would let a 4-key dict with other keys through
    # and silently emit empty option texts, so require exactly A-D.
    if not isinstance(options, dict) or set(options) != set(labels):
        return None

    correct_answer = generated_options.get("correct_answer", "")
    if correct_answer not in labels:
        return None

    return {
        "question": question,
        "choices": {
            "text": [options[label] for label in labels],
            "label": labels,
        },
        "answer": f"[ANSWER]{correct_answer}[/ANSWER]",
        "prompt": "You are an expert in materials science. Please answer the following materials science question by selecting the correct option. You MUST include the letter of the correct answer at the end of your response within the following tags: [ANSWER] and [/ANSWER]. For example: [ANSWER]A[/ANSWER]."
    }
def extract_answer_from_question(question: Dict[str, Any]) -> Optional[str]:
    """
    Extract the answer letter from a converted question.

    Args:
        question: Converted question; its ``answer`` field is expected to look
            like ``[ANSWER]X[/ANSWER]``.

    Returns:
        The letter between the tags when it is one of A, B, C, D; otherwise
        ``None`` (missing field, wrong tags, or any other payload).
    """
    opening, closing = "[ANSWER]", "[/ANSWER]"
    raw = question.get("answer", "")
    if not (raw.startswith(opening) and raw.endswith(closing)):
        return None
    # Strip both tags; whatever remains must be a single valid option letter.
    letter = raw[len(opening):-len(closing)]
    return letter if letter in ["A", "B", "C", "D"] else None
def shuffle_question_options(question: Dict[str, Any], new_correct_answer: str) -> Dict[str, Any]:
    """
    Move the correct option of a question to a chosen letter.

    The text of the currently correct option is swapped with the text sitting
    at the target letter; the other two options keep their positions.

    Args:
        question: Converted question dict.
        new_correct_answer: Letter (A/B/C/D) that should hold the correct
            option afterwards.

    Returns:
        The original ``question`` object, unchanged, when its answer cannot be
        parsed, when it already matches the target, or when it does not have
        exactly four texts and four labels. Otherwise a shallow copy with new
        ``choices`` and ``answer`` entries.
    """
    old_letter = extract_answer_from_question(question)
    # Nothing to do when the answer is unreadable or already in place.
    if not old_letter or old_letter == new_correct_answer:
        return question

    choice_block = question.get("choices", {})
    option_texts = choice_block.get("text", [])
    option_labels = choice_block.get("label", ["A", "B", "C", "D"])
    if not (len(option_texts) == 4 and len(option_labels) == 4):
        return question

    # Positions of the current and the desired correct option.
    source_pos = option_labels.index(old_letter)
    target_pos = option_labels.index(new_correct_answer)

    # Swap on a copy so the input question is left untouched.
    swapped_texts = option_texts[:]
    swapped_texts[target_pos], swapped_texts[source_pos] = (
        swapped_texts[source_pos],
        swapped_texts[target_pos],
    )

    rearranged = question.copy()
    rearranged["choices"] = {
        "text": swapped_texts,
        "label": ["A", "B", "C", "D"]
    }
    rearranged["answer"] = f"[ANSWER]{new_correct_answer}[/ANSWER]"
    return rearranged
def balance_answer_distribution_by_shuffling(questions: List[Dict[str, Any]],
                                             random_seed: Optional[int] = None) -> Tuple[List[Dict[str, Any]], Dict[str, Any]]:
    """
    Even out the A/B/C/D answer distribution by rearranging options.

    Each letter gets ``len(questions) // 4`` questions; the first
    ``len(questions) % 4`` letters get one extra. Questions are picked at
    random from over-represented letters and their options are swapped so the
    correct answer lands on an under-represented letter. The number of
    questions never changes. Progress is printed to stdout.

    Args:
        questions: Converted questions.
        random_seed: When not ``None``, seeds the module-level ``random``
            generator before any sampling.

    Returns:
        A tuple ``(balanced_questions, balance_info)``: a new list with the
        adjusted questions, and a dict of statistics (totals, targets, final
        distribution, maximum deviation from 25%, number of adjustments).
    """
    letters = ["A", "B", "C", "D"]

    if random_seed is not None:
        random.seed(random_seed)

    total_questions = len(questions)
    target_per_answer, remainder = divmod(total_questions, 4)

    print(f"\n=== 答案分布平衡 (重排选项法) ===")
    print(f"总题目数: {total_questions}")
    print(f"标准分配: 每个选项 {target_per_answer} 道题")
    if remainder > 0:
        print(f"余数: {remainder} 道题 (将分配给前{remainder}个选项)")

    # Group (index, question) pairs by their current answer letter.
    by_answer = {letter: [] for letter in letters}
    for position, item in enumerate(questions):
        letter = extract_answer_from_question(item)
        if letter and letter in by_answer:
            by_answer[letter].append((position, item))

    print(f"\n当前答案分布:")
    for letter in letters:
        count = len(by_answer[letter])
        share = count / total_questions if total_questions > 0 else 0
        print(f" {letter}: {count} ({share*100:.1f}%)")

    # Target per letter: the first `remainder` letters take one extra question.
    target_counts = {
        letter: target_per_answer + 1 if rank < remainder else target_per_answer
        for rank, letter in enumerate(letters)
    }

    print(f"\n目标分配:")
    for letter in letters:
        print(f" {letter}: {target_counts[letter]} 道题")

    # Work out which questions must leave a letter and which letters need more.
    surplus_questions = []  # (question_index, question, from_answer)
    deficit_needed = []     # (to_answer, count_needed)
    for letter in letters:
        excess = len(by_answer[letter]) - target_counts[letter]
        if excess > 0:
            # Over-represented: randomly pick the questions to move away.
            print(f" {letter}: 多 {excess} 道题")
            for q_idx, picked in random.sample(by_answer[letter], excess):
                surplus_questions.append((q_idx, picked, letter))
        elif excess < 0:
            # Under-represented: open one slot per missing question.
            shortfall = -excess
            print(f" {letter}: 少 {shortfall} 道题")
            deficit_needed.extend([(letter, 1)] * shortfall)

    # Shuffle both sides so the pairing is not biased by letter order.
    random.shuffle(surplus_questions)
    random.shuffle(deficit_needed)

    # Apply the adjustments on a copy of the input list.
    balanced_questions = questions[:]
    print(f"\n开始重新分配 {len(surplus_questions)} 道题:")
    pairs = zip(surplus_questions, deficit_needed)
    for step, ((q_idx, item, from_answer), (to_answer, _)) in enumerate(pairs, start=1):
        balanced_questions[q_idx] = shuffle_question_options(item, to_answer)
        print(f"{step}次调整: 题目{q_idx+1} 答案从 {from_answer} 改为 {to_answer}")

    # Recount the answers to verify the final distribution.
    final_counter = Counter()
    for item in balanced_questions:
        letter = extract_answer_from_question(item)
        if letter:
            final_counter[letter] += 1

    print(f"\n平衡后答案分布:")
    max_deviation = 0
    target_ratio = 0.25
    for letter in letters:
        count = final_counter.get(letter, 0)
        share = count / total_questions if total_questions > 0 else 0
        max_deviation = max(max_deviation, abs(share - target_ratio))
        print(f" {letter}: {count} ({share*100:.1f}%)")

    # Summary statistics returned to the caller.
    balance_info = {
        "original_total": total_questions,
        "final_total": total_questions,  # the number of questions is unchanged
        "target_per_answer": target_per_answer,
        "remainder": remainder,
        "final_distribution": dict(final_counter),
        "max_deviation": max_deviation,
        "adjustments_made": len(surplus_questions),
        "perfectly_balanced": max_deviation <= 0.05
    }

    if balance_info["perfectly_balanced"]:
        print(f"✅ 完美平衡!最大偏差: {max_deviation*100:.1f}%")
    else:
        print(f"📊 接近平衡,最大偏差: {max_deviation*100:.1f}%")
    print(f"总共调整了 {balance_info['adjustments_made']} 道题的答案")

    return balanced_questions, balance_info
def classify_questions_by_difficulty(questions: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """
    Group source questions by their sampled difficulty label.

    The label is read from
    ``question["generated_options"]["sampling_summary"]["difficulty_label"]``.
    A missing label, or one that is not a known group, goes to ``"unknown"``.

    Args:
        questions: Source question records.

    Returns:
        Dict with the keys ``hard_early_stop``, ``easy_all_correct``,
        ``mixed`` and ``unknown`` (in that order), each mapping to the list
        of questions carrying that label.
    """
    known_labels = (
        "hard_early_stop",   # hard: sampling stopped early after a wrong answer
        "easy_all_correct",  # easy: every sample answered correctly
        "mixed",             # mixed: some samples right, some wrong
        "unknown",           # difficulty not known
    )
    difficulty_groups = {label: [] for label in known_labels}

    for item in questions:
        summary = item.get("generated_options", {}).get("sampling_summary", {})
        label = summary.get("difficulty_label", "unknown")
        bucket_key = label if label in difficulty_groups else "unknown"
        difficulty_groups[bucket_key].append(item)

    return difficulty_groups
def select_questions_by_ratio(difficulty_groups: Dict[str, List[Dict[str, Any]]],
                              selection_ratios: Dict[str, float],
                              random_seed: Optional[int] = None) -> Tuple[List[Dict[str, Any]], Dict[str, Dict[str, Any]]]:
    """
    Randomly select a fraction of the questions in each difficulty group.

    Args:
        difficulty_groups: Questions grouped by difficulty label.
        selection_ratios: Fraction to keep per label. Labels missing from
            this mapping default to 0.0; values <= 0 select nothing and
            values >= 1 select the whole group. Fractional counts are
            truncated with ``int``.
        random_seed: When not ``None``, seeds the module-level ``random``
            generator before sampling.

    Returns:
        A tuple ``(selected_questions, selection_stats)``. The selected
        questions are shuffled across groups. ``selection_stats`` maps each
        label to a dict with ``total``, ``selected``, ``ratio_target`` and
        ``ratio_actual`` (the return annotation now reflects this nested
        shape).
    """
    if random_seed is not None:
        random.seed(random_seed)

    selected_questions = []
    selection_stats = {}

    for difficulty, questions in difficulty_groups.items():
        total_count = len(questions)
        ratio = selection_ratios.get(difficulty, 0.0)

        # Number of questions to take from this group.
        if ratio <= 0:
            selected_count = 0
        elif ratio >= 1:
            selected_count = total_count
        else:
            selected_count = int(total_count * ratio)

        # Sample only for a strict subset, so taking a whole group consumes
        # no random numbers.
        if selected_count > 0 and total_count > 0:
            if selected_count >= total_count:
                selected = questions
            else:
                selected = random.sample(questions, selected_count)
            selected_questions.extend(selected)
        else:
            selected = []

        selection_stats[difficulty] = {
            "total": total_count,
            "selected": len(selected),
            "ratio_target": ratio,
            "ratio_actual": len(selected) / total_count if total_count > 0 else 0
        }

    # Mix the groups together so the output is not ordered by difficulty.
    random.shuffle(selected_questions)
    return selected_questions, selection_stats
def batch_convert_questions_with_difficulty_filter(input_file: str,
                                                   output_file: str,
                                                   selection_ratios: Dict[str, float],
                                                   balance_answers: bool = True,
                                                   random_seed: Optional[int] = None) -> None:
    """
    Convert a question file to the target format with difficulty filtering.

    Pipeline: load the JSON input, group questions by difficulty, sample each
    group by the given ratio, convert the sampled questions, optionally
    balance the A/B/C/D answer distribution, and write the result as a JSON
    list. Progress and statistics are printed to stdout.

    Args:
        input_file: Path of the JSON input; either a list of questions or a
            dict with a ``"questions"`` list.
        output_file: Path the converted JSON list is written to.
        selection_ratios: Fraction of questions to keep per difficulty label.
        balance_answers: Whether to balance the answer distribution.
        random_seed: Seed passed to the sampling and balancing steps.

    Raises:
        ValueError: If the input JSON is neither a list nor a dict holding a
            ``"questions"`` key.
    """
    print("=== 批量转换题目(难度筛选 + 答案平衡)===")
    print(f"输入文件: {input_file}")
    print(f"输出文件: {output_file}")
    print(f"答案平衡: {'开启' if balance_answers else '关闭'}")
    print(f"随机种子: {random_seed}")

    # Load the data.
    print("\n正在加载数据...")
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Two input layouts are accepted.
    if isinstance(data, dict) and "questions" in data:
        source_questions = data["questions"]
        print(f"检测到完整格式数据,包含其他元数据")
    elif isinstance(data, list):
        source_questions = data
        print(f"检测到题目列表格式")
    else:
        raise ValueError("不支持的输入文件格式")
    print(f"加载了 {len(source_questions)} 道题目")

    # Group the questions by difficulty.
    print("\n正在按难度分类题目...")
    difficulty_groups = classify_questions_by_difficulty(source_questions)
    print("题目难度分布:")
    total_multiple_choice = 0
    for difficulty, questions in difficulty_groups.items():
        # Count the multiple-choice questions within this difficulty group.
        mc_count = sum(1 for q in questions
                       if q.get("generated_options", {}).get("question_type") == "multiple_choice")
        total_multiple_choice += mc_count
        print(f" {difficulty}: {len(questions)} 道总题目, {mc_count} 道单选题")
    print(f"可转换的单选题总数: {total_multiple_choice}")

    # Sample each group by its ratio.
    print("\n正在按比例选择题目...")
    print("选择比例设置:")
    for difficulty, ratio in selection_ratios.items():
        if difficulty in difficulty_groups:
            print(f" {difficulty}: {ratio*100:.1f}%")
    selected_questions, selection_stats = select_questions_by_ratio(
        difficulty_groups, selection_ratios, random_seed
    )

    print(f"\n题目选择结果:")
    total_selected = 0
    for difficulty, stats in selection_stats.items():
        print(f" {difficulty}:")
        print(f" 总数: {stats['total']}")
        print(f" 选中: {stats['selected']}")
        print(f" 目标比例: {stats['ratio_target']*100:.1f}%")
        print(f" 实际比例: {stats['ratio_actual']*100:.1f}%")
        total_selected += stats['selected']
    print(f"总共选中: {total_selected} 道题目")

    # Convert the selected questions.
    print("\n正在转换题目格式...")
    converted_questions = []
    conversion_stats = {
        "selected": total_selected,
        "multiple_choice": 0,
        "true_false": 0,
        "other": 0,
        "converted": 0,
        "failed": 0
    }
    for i, question in enumerate(selected_questions):
        # Best effort per question: one malformed record must not abort the batch.
        try:
            # Tally the question type.
            generated_options = question.get("generated_options", {})
            question_type = generated_options.get("question_type", "unknown")
            if question_type == "multiple_choice":
                conversion_stats["multiple_choice"] += 1
            elif question_type == "true_false":
                conversion_stats["true_false"] += 1
            else:
                conversion_stats["other"] += 1

            # Convert; a None result means the record was not convertible.
            converted = convert_to_target_format(question)
            if converted:
                converted_questions.append(converted)
                conversion_stats["converted"] += 1
            else:
                conversion_stats["failed"] += 1
        except Exception as e:
            print(f"{i+1}题转换失败: {e}")
            conversion_stats["failed"] += 1
    print(f"转换完成: {conversion_stats['converted']} 道题目成功转换")

    # Balance the answer distribution of the converted questions.
    balance_info = None
    if balance_answers and converted_questions:
        print("\n正在对转换后的题目进行答案分布平衡...")
        balanced_questions, balance_info = balance_answer_distribution_by_shuffling(
            converted_questions,
            random_seed=random_seed
        )
        converted_questions = balanced_questions
        conversion_stats["final_count"] = len(converted_questions)

    # Save the result.
    print("正在保存转换结果...")
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(converted_questions, f, ensure_ascii=False, indent=2)

    # Final statistics.
    selected_total = conversion_stats['selected']
    final_count = conversion_stats.get('final_count', conversion_stats['converted'])
    print(f"\n=== 转换完成!===")
    print(f"选中题目数: {selected_total}")
    print(f"单选题: {conversion_stats['multiple_choice']}")
    print(f"判断题: {conversion_stats['true_false']}")
    print(f"其他类型: {conversion_stats['other']}")
    print(f"成功转换: {conversion_stats['converted']}")
    print(f"转换失败: {conversion_stats['failed']}")
    if balance_answers and balance_info:
        print(f"答案平衡后: {final_count}")
        print(f"调整题目数: {balance_info['adjustments_made']}")
        rate_numerator = final_count
    else:
        rate_numerator = conversion_stats['converted']
    # Guard the rate: with nothing selected the division would raise
    # ZeroDivisionError after the output file has already been written.
    final_rate = rate_numerator / selected_total * 100 if selected_total > 0 else 0.0
    print(f"最终转换率: {final_rate:.1f}%")
    print(f"结果已保存到: {output_file}")
def validate_converted_questions(questions: List[Dict[str, Any]]) -> Dict[str, int]:
    """
    Check converted questions against the target format and count problems.

    A question is valid when all three checks pass:

    * ``question`` is a non-blank string;
    * ``choices`` holds exactly four non-blank texts and the labels
      ``["A", "B", "C", "D"]``;
    * ``answer`` has the form ``[ANSWER]X[/ANSWER]`` with X in A-D.

    Fields of the wrong type (for example JSON ``null``) are counted as
    invalid rather than raising, so one malformed record cannot abort the
    whole validation.

    Args:
        questions: Converted questions to check.

    Returns:
        Counters: ``total``, ``valid``, ``invalid``, ``missing_question``,
        ``invalid_choices`` and ``invalid_answer``. One question can add to
        several of the last three.
    """
    expected_labels = ["A", "B", "C", "D"]
    stats = {
        "total": len(questions),
        "valid": 0,
        "invalid": 0,
        "missing_question": 0,
        "invalid_choices": 0,
        "invalid_answer": 0
    }

    for q in questions:
        # A non-dict record fails every check below.
        record = q if isinstance(q, dict) else {}

        # Check the question field.
        question_text = record.get("question", "")
        question_ok = isinstance(question_text, str) and bool(question_text.strip())

        # Check the choices field.
        choices = record.get("choices", {})
        if not isinstance(choices, dict):
            choices = {}
        text_list = choices.get("text", [])
        label_list = choices.get("label", [])
        choices_ok = (
            isinstance(text_list, (list, tuple))
            and len(text_list) == 4
            and label_list == expected_labels
            and all(str(text).strip() for text in text_list)
        )

        # Check the answer field.
        answer = record.get("answer", "")
        answer_ok = (
            isinstance(answer, str)
            and answer.startswith("[ANSWER]")
            and answer.endswith("[/ANSWER]")
            and answer[8:-9] in expected_labels
        )

        if not question_ok:
            stats["missing_question"] += 1
        if not choices_ok:
            stats["invalid_choices"] += 1
        if not answer_ok:
            stats["invalid_answer"] += 1

        if question_ok and choices_ok and answer_ok:
            stats["valid"] += 1
        else:
            stats["invalid"] += 1

    return stats
def main():
    """
    Run the conversion pipeline with the hard-coded configuration below.

    Converts the sampled source questions into the target format, keeping
    only the difficulty groups enabled in ``SELECTION_RATIOS``, then re-reads
    the output file to validate its format and print the final answer
    distribution. Any exception is reported with a traceback rather than
    propagated.
    """
    # File paths.
    INPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepy_complete_choice_questions_with_sampling.json"
    OUTPUT_FILE = "/home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered_only_hard.json"

    # Fraction of each difficulty group to keep.
    SELECTION_RATIOS = {
        "hard_early_stop": 1.0,   # hard questions: keep all
        "easy_all_correct": 0.0,  # easy questions: keep none
        "mixed": 0.0,             # mixed questions: keep none
        "unknown": 0.0            # unknown difficulty: keep none
    }

    # Fixed seed so runs are reproducible.
    RANDOM_SEED = 42
    # Whether to balance the answer distribution.
    BALANCE_ANSWERS = True

    try:
        # Show the configuration.
        print("=== 难度筛选配置 ===")
        print("选择比例:")
        for difficulty, ratio in SELECTION_RATIOS.items():
            print(f" {difficulty}: {ratio*100:.1f}%")
        print(f"随机种子: {RANDOM_SEED}")
        print(f"启用答案平衡: {BALANCE_ANSWERS}")
        print()

        # Batch conversion (difficulty filter plus answer balancing).
        batch_convert_questions_with_difficulty_filter(
            INPUT_FILE,
            OUTPUT_FILE,
            SELECTION_RATIOS,
            balance_answers=BALANCE_ANSWERS,
            random_seed=RANDOM_SEED
        )

        # Validate what was actually written to disk.
        print("\n正在验证转换结果...")
        with open(OUTPUT_FILE, 'r', encoding='utf-8') as f:
            result_data = json.load(f)
        validation_stats = validate_converted_questions(result_data)

        print(f"\n=== 验证结果 ===")
        print(f"总题目数: {validation_stats['total']}")
        print(f"格式正确: {validation_stats['valid']}")
        print(f"格式错误: {validation_stats['invalid']}")
        if validation_stats['invalid'] > 0:
            print(f" 缺少题目: {validation_stats['missing_question']}")
            print(f" 选项格式错误: {validation_stats['invalid_choices']}")
            print(f" 答案格式错误: {validation_stats['invalid_answer']}")
        # An empty output file would otherwise raise ZeroDivisionError here.
        total_validated = validation_stats['total']
        valid_rate = validation_stats['valid'] / total_validated * 100 if total_validated > 0 else 0.0
        print(f"格式正确率: {valid_rate:.1f}%")

        # Verify the final answer distribution.
        if BALANCE_ANSWERS:
            print(f"\n=== 最终答案分布验证 ===")
            final_answers = []
            for q in result_data:
                answer = extract_answer_from_question(q)
                if answer:
                    final_answers.append(answer)
            final_counter = Counter(final_answers)
            total = len(final_answers)
            for answer in ["A", "B", "C", "D"]:
                count = final_counter.get(answer, 0)
                ratio = count / total if total > 0 else 0
                print(f" {answer}: {count} ({ratio*100:.1f}%)")
    except Exception as e:
        # Top-level boundary: report the failure with a full traceback.
        print(f"程序执行失败: {e}")
        import traceback
        traceback.print_exc()
# Run the conversion pipeline only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -1,40 +0,0 @@
2025-05-28 15:30:36,536 - __main__ - INFO - Starting multi-model evaluation framework
2025-05-28 15:30:36,536 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
2025-05-28 15:30:36,543 - __main__ - INFO - Output directory: results/20250528_1530
2025-05-28 15:30:36,543 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:30:36,568 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:30:36,569 - src.data_loader - INFO - Validated 3023 out of 3023 items
2025-05-28 15:30:36,569 - __main__ - INFO - Loaded 3023 valid data items
2025-05-28 15:30:36,569 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
2025-05-28 15:30:36,569 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
2025-05-28 15:30:36,595 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:30:38,447 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,461 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,485 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,499 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,549 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,613 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:38,630 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:39,998 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:40,267 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:40,287 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:30:40,288 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:30:40,302 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1530/qwen-max-2025-01-25.json
2025-05-28 15:30:40,302 - __main__ - INFO - Evaluating model 2/2: gpt-4o
2025-05-28 15:30:40,302 - __main__ - INFO - Starting evaluation for model: gpt-4o
2025-05-28 15:30:40,352 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:30:41,778 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:41,794 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:41,826 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,016 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,026 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,040 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,041 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,076 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,295 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,313 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:30:42,323 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:30:42,323 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:30:42,333 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1530/gpt-4o.json
2025-05-28 15:30:42,333 - __main__ - ERROR - Evaluation failed: 'summary_filename'

View File

@@ -1,41 +0,0 @@
2025-05-28 15:31:25,896 - __main__ - INFO - Starting multi-model evaluation framework
2025-05-28 15:31:25,896 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
2025-05-28 15:31:25,899 - __main__ - INFO - Output directory: results/20250528_1531
2025-05-28 15:31:25,899 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:31:25,925 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:31:25,927 - src.data_loader - INFO - Validated 3023 out of 3023 items
2025-05-28 15:31:25,927 - __main__ - INFO - Loaded 3023 valid data items
2025-05-28 15:31:25,927 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
2025-05-28 15:31:25,927 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
2025-05-28 15:31:25,952 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:31:28,342 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,434 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,444 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,459 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,538 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:28,703 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:30,085 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:30,353 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:30,374 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:31:30,374 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:31:30,387 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1531/qwen-max-2025-01-25.json
2025-05-28 15:31:30,387 - __main__ - INFO - Evaluating model 2/2: gpt-4o
2025-05-28 15:31:30,387 - __main__ - INFO - Starting evaluation for model: gpt-4o
2025-05-28 15:31:30,436 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:31:31,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:31,886 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,119 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,139 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,140 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,162 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,449 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:32,539 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:38,330 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:31:38,351 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:31:38,351 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:31:38,366 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1531/gpt-4o.json
2025-05-28 15:31:38,372 - __main__ - INFO - Summary saved to results/20250528_1531/summary.json
2025-05-28 15:31:38,372 - __main__ - INFO - Multi-model evaluation completed successfully

View File

@@ -1,44 +0,0 @@
2025-05-28 15:35:59,778 - __main__ - INFO - Starting multi-model evaluation framework
2025-05-28 15:35:59,779 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o']
2025-05-28 15:35:59,782 - __main__ - INFO - Output directory: results/20250528_1535
2025-05-28 15:35:59,782 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:35:59,808 - src.data_loader - INFO - Successfully loaded 3023 items from /home/ubuntu/50T/LYT/MatBench/layer1/ALL-merge/merged.json
2025-05-28 15:35:59,809 - src.data_loader - INFO - Validated 3023 out of 3023 items
2025-05-28 15:35:59,809 - __main__ - INFO - Loaded 3023 valid data items
2025-05-28 15:35:59,809 - __main__ - INFO - Evaluating model 1/2: qwen-max-2025-01-25
2025-05-28 15:35:59,809 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
2025-05-28 15:35:59,835 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:36:01,694 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,780 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,809 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,853 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,876 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:01,910 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:02,847 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:02,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:03,432 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:03,454 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:36:03,454 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:36:03,477 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250528_1535/qwen-max-2025-01-25.json
2025-05-28 15:36:03,480 - __main__ - INFO - Evaluating model 2/2: gpt-4o
2025-05-28 15:36:03,481 - __main__ - INFO - Starting evaluation for model: gpt-4o
2025-05-28 15:36:03,534 - src.evaluator - INFO - Starting evaluation with 8 workers
2025-05-28 15:36:04,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,895 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,930 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:04,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:05,474 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:05,495 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-05-28 15:36:05,514 - src.metrics - INFO - Metrics computed successfully
2025-05-28 15:36:05,515 - src.evaluator - INFO - Evaluation completed successfully
2025-05-28 15:36:05,532 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250528_1535/gpt-4o.json
2025-05-28 15:36:05,564 - root - WARNING - openpyxl not installed, skipping Excel export
2025-05-28 15:36:05,564 - root - INFO - Summary saved to results/20250528_1535/summary.json
2025-05-28 15:36:05,564 - root - INFO - CSV summary saved to results/20250528_1535/summary.csv
2025-05-28 15:36:05,568 - __main__ - INFO - Summary saved to results/20250528_1535/summary.json
2025-05-28 15:36:05,568 - __main__ - INFO - Multi-model evaluation completed successfully

View File

@@ -0,0 +1,823 @@
2025-06-02 17:06:22,367 - __main__ - INFO - Starting multi-model evaluation framework
2025-06-02 17:06:22,367 - __main__ - INFO - Using models from config: ['qwen-max-2025-01-25', 'gpt-4o', 'deepseek-chat', 'claude-sonnet-4-20250514']
2025-06-02 17:06:22,375 - __main__ - INFO - Output directory: results/20250602_1706
2025-06-02 17:06:22,375 - __main__ - INFO - Loading data from /home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json
2025-06-02 17:06:22,383 - src.data_loader - INFO - Successfully loaded 197 items from /home/ubuntu/50T/LYT/MatBench/layer2/PGEE/code/stepz_final_choice_questions_filtered.json
2025-06-02 17:06:22,383 - src.data_loader - INFO - Validated 197 out of 197 items
2025-06-02 17:06:22,383 - __main__ - INFO - Loaded 197 valid data items
2025-06-02 17:06:22,383 - __main__ - INFO - Evaluating model 1/4: qwen-max-2025-01-25
2025-06-02 17:06:22,383 - __main__ - INFO - Starting evaluation for model: qwen-max-2025-01-25
2025-06-02 17:06:22,397 - src.evaluator - INFO - Starting evaluation with 20 workers
2025-06-02 17:06:27,556 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:28,067 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:28,083 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:29,235 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:29,580 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:30,505 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:30,927 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:31,133 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:32,566 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:33,931 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:33,975 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:34,202 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:34,535 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:34,651 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:35,604 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:35,674 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:35,850 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:36,938 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:37,850 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:39,253 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:39,873 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:40,638 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:40,905 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:41,205 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:42,082 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:42,084 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:42,278 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:42,320 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:44,775 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:44,966 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:45,105 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:45,124 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:45,379 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:47,826 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:48,141 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:49,054 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:49,407 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:50,257 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:50,816 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:50,916 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:53,824 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:54,561 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:56,255 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:56,377 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:56,757 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:57,682 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:58,631 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:59,155 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:06:59,336 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:00,072 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:00,698 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:01,346 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:01,703 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:02,402 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:03,950 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:05,043 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:06,978 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:07,273 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:08,481 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:09,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:09,926 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:11,147 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:12,260 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:13,947 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:14,365 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:14,848 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:15,525 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:16,906 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:18,510 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:18,512 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:18,900 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:19,968 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:20,000 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:20,875 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:21,000 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:21,438 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:21,867 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:23,150 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:23,208 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:23,496 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:23,746 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:24,163 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:24,490 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:25,599 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:25,830 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:25,872 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:26,360 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:27,368 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:31,526 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:31,655 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:32,287 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:33,964 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:34,321 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:36,604 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:36,804 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:36,898 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:37,168 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:37,843 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:39,394 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:40,203 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:40,313 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:40,380 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:40,732 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:40,884 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:41,842 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:43,360 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:43,858 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:44,498 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:45,213 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:45,689 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:46,445 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:47,758 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:48,461 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:49,233 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:50,428 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:53,021 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:53,065 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:53,991 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:55,105 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:55,609 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:57,173 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:57,914 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:57,946 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:58,589 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:59,048 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:59,236 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:07:59,336 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:00,414 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:00,825 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:04,168 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:04,638 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:05,529 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:06,329 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:06,668 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:07,694 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:07,723 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:09,602 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:09,692 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:10,664 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:12,074 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:13,327 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:13,415 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:13,528 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:14,298 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:16,775 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:17,697 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:19,766 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:20,612 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:21,645 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:22,199 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:22,259 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:22,423 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:23,045 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:24,696 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:25,524 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:25,648 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:26,453 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:27,423 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:28,450 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:29,077 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:29,338 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:29,723 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:31,768 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:32,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:33,146 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:33,286 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:33,638 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:34,403 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:34,648 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:35,500 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:36,198 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:36,448 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:36,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:37,398 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:38,931 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:38,994 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:39,068 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:40,378 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:40,760 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:42,257 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:43,965 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:44,045 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:44,499 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:44,698 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:45,413 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:45,431 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:45,694 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:47,039 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:48,616 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:49,185 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:57,886 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:08:59,704 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:03,029 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:08,130 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:09,249 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:11,560 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:23,846 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:24,096 - src.metrics - INFO - Metrics computed successfully
2025-06-02 17:09:24,098 - src.evaluator - INFO - Evaluation completed successfully
2025-06-02 17:09:24,221 - __main__ - INFO - Model qwen-max-2025-01-25 evaluation completed. Results saved to results/20250602_1706/qwen-max-2025-01-25.json
2025-06-02 17:09:24,225 - __main__ - INFO - Evaluating model 2/4: gpt-4o
2025-06-02 17:09:24,226 - __main__ - INFO - Starting evaluation for model: gpt-4o
2025-06-02 17:09:24,237 - src.evaluator - INFO - Starting evaluation with 20 workers
2025-06-02 17:09:26,567 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,760 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,773 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,823 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,863 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,933 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:26,949 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,104 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,243 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,288 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,324 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,587 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,706 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:27,841 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,065 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,067 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,164 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,196 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,243 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:28,775 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,114 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,117 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,148 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,325 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,610 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,669 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:29,932 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:30,034 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:30,486 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:30,708 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:30,871 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:31,115 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:31,359 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:31,639 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:31,730 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:31,803 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:32,336 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:32,737 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:32,921 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:32,924 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:32,938 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:33,038 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:33,706 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:33,777 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:33,961 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,013 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,026 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,139 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,297 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,351 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,589 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,724 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,848 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:34,865 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,308 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,399 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,647 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,672 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,686 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:35,926 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:36,141 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:36,705 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:36,732 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:36,969 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:37,670 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:37,884 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:37,887 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:38,022 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:38,115 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:38,345 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,185 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,202 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,214 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,262 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,359 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,616 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:39,769 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:40,083 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:40,291 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:40,543 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:40,657 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,135 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,175 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,189 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,291 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,596 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,622 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:41,710 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,050 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,196 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,318 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,321 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,578 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,724 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:42,970 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:43,311 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:43,327 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:43,561 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:43,784 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:43,873 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:44,227 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:44,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:44,982 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,016 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,126 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,190 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,221 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,411 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,500 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,604 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:45,933 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:46,349 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:46,459 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:46,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:46,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:46,928 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,025 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,078 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,089 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,131 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,864 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:47,961 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:48,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:48,161 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:48,245 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:48,300 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:48,934 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:49,030 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,192 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,194 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,203 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,252 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,452 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,454 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,459 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:50,763 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:51,289 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:51,381 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:51,777 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:51,834 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,055 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,250 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,305 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,464 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,496 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,553 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:52,555 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,097 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,266 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,367 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,495 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,511 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,649 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,766 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,779 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,810 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,909 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:53,911 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:54,247 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:54,422 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:54,792 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:54,864 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:54,978 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:55,274 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:55,290 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:55,834 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:55,861 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:55,928 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:56,031 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:56,323 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:56,480 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:56,537 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:57,175 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:57,397 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:57,740 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,008 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,010 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,254 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,356 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,409 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,537 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:58,783 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:59,056 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:59,110 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:59,179 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:59,190 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:09:59,199 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:00,382 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:00,603 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:00,606 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:00,688 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:01,458 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:04,027 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:04,382 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:05,084 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:06,126 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:06,195 - src.metrics - INFO - Metrics computed successfully
2025-06-02 17:10:06,196 - src.evaluator - INFO - Evaluation completed successfully
2025-06-02 17:10:06,322 - __main__ - INFO - Model gpt-4o evaluation completed. Results saved to results/20250602_1706/gpt-4o.json
2025-06-02 17:10:06,324 - __main__ - INFO - Evaluating model 3/4: deepseek-chat
2025-06-02 17:10:06,325 - __main__ - INFO - Starting evaluation for model: deepseek-chat
2025-06-02 17:10:06,337 - src.evaluator - INFO - Starting evaluation with 20 workers
2025-06-02 17:10:14,175 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:14,676 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:14,999 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:15,243 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:15,535 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:16,332 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:16,988 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:18,787 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:19,008 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:20,456 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:20,572 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:20,593 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:20,933 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:21,496 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:21,626 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:21,642 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:22,437 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:22,965 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:23,906 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:24,801 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:25,880 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:27,693 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:29,599 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:31,688 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:32,170 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:32,713 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:32,874 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:33,633 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:34,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:34,451 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:35,599 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:36,176 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:37,368 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:38,398 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:38,464 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:39,129 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:40,059 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:40,664 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:40,666 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:42,099 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:42,664 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:43,724 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:45,162 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:46,733 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:48,045 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:48,914 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:49,596 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:50,158 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:50,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:50,711 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:50,729 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:53,213 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:53,260 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:53,580 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:54,922 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:55,271 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:55,721 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:58,319 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:10:58,551 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:03,062 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:03,629 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:03,698 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:03,802 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:05,296 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:05,729 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:07,539 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:08,059 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:08,762 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:09,200 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:09,689 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:09,780 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:12,117 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:12,149 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:12,266 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:13,327 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:15,324 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:16,121 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:16,393 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:16,662 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:17,062 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:18,199 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:19,246 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:19,371 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:19,963 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:21,835 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:23,968 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:24,435 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:25,015 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:25,068 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:25,228 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:25,573 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:27,025 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:27,175 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:27,452 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:28,007 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:28,678 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:30,475 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:30,752 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:32,818 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:32,851 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:33,668 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:34,634 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:35,885 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:38,443 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:38,794 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:39,096 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:40,712 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:40,756 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:40,840 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:41,311 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:42,043 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:42,593 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:42,695 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:44,777 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:47,394 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:47,773 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:48,054 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:48,503 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:48,997 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:49,029 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:50,701 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:51,891 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:52,709 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:54,569 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:54,736 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:54,739 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:54,896 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:55,848 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:56,348 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:56,929 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:57,619 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:58,070 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:11:59,492 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:00,645 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:01,555 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:02,216 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:04,727 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:05,112 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:05,832 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:06,114 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:06,328 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:06,514 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:06,530 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:06,841 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:09,543 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:09,815 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:11,223 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:12,259 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:13,900 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:14,486 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:15,390 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:16,015 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:16,916 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:18,076 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:21,148 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:21,885 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:22,544 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:23,156 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:24,151 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:24,209 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:25,215 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:25,755 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:25,970 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:27,300 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:27,815 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:28,303 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:29,098 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:30,417 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:30,847 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:32,608 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:32,689 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:32,701 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:32,901 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:33,983 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:34,628 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:37,382 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:38,602 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:40,085 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:41,082 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:41,310 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:41,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:42,148 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:42,681 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:43,604 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:43,919 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:44,574 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:44,646 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:45,770 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:47,420 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:48,816 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:49,959 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:52,279 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:53,424 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:59,771 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:12:59,802 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:01,793 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:08,510 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:08,580 - src.metrics - INFO - Metrics computed successfully
2025-06-02 17:13:08,581 - src.evaluator - INFO - Evaluation completed successfully
2025-06-02 17:13:08,712 - __main__ - INFO - Model deepseek-chat evaluation completed. Results saved to results/20250602_1706/deepseek-chat.json
2025-06-02 17:13:08,714 - __main__ - INFO - Evaluating model 4/4: claude-sonnet-4-20250514
2025-06-02 17:13:08,715 - __main__ - INFO - Starting evaluation for model: claude-sonnet-4-20250514
2025-06-02 17:13:08,726 - src.evaluator - INFO - Starting evaluation with 20 workers
2025-06-02 17:13:18,120 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:18,308 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:18,972 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:19,506 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:19,790 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:20,070 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:20,129 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:20,773 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:22,992 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:23,429 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:23,695 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:25,992 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:27,120 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:27,215 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:28,188 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:28,627 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:28,642 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:28,900 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:29,068 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:29,926 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:31,295 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:31,641 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:32,637 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:33,734 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:35,334 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:35,636 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:35,949 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:36,367 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:36,622 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:38,493 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:39,060 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:40,029 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:40,998 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:41,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:43,149 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:43,253 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:44,215 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:44,819 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:45,945 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:46,359 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:46,575 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:48,250 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:48,344 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:49,232 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:49,375 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:50,233 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:50,442 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:51,208 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:51,497 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:53,654 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:53,671 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:55,609 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:55,706 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:56,709 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:57,206 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:57,697 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:57,773 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:58,546 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:58,777 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:59,365 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:13:59,554 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:00,390 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:00,547 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:01,556 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:02,049 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:03,059 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:05,096 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:05,138 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:05,240 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:05,267 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:07,179 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:07,543 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:08,209 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:08,991 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:09,237 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:09,774 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:11,124 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:11,789 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:12,641 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:12,976 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:13,419 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:13,662 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:14,242 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:15,080 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:15,197 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:15,368 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:15,820 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:16,260 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:16,304 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:16,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:17,313 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:18,532 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:19,372 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:21,172 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:21,818 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:22,137 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:23,878 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:24,654 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:25,249 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:25,641 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:26,055 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:26,174 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:27,143 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:27,285 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:27,960 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:28,337 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:28,470 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:28,643 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:29,530 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:30,163 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:30,375 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:32,492 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:33,179 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:34,128 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:34,403 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:34,471 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:35,144 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:35,952 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:36,063 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:36,400 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:37,603 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:37,642 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:38,343 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:38,764 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:39,279 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:39,534 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:40,513 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:41,275 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:41,815 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:42,309 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:42,437 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:42,933 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:43,157 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:45,237 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:45,463 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:45,480 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:47,039 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:47,211 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:47,708 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:48,054 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:50,469 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:50,500 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:50,857 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:50,995 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:51,513 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:52,638 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:52,743 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:54,371 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:54,920 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:56,186 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:56,442 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:56,573 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:57,630 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:57,939 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:14:58,855 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:00,158 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:00,336 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:01,376 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:01,745 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:02,322 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:02,799 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:02,987 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:03,913 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:04,841 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:04,916 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:05,938 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:07,171 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:07,538 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:08,331 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:08,450 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:08,890 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:09,244 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:10,100 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:10,193 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:11,819 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:12,556 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:12,689 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:12,844 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:12,853 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:15,530 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:16,004 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:16,583 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,095 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,100 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,141 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,185 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,299 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:18,738 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:19,249 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:19,507 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:19,520 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:20,020 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:20,669 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:21,432 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:21,656 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:21,877 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:23,523 - httpx - INFO - HTTP Request: POST https://vip.apiyi.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-06-02 17:15:23,594 - src.metrics - INFO - Metrics computed successfully
2025-06-02 17:15:23,595 - src.evaluator - INFO - Evaluation completed successfully
2025-06-02 17:15:23,731 - __main__ - INFO - Model claude-sonnet-4-20250514 evaluation completed. Results saved to results/20250602_1706/claude-sonnet-4-20250514.json
2025-06-02 17:15:24,077 - root - INFO - Summary saved to results/20250602_1706/summary.json
2025-06-02 17:15:24,077 - root - INFO - CSV summary saved to results/20250602_1706/summary.csv
2025-06-02 17:15:24,084 - __main__ - INFO - Summary saved to results/20250602_1706/summary.json
2025-06-02 17:15:24,085 - __main__ - INFO - Multi-model evaluation completed successfully

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:30:42.329641",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:30:40.296801",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:31:38.361064",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:31:30.382105",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,60 +0,0 @@
{
"timestamp": "2025-05-28T15:31:38.366535",
"models_count": 2,
"models": {
"qwen-max-2025-01-25": {
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
},
"data_count": 10
},
"gpt-4o": {
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
},
"data_count": 10
}
},
"comparison": {
"accuracy": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"precision_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"recall_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"f1_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"precision_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"recall_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"f1_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
}
}
}

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:36:05.524328",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,202 +0,0 @@
[
{
"index": 0,
"question": "Copper is an element that is used in electrical wires. What is the smallest unit of copper that still maintains the characteristics of copper?",
"choices": {
"text": [
"the atom",
"the electron",
"the nucleus",
"the proton"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 1,
"question": "Which statement correctly describes a property of a type of matter?",
"choices": {
"text": [
"Air is a mixture of gases.",
"Ice is a mixture of gases.",
"Air is a liquid.",
"Ice is a liquid."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 2,
"question": "Which statement best explains why a tree branch floats on water?",
"choices": {
"text": [
"Wood is porous.",
"Wood is buoyant.",
"Wood is light.",
"Wood is magnetic."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 3,
"question": "The best way to separate salt from water is with the use of",
"choices": {
"text": [
"oil.",
"heat.",
"a magnet.",
"rubbing alcohol."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 4,
"question": "The speed of a sound wave varies as it travels through different substances. Which factor will most affect the speed of a sound wave?",
"choices": {
"text": [
"the frequency of the wave",
"the wavelength of the wave",
"the source that created the sound",
"the distance between molecules in the medium"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]D[/ANSWER]",
"llm_answer": "[ANSWER]D[/ANSWER]"
},
{
"index": 5,
"question": "Some students are performing hardness tests on several substances. X scratches Y. Y scratches Z. Z scratches W. Which of these statements best describes substance W's hardness?",
"choices": {
"text": [
"W is the softest of the four substances tested.",
"W is the hardest of the four substances tested.",
"W can scratch Y.",
"W can scratch X."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]A[/ANSWER]",
"llm_answer": "[ANSWER]A[/ANSWER]"
},
{
"index": 6,
"question": "When the temperature of a sample of 25 water is -5°C, the water is",
"choices": {
"text": [
"a gas.",
"a liquid.",
"a solid.",
"a vapor."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 7,
"question": "Which is most useful to a student who is separating aluminum screws from steel screws?",
"choices": {
"text": [
"a large funnel",
"a screen filter",
"a horseshoe magnet",
"a magnifying glass"
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]C[/ANSWER]",
"llm_answer": "[ANSWER]C[/ANSWER]"
},
{
"index": 8,
"question": "How are sedimentary rocks made?",
"choices": {
"text": [
"Magma or lava is cooled.",
"Materials are pressed together.",
"Chemical reactions change minerals.",
"Earthquakes cause small pieces to fall."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
},
{
"index": 9,
"question": "A polished metal ball looks very shiny and bright on a sunny day. What makes the ball look shiny?",
"choices": {
"text": [
"The ball makes light.",
"The ball reflects light.",
"The ball absorbs light and then releases it.",
"The ball absorbs light and keeps it inside."
],
"label": [
"A",
"B",
"C",
"D"
]
},
"answer": "[ANSWER]B[/ANSWER]",
"llm_answer": "[ANSWER]B[/ANSWER]"
}
]

View File

@@ -1,12 +0,0 @@
{
"timestamp": "2025-05-28T15:36:03.466534",
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
}
}

View File

@@ -1,3 +0,0 @@
Model,accuracy,precision_micro,recall_micro,f1_micro,precision_macro,recall_macro,f1_macro,Data Count
qwen-max-2025-01-25,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10
gpt-4o,1.0,1.0,1.0,1.0,1.0,1.0,1.0,10
1 Model accuracy precision_micro recall_micro f1_micro precision_macro recall_macro f1_macro Data Count
2 qwen-max-2025-01-25 1.0 1.0 1.0 1.0 1.0 1.0 1.0 10
3 gpt-4o 1.0 1.0 1.0 1.0 1.0 1.0 1.0 10

View File

@@ -1,60 +0,0 @@
{
"timestamp": "2025-05-28T15:36:05.540751",
"models_count": 2,
"models": {
"qwen-max-2025-01-25": {
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
},
"data_count": 10
},
"gpt-4o": {
"metrics": {
"accuracy": 1.0,
"precision_micro": 1.0,
"recall_micro": 1.0,
"f1_micro": 1.0,
"precision_macro": 1.0,
"recall_macro": 1.0,
"f1_macro": 1.0
},
"data_count": 10
}
},
"comparison": {
"accuracy": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"precision_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"recall_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"f1_micro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"precision_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"recall_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
},
"f1_macro": {
"qwen-max-2025-01-25": 1.0,
"gpt-4o": 1.0
}
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
{
"timestamp": "2025-06-02T17:15:23.726253",
"metrics": {
"accuracy": 0.700507614213198,
"precision_micro": 0.6934673366834171,
"recall_micro": 0.700507614213198,
"f1_micro": 0.696969696969697,
"precision_macro": 0.7072180484244438,
"recall_macro": 0.7009183673469388,
"f1_macro": 0.69833034513671
}
}

File diff suppressed because one or more lines are too long

View File

@@ -0,0 +1,12 @@
{
"timestamp": "2025-06-02T17:13:08.707748",
"metrics": {
"accuracy": 0.6700507614213198,
"precision_micro": 0.676923076923077,
"recall_micro": 0.6700507614213198,
"f1_micro": 0.673469387755102,
"precision_macro": 0.6899114693446089,
"recall_macro": 0.6705102040816326,
"f1_macro": 0.6754210676562946
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
{
"timestamp": "2025-06-02T17:10:06.316348",
"metrics": {
"accuracy": 0.5482233502538071,
"precision_micro": 0.5618556701030928,
"recall_micro": 0.5532994923857868,
"f1_micro": 0.5575447570332481,
"precision_macro": 0.5779088050314465,
"recall_macro": 0.5536734693877551,
"f1_macro": 0.5600088997453159
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,12 @@
{
"timestamp": "2025-06-02T17:09:24.216653",
"metrics": {
"accuracy": 0.6446700507614214,
"precision_micro": 0.6336633663366337,
"recall_micro": 0.649746192893401,
"f1_micro": 0.6416040100250626,
"precision_macro": 0.6388760049474336,
"recall_macro": 0.6501020408163265,
"f1_macro": 0.64232342205538
}
}

View File

@@ -0,0 +1,5 @@
Model,accuracy,precision_micro,recall_micro,f1_micro,precision_macro,recall_macro,f1_macro,Data Count
qwen-max-2025-01-25,0.6446700507614214,0.6336633663366337,0.649746192893401,0.6416040100250626,0.6388760049474336,0.6501020408163265,0.64232342205538,197
gpt-4o,0.5482233502538071,0.5618556701030928,0.5532994923857868,0.5575447570332481,0.5779088050314465,0.5536734693877551,0.5600088997453159,197
deepseek-chat,0.6700507614213198,0.676923076923077,0.6700507614213198,0.673469387755102,0.6899114693446089,0.6705102040816326,0.6754210676562946,197
claude-sonnet-4-20250514,0.700507614213198,0.6934673366834171,0.700507614213198,0.696969696969697,0.7072180484244438,0.7009183673469388,0.69833034513671,197
1 Model accuracy precision_micro recall_micro f1_micro precision_macro recall_macro f1_macro Data Count
2 qwen-max-2025-01-25 0.6446700507614214 0.6336633663366337 0.649746192893401 0.6416040100250626 0.6388760049474336 0.6501020408163265 0.64232342205538 197
3 gpt-4o 0.5482233502538071 0.5618556701030928 0.5532994923857868 0.5575447570332481 0.5779088050314465 0.5536734693877551 0.5600088997453159 197
4 deepseek-chat 0.6700507614213198 0.676923076923077 0.6700507614213198 0.673469387755102 0.6899114693446089 0.6705102040816326 0.6754210676562946 197
5 claude-sonnet-4-20250514 0.700507614213198 0.6934673366834171 0.700507614213198 0.696969696969697 0.7072180484244438 0.7009183673469388 0.69833034513671 197

View File

@@ -0,0 +1,98 @@
{
"timestamp": "2025-06-02T17:15:23.737185",
"models_count": 4,
"models": {
"qwen-max-2025-01-25": {
"metrics": {
"accuracy": 0.6446700507614214,
"precision_micro": 0.6336633663366337,
"recall_micro": 0.649746192893401,
"f1_micro": 0.6416040100250626,
"precision_macro": 0.6388760049474336,
"recall_macro": 0.6501020408163265,
"f1_macro": 0.64232342205538
},
"data_count": 197
},
"gpt-4o": {
"metrics": {
"accuracy": 0.5482233502538071,
"precision_micro": 0.5618556701030928,
"recall_micro": 0.5532994923857868,
"f1_micro": 0.5575447570332481,
"precision_macro": 0.5779088050314465,
"recall_macro": 0.5536734693877551,
"f1_macro": 0.5600088997453159
},
"data_count": 197
},
"deepseek-chat": {
"metrics": {
"accuracy": 0.6700507614213198,
"precision_micro": 0.676923076923077,
"recall_micro": 0.6700507614213198,
"f1_micro": 0.673469387755102,
"precision_macro": 0.6899114693446089,
"recall_macro": 0.6705102040816326,
"f1_macro": 0.6754210676562946
},
"data_count": 197
},
"claude-sonnet-4-20250514": {
"metrics": {
"accuracy": 0.700507614213198,
"precision_micro": 0.6934673366834171,
"recall_micro": 0.700507614213198,
"f1_micro": 0.696969696969697,
"precision_macro": 0.7072180484244438,
"recall_macro": 0.7009183673469388,
"f1_macro": 0.69833034513671
},
"data_count": 197
}
},
"comparison": {
"accuracy": {
"qwen-max-2025-01-25": 0.6446700507614214,
"gpt-4o": 0.5482233502538071,
"deepseek-chat": 0.6700507614213198,
"claude-sonnet-4-20250514": 0.700507614213198
},
"precision_micro": {
"qwen-max-2025-01-25": 0.6336633663366337,
"gpt-4o": 0.5618556701030928,
"deepseek-chat": 0.676923076923077,
"claude-sonnet-4-20250514": 0.6934673366834171
},
"recall_micro": {
"qwen-max-2025-01-25": 0.649746192893401,
"gpt-4o": 0.5532994923857868,
"deepseek-chat": 0.6700507614213198,
"claude-sonnet-4-20250514": 0.700507614213198
},
"f1_micro": {
"qwen-max-2025-01-25": 0.6416040100250626,
"gpt-4o": 0.5575447570332481,
"deepseek-chat": 0.673469387755102,
"claude-sonnet-4-20250514": 0.696969696969697
},
"precision_macro": {
"qwen-max-2025-01-25": 0.6388760049474336,
"gpt-4o": 0.5779088050314465,
"deepseek-chat": 0.6899114693446089,
"claude-sonnet-4-20250514": 0.7072180484244438
},
"recall_macro": {
"qwen-max-2025-01-25": 0.6501020408163265,
"gpt-4o": 0.5536734693877551,
"deepseek-chat": 0.6705102040816326,
"claude-sonnet-4-20250514": 0.7009183673469388
},
"f1_macro": {
"qwen-max-2025-01-25": 0.64232342205538,
"gpt-4o": 0.5600088997453159,
"deepseek-chat": 0.6754210676562946,
"claude-sonnet-4-20250514": 0.69833034513671
}
}
}

Binary file not shown.