import json
import time
import os
import asyncio
import re
from concurrent.futures import ThreadPoolExecutor

from openai import OpenAI
import numpy as np
from tqdm import tqdm

from prompts import CLEAN_PROMPTS, SELECT_QUESTION_PROMPT

API_KEY = ""
BASE_URL = "https://vip.apiyi.com/v1"
MODEL_GPT = "text-embedding-ada-002"
MODELS = ["deepseek-reasoner", "claude-3-7-sonnet-20250219", "qwen-max", "deepseek-chat", "gemini-pro"]

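# The five models in MODELS serve as difficulty judges, while gpt-4-turbo
# (hard-coded in check_question_completeness below) performs the completeness
# check; all requests go through the OpenAI-compatible gateway at BASE_URL.
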
def load_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Check whether a question is complete (self-contained and answerable)
def check_question_completeness(question, answer):
    try:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": ""},
                {"role": "user", "content": CLEAN_PROMPTS.replace("{QUESTION}", question).replace("{ANSWER}", answer)},
            ],
            temperature=0.0
        )
        result = response.choices[0].message.content.strip()
        # Try to extract the numeric verdict from the reply
        if "1" in result:
            return 1
        else:
            return 0
    except Exception as e:
        print(f"Error checking question completeness: {e}")
        return 0

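# Presumably CLEAN_PROMPTS asks the judge model to answer "1" for a complete
# question and "0" otherwise; the substring check above is a loose heuristic,
# so any reply that merely contains the digit 1 is treated as complete.
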
# Score the difficulty of a question with a single judge model
def score_question_difficulty(model_name, question, answer):
    try:
        client = OpenAI(api_key=API_KEY, base_url=BASE_URL)
        response = client.chat.completions.create(
            model=model_name,
            messages=[
                {"role": "system", "content": ""},
                {"role": "user", "content": SELECT_QUESTION_PROMPT.replace("{QUESTION}", question).replace("{ANSWER}", answer)},
            ],
            temperature=0.2
        )

        result = response.choices[0].message.content.strip()

        # Try to extract JSON from the response
        try:
            # Locate the start and end of the JSON object
            start_idx = result.find('{')
            end_idx = result.rfind('}') + 1

            if start_idx >= 0 and end_idx > start_idx:
                json_str = result[start_idx:end_idx]
                json_result = json.loads(json_str)
                return json_result.get("score", 0)
            else:
                # No JSON object found; try to extract the score directly from the text
                score_match = re.search(r'"score":\s*(\d+)', result)
                if score_match:
                    return int(score_match.group(1))
                else:
                    print(f"Could not parse a score from the model response: {result}")
                    return 0
        except Exception as e:
            print(f"Error while parsing JSON: {e}")
            print(f"Raw response: {result}")
            return 0

    except Exception as e:
        print(f"Error scoring with model {model_name}: {e}")
        time.sleep(5)  # Pause after an error
        return 0

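# The parsing logic above assumes SELECT_QUESTION_PROMPT asks the judge model to
# reply with a JSON object of the form {"score": <int>}; replies without
# parseable JSON fall back to a regex search and finally to a default score of 0.
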
# Process a single question asynchronously
async def process_question(data_item, executor):
    idx = data_item["idx"]
    question = data_item["question"]
    answer = data_item["answer"]

    # First check whether the question is complete
    is_complete = check_question_completeness(question, answer)

    if is_complete != 1:
        return None

    # Score with all judge models in parallel via the thread pool
    scores = {}
    loop = asyncio.get_running_loop()
    score_tasks = []

    for model in MODELS:
        score_tasks.append(
            loop.run_in_executor(
                executor,
                score_question_difficulty,
                model,
                question,
                answer
            )
        )

    # Collect all scoring results
    model_scores = await asyncio.gather(*score_tasks)

    # Combine the scores
    total_score = 0
    for i, model in enumerate(MODELS):
        scores[model] = model_scores[i]
        total_score += model_scores[i]

    # Build the result record
    result = {
        "id": idx,
        "question": question,
        "answer": answer,
        "total_score": total_score
    }

    # Attach each model's score
    for model in MODELS:
        result[model] = scores[model]

    return result

# Save results to a JSON file
def save_results(results, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

# Main processing routine
async def main(input_file, output_file, score_file, top_n=2000):
    # Load the data
    data = load_data(input_file)

    results = []
    with ThreadPoolExecutor(max_workers=5) as executor:
        tasks = []
        for item in data:
            tasks.append(process_question(item, executor))

        # Show a progress bar while the tasks complete
        for f in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Processing questions"):
            result = await f
            if result is not None:
                results.append(result)

    # Sort by total score, highest first
    results.sort(key=lambda x: x["total_score"], reverse=True)

    # Keep the top_n hardest questions
    top_results = results[:top_n]

    # Save the selected questions
    save_results(top_results, output_file)

    # Save the per-model scores for all complete questions
    score_results = []
    for item in results:
        score_item = {
            "id": item["id"],
            "question": item["question"],
            "answer": item["answer"]
        }
        # Attach each model's score
        for model in MODELS:
            score_item[model] = item[model]

        score_results.append(score_item)

    save_results(score_results, score_file)

    print(f"Done. {len(results)} complete questions found; the {len(top_results)} hardest were selected.")

if __name__ == "__main__":
|
||
input_file = "input.json" # 输入的JSON文件
|
||
output_file = "top_difficult_questions.json" # 输出前2000道最难问题
|
||
score_file = "scores.json" # 保存所有模型评分
|
||
|
||
# 运行主函数
|
||
asyncio.run(main(input_file, output_file, score_file, 2000)) |
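
# A minimal sketch of the input this script assumes (inferred from the keys read
# in process_question, not taken from the original data): input.json is a list
# of objects, each providing "idx", "question", and "answer", e.g.
#
#   [
#     {"idx": 0, "question": "...", "answer": "..."},
#     {"idx": 1, "question": "...", "answer": "..."}
#   ]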