Files
2025-05-28 11:00:24 +08:00

92 lines
3.0 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import time

from openai import OpenAI
# Shared OpenAI-compatible client used for every classification request.
# The API key is read from the OPENAI_API_KEY environment variable instead of
# being committed to the repository; export it before running this script.
# NOTE(review): any key that was ever committed in this file should be treated
# as compromised and rotated.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://vip.apiyi.com/v1",
)
def load_qa_data(file_path):
    """Read the UTF-8 JSON file at *file_path* and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever object ``json.load`` produces for the file's content.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        return json.load(source)
# Classify a question/answer pair into one of four question types.
def classify_qa_type(question, answer):
    """Ask the chat model which category a question belongs to.

    Args:
        question: Question text, interpolated verbatim into the prompt.
        answer: Reference answer, interpolated verbatim into the prompt.

    Returns:
        1 for Calculation, 2 for Multiple choice, 3 for True/False and
        4 for Other (4 is also returned when the reply contains none of the
        digits 1-4). Returns 0 when the request or the handling of the reply
        raised an exception.
    """
    prompt = f"""
Please analyze the following question and its answer, and classify the question type into one of the following four categories:
1. Calculation: A question that requires mathematical operations to derive the result.
2. Multiple choice: A question that provides multiple options (e.g., A/B/C/D) for the respondent to choose from.
3. True/False: A question that only requires answering true/false, yes/no, or correct/incorrect.
4. Other: A question that does not fall under the above three categories.
Question:
{question}
Answer:
{answer}
Please respond with the corresponding numeric code directly (without any explanation):
1. For Calculation, respond: 1
2. For Multiple choice, respond: 2
3. For True/False, respond: 3
4. For Other, respond: 4
"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        result = response.choices[0].message.content.strip().lower()
        print(result)
        # Use the first category digit that appears in the reply. Testing
        # '"1" in result' before the other codes would label a reply such as
        # "2 (not 1)" as Calculation.
        for char in result:
            if char in "1234":
                return int(char)
        # No recognizable code in the reply: fall back to "Other".
        return 4
    except Exception as e:
        print(f"API调用错误: {e}")
        # 0 lies outside the 1-4 category codes and marks items whose
        # classification failed.
        return 0
# Label every record of the dataset with its question type.
def process_dataset(data):
    """Classify each item of *data* and store the label under its "type" key.

    Items are modified in place. Each item's "question" and "answer" values
    are passed to ``classify_qa_type``. After every tenth item the loop
    sleeps for two seconds.

    Args:
        data: Sized iterable of dict-like items with "question" and
            "answer" keys.

    Returns:
        The same *data* object that was passed in.
    """
    total = len(data)
    for i, item in enumerate(data):
        print(f"处理第 {i+1}/{total} 条数据...")
        item["type"] = classify_qa_type(item["question"], item["answer"])
        finished_batch_of_ten = (i + 1) % 10 == 0
        if finished_batch_of_ten:
            # Pause between batches of requests.
            time.sleep(2)
    return data
# Persist the labelled dataset.
def save_processed_data(data, output_file):
    """Serialize *data* as JSON into *output_file*.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to write.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
def main(
    input_file="/home/ubuntu/50T/fsy/benchmark/3single_select.json",
    output_file="4is_type.json",
):
    """Load a QA dataset, label each item with its type, and save the result.

    Args:
        input_file: Path of the JSON dataset to read. Defaults to the path
            this script was originally hard-coded to use.
        output_file: Path the labelled dataset is written to.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成，结果已保存到 {output_file}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()