Files
2025-05-28 11:00:24 +08:00

92 lines
2.9 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import json
import os
import time

from openai import OpenAI
# Shared OpenAI-compatible client used for every chat-completion request.
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# instead of being hard-coded, so no credential is committed to version
# control. Any key that was ever stored in this file must be treated as leaked
# and revoked. If the variable is unset, the SDK raises at construction time.
client = OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY"),
    base_url="https://vip.apiyi.com/v1",
)
def load_qa_data(file_path):
    """Read the UTF-8 JSON file at ``file_path`` and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever object the JSON document decodes to.
    """
    with open(file_path, mode="r", encoding="utf-8") as source:
        return json.load(source)
# Handle calculation questions: keep only the computed result.
def classify_qa_type(question, answer):
    """Ask the model to strip the calculation steps from ``answer``.

    Builds a prompt from ``question`` and ``answer`` and sends it to the
    chat-completion endpoint through the module-level ``client``.

    Args:
        question: Question text, interpolated into the prompt.
        answer: Original answer text, interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped. Letter case is
        preserved, because units and LaTeX commands are case-sensitive. If the
        request fails, or the reply is missing or blank, ``answer`` is returned
        unchanged so a caller storing the result never overwrites the original
        answer with a placeholder.
    """
    # Backslashes and braces in the LaTeX example are escaped so the f-string
    # emits them literally (`\times`, `10^{23}`, `\mathrm`) instead of a TAB
    # character and an interpolated `23`.
    prompt = f"""
Process the given `question` and `answer` data, retaining the question and its corresponding answer while removing the calculation steps.
Question:
{question}
Original Answer:
{answer}
Requirements:
1. In the answer section, keep only the final result and its corresponding unit, removing any calculation steps.
2. If the answer involves multiple parts, use clear paragraph breaks or numbering to distinguish them.
Note:
- If the original answer contains LaTeX formulas (e.g., `\\(6.02 \\times 10^{{23}}\\)`), preserve the formula format but remove irrelevant derivation symbols (e.g., `\\mathrm`).
- Output only the processed answer content.
"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",  # DeepSeek-V3 model
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Best-effort: report the failure and keep the original answer.
        print(f"API调用错误: {e}")
        return answer

    result = content.strip() if content else ""
    # A missing or blank reply carries no information; keep the original.
    return result or answer
# Process the whole dataset and rewrite the answers of selected items.
def process_dataset(data):
    """Replace the answer of each selected item with its result-only form.

    Every item is read for its ``"question"``, ``"answer"`` and
    ``"is_select"`` keys. Items whose ``"is_select"`` equals 1 are sent to
    ``classify_qa_type`` and their ``"answer"`` is overwritten in place with
    the returned value; all other items are left untouched.

    Args:
        data: Sequence of dict items; it is mutated in place.

    Returns:
        The same ``data`` object that was passed in.

    Raises:
        KeyError: If an item lacks one of the three keys read above.
    """
    total = len(data)
    api_calls = 0
    for i, item in enumerate(data):
        print(f"处理第 {i+1}/{total} 条数据...")
        question = item["question"]
        answer = item["answer"]
        sel = item["is_select"]
        if sel != 1:
            continue

        condensed = classify_qa_type(question, answer)
        print(condensed)
        item["answer"] = condensed

        # Throttle on requests actually sent, not on the item index, so that
        # unselected items neither trigger pointless sleeps nor let a burst of
        # selected items slip past the pause.
        api_calls += 1
        if api_calls % 10 == 0:
            time.sleep(2)
    return data
# Save the processed data.
def save_processed_data(data, output_file):
    """Serialize ``data`` as JSON into ``output_file``.

    The file is written as UTF-8 with a two-space indent, and non-ASCII
    characters are emitted as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serializable object to write.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
def main():
    """Run the pipeline: load the dataset, process it, and save the result.

    Reads the input JSON from a fixed absolute path, passes it through
    ``process_dataset``, writes the outcome to ``only_answer.json`` in the
    current working directory, and prints a completion message.
    """
    output_file = "only_answer.json"
    raw_items = load_qa_data("/home/ubuntu/50T/fsy/benchmark/is_select.json")
    save_processed_data(process_dataset(raw_items), output_file)
    print(f"处理完成,结果已保存到 {output_file}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()