# Listing metadata: 92 lines, 2.9 KiB, Python
import json
|
||
import time
|
||
from openai import OpenAI
|
||
|
||
|
||
# The API key is deliberately not hard-coded here: when `api_key` is omitted,
# the OpenAI client reads it from the OPENAI_API_KEY environment variable.
# Any key that was previously committed in this file should be treated as
# leaked and rotated.
client = OpenAI(
    base_url="https://vip.apiyi.com/v1",
)
|
||
|
||
def load_qa_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return json.load(handle)
|
||
|
||
# Handle calculation questions: keep only the final result of the answer.
def classify_qa_type(question, answer):
    """Ask the chat model to reduce ``answer`` to its final result and unit.

    Despite its name, this function does not classify anything: it sends the
    question/answer pair to the model with instructions to drop the
    calculation steps, and returns the model's rewritten answer.

    Args:
        question: Question text interpolated into the prompt.
        answer: Original answer text interpolated into the prompt.

    Returns:
        The model's reply with surrounding whitespace stripped (letter case is
        preserved, since units and LaTeX are case-sensitive), or the integer
        ``0`` when the request or the handling of its response raises.
    """
    # Backslashes and literal braces are escaped so the model sees the LaTeX
    # example verbatim: inside this f-string a bare ``\t`` would be a TAB and
    # a bare ``{23}`` would be evaluated and lose its braces.
    prompt = f"""
Process the given `question` and `answer` data, retaining the question and its corresponding answer while removing the calculation steps.

Question:
{question}

Original Answer:
{answer}

Requirements:
1. In the answer section, keep only the final result and its corresponding unit, removing any calculation steps.
2. If the answer involves multiple parts, use clear paragraph breaks or numbering to distinguish them.

Note:
- If the original answer contains LaTeX formulas (e.g., `\\(6.02 \\times 10^{{23}}\\)`), preserve the formula format but remove irrelevant derivation symbols (e.g., `\\mathrm`).

- Output only the processed answer content.
"""

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",  # DeepSeek-v3 model
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        # Do not lowercase the reply: that would turn e.g. "MPa" into "mpa".
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"API调用错误: {e}")
        # Failure sentinel: callers must not treat this 0 as an answer.
        return 0
|
||
|
||
# Process the whole dataset, rewriting the answers of the selected items.
def process_dataset(data):
    """Replace the answer of every selected item with its condensed form.

    Items whose ``"is_select"`` value equals 1 are passed to
    ``classify_qa_type``; every other item is left untouched. ``data`` is
    modified in place and also returned.

    Args:
        data: Sequence of dicts carrying an ``"is_select"`` key and, for
            selected items, ``"question"`` and ``"answer"`` keys.

    Returns:
        The same ``data`` object that was passed in.
    """
    total = len(data)
    for position, item in enumerate(data, start=1):
        print(f"处理第 {position}/{total} 条数据...")
        if item["is_select"] == 1:
            condensed = classify_qa_type(item["question"], item["answer"])
            print(condensed)
            # classify_qa_type returns the integer 0 when the API call fails.
            # Only a string reply may replace the original answer, so a
            # failed request leaves the item as it was.
            if isinstance(condensed, str):
                item["answer"] = condensed

        # Pause briefly after every tenth item.
        # NOTE(review): the listing lost its indentation; this pause is
        # assumed to sit at loop level (not inside the branch above) — confirm.
        if position % 10 == 0:
            time.sleep(2)

    return data
|
||
|
||
# Persist the processed data.
def save_processed_data(data, output_file):
    """Write ``data`` to ``output_file`` as UTF-8 JSON with a 2-space indent.

    Non-ASCII characters are written as-is rather than as ``\\uXXXX`` escapes.

    Args:
        data: JSON-serialisable object to store.
        output_file: Destination path; an existing file is overwritten.
    """
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, fp=sink, indent=2, ensure_ascii=False)
|
||
|
||
def main(input_file="/home/ubuntu/50T/fsy/benchmark/is_select.json",
         output_file="only_answer.json"):
    """Run the pipeline: load the dataset, condense answers, save the result.

    Args:
        input_file: Path of the JSON dataset to read. Defaults to the path
            that was previously hard-coded in this function.
        output_file: Path the processed dataset is written to. Defaults to
            the previously hard-coded ``only_answer.json``.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成,结果已保存到 {output_file}")
|
||
|
||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|