"""Strip calculation steps from selected QA answers using a chat-completion model.

The script loads a JSON list of QA items, asks the model to rewrite the
answer of every item whose ``is_select`` field equals 1 so that only the
final result (with its unit) remains, and writes the updated list to a new
JSON file.

Configuration is taken from the environment:

* ``OPENAI_API_KEY``  - required; the credential is never stored in source.
* ``OPENAI_BASE_URL`` - optional; defaults to ``DEFAULT_BASE_URL``.
"""

import json
import os
import time

API_KEY_ENV_VAR = "OPENAI_API_KEY"
BASE_URL_ENV_VAR = "OPENAI_BASE_URL"
DEFAULT_BASE_URL = "https://vip.apiyi.com/v1"
MODEL_NAME = "deepseek-chat"  # DeepSeek-V3 model

DEFAULT_INPUT_FILE = "/home/ubuntu/50T/fsy/benchmark/is_select.json"
DEFAULT_OUTPUT_FILE = "only_answer.json"

# Raw string so LaTeX backslashes reach the model verbatim (a plain string
# would turn ``\t`` of ``\times`` into a TAB); literal braces are doubled
# because the template is rendered with ``str.format``.
_PROMPT_TEMPLATE = r"""
Process the given `question` and `answer` data, retaining the question and its corresponding answer while removing the calculation steps.

Question: {question}
Original Answer: {answer}

Requirements:
1. In the answer section, keep only the final result and its corresponding unit, removing any calculation steps.
2. If the answer involves multiple parts, use clear paragraph breaks or numbering to distinguish them.

Note:
- If the original answer contains LaTeX formulas (e.g., `\(6.02 \times 10^{{23}}\)`), preserve the formula format but remove irrelevant derivation symbols (e.g., `\mathrm`).
- Output only the processed answer content.
"""

# Shared API client. It is created on first use by ``_get_client`` so that
# importing this module needs neither the SDK nor credentials.
client = None


def _get_client():
    """Return the shared OpenAI client, creating it on first use.

    Raises:
        RuntimeError: if the API key environment variable is not set.
    """
    global client
    if client is None:
        api_key = os.environ.get(API_KEY_ENV_VAR)
        if not api_key:
            raise RuntimeError(
                f"Environment variable {API_KEY_ENV_VAR} must be set to the API key."
            )
        # Imported here so the file helpers stay usable without the SDK.
        from openai import OpenAI

        client = OpenAI(
            api_key=api_key,
            base_url=os.environ.get(BASE_URL_ENV_VAR, DEFAULT_BASE_URL),
        )
    return client


def load_qa_data(file_path):
    """Load and return the JSON document stored at ``file_path`` (UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def build_prompt(question, answer):
    """Return the user prompt asking the model to drop calculation steps.

    ``question`` and ``answer`` are inserted verbatim; braces inside them are
    not interpreted as format fields.
    """
    return _PROMPT_TEMPLATE.format(question=question, answer=answer)


def classify_qa_type(question, answer):
    """Ask the model for ``answer`` reduced to its final result.

    Despite its historical name, this function does not classify anything: it
    sends ``question`` and ``answer`` to the chat model and returns the
    model's rewritten answer.

    Args:
        question: The question text.
        answer: The original answer, possibly containing calculation steps.

    Returns:
        The stripped model reply (case preserved, since units, chemical
        formulas and LaTeX are case-sensitive), or the integer ``0`` if the
        API call failed.

    Raises:
        RuntimeError: if the API key is not configured. Configuration errors
            are raised rather than swallowed so that a misconfigured run
            fails immediately instead of silently skipping every item.
    """
    api = _get_client()
    try:
        response = api.chat.completions.create(
            model=MODEL_NAME,
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": build_prompt(question, answer)},
            ],
            stream=False,
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort: one failed request must not abort the whole run.
        print(f"API调用错误: {e}")
        return 0


def process_dataset(data, *, pause_every=10, pause_seconds=2):
    """Replace the answer of every selected item with its final-result form.

    Items are modified in place. An item is processed only when its
    ``is_select`` field equals 1; its ``answer`` is overwritten only when the
    model returned a non-empty string, so a failed request leaves the
    original answer intact.

    Args:
        data: List of dicts; selected items need ``question`` and ``answer``.
        pause_every: Sleep after this many API calls (falsy disables it).
        pause_seconds: Length of that sleep, in seconds.

    Returns:
        The same ``data`` list.
    """
    total = len(data)
    api_calls = 0
    for position, item in enumerate(data, start=1):
        print(f"处理第 {position}/{total} 条数据...")
        if item.get("is_select") != 1:
            continue

        result = classify_qa_type(item["question"], item["answer"])
        print(result)
        if isinstance(result, str) and result:
            item["answer"] = result

        # Throttle on actual API calls, not on items merely iterated over.
        api_calls += 1
        if pause_every and api_calls % pause_every == 0:
            time.sleep(pause_seconds)
    return data


def save_processed_data(data, output_file):
    """Write ``data`` to ``output_file`` as indented UTF-8 JSON."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(input_file=DEFAULT_INPUT_FILE, output_file=DEFAULT_OUTPUT_FILE):
    """Load ``input_file``, process it, and save the result to ``output_file``."""
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成,结果已保存到 {output_file}")


if __name__ == "__main__":
    main()