"""Split compound question/answer pairs into independent sub-questions.

The script loads a JSON list of ``{"question": ..., "answer": ...}`` records,
asks a chat model whether each record is a single issue or a compound
question, and writes the (possibly decomposed) records, re-indexed from 1,
to an output JSON file.
"""

import json
import os
import re
import time

from openai import OpenAI

# SECURITY: a credential is committed in this file. It is kept only as a
# fallback so existing runs keep working; set APIYI_API_KEY in the environment
# instead, then rotate this key and delete the literal.
# A dedicated variable name is used (not OPENAI_API_KEY) so that a key meant
# for another provider is never sent to this endpoint by accident.
client = OpenAI(
    api_key=os.environ.get(
        "APIYI_API_KEY",
        "sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
    ),
    base_url="https://vip.apiyi.com/v1",
)

DEFAULT_INPUT_FILE = "/home/ubuntu/50T/fsy/benchmark/only_answer.json"
DEFAULT_OUTPUT_FILE = "single_select.json"

# Matches the first fenced block in a reply, with or without a "json" tag.
_FENCE_RE = re.compile(r"```(?:json)?\s*(.*?)\s*```", re.DOTALL | re.IGNORECASE)


def load_qa_data(file_path):
    """Read and return the JSON document stored at *file_path* (UTF-8)."""
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _is_valid_split(parsed):
    """Return True if *parsed* is a non-empty list of question/answer dicts."""
    return (
        isinstance(parsed, list)
        and bool(parsed)
        and all(
            isinstance(part, dict) and "question" in part and "answer" in part
            for part in parsed
        )
    )


# Split a (possibly compound) question into sub-questions.
def split_complex_question(question, answer):
    """Ask the model to decompose one question/answer pair.

    Args:
        question: The original question text.
        answer: The original answer text.

    Returns:
        ``1`` when the model replies that the pair is a single issue.
        Otherwise a non-empty list of ``{"question": ..., "answer": ...}``
        dicts. If the API call fails, or the reply cannot be parsed into
        that shape, the list holds just the original pair.
    """
    # Backslashes are doubled so the model sees the literal "\times" and
    # "\mathrm"; unescaped, "\t" would become a TAB character.
    prompt = f"""
    Follow these instructions strictly to perform question decomposition:

    Input requirements:
    - Question text: {question}
    - Answer text: {answer}

    Output rules:
    1. Single issue determination criteria:
       - Question contains only one clear technical inquiry point
       - Answer content cannot be divided into independent parts
       → Return: "It's a single issue."

    2. Compound question decomposition criteria (must satisfy all):
       a) Question contains multiple technically independent sub-questions
       b) Answer contains independent solution paragraphs corresponding to sub-questions
       c) Each sub-question's answer does not depend on context from other sub-questions

    3. Decomposition format standards:
    [
        {{
            "question": "[Complete sub-question 1] (including necessary shared parameters)",
            "answer": "[Corresponding complete answer]"
        }},
        {{
            "question": "[Complete sub-question 2] (including necessary shared parameters)",
            "answer": "[Corresponding complete answer]"
        }},
        ......
    ]

    Key control points:
    1. Context integrity:
       - Each sub-question must include shared parameters from the original question
    2. Answer integrity:
       - Preserve final calculation results
       - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
    3. Format prohibitions:
       - No explanatory text additions
       - No modifications to original technical terminology
       - Return data must not use Markdown and Latex formats (like \\times, \\mathrm)
       - Use scientific notation for data representation
    """
    # Used whenever the model's reply is unusable: keep the original pair.
    fallback = [{"question": question, "answer": answer}]

    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt},
            ],
            stream=False,
        )
        result = response.choices[0].message.content.strip()
    except Exception as e:
        # Best-effort boundary: any API failure keeps the original pair.
        print(f"API调用错误: {e}")
        return fallback

    print(result)
    if "It's a single issue." in result:
        return 1

    try:
        parsed = json.loads(process_response(result))
    except json.JSONDecodeError as e:
        print(f"JSON解析错误: {e}")
        return fallback

    # Reject empty lists and items missing keys: the caller indexes
    # item["question"] / item["answer"] directly.
    if not _is_valid_split(parsed):
        print("返回格式不符合预期,使用原问题")
        return fallback
    return parsed


def process_response(response_text):
    """Strip a Markdown code fence from a model reply, if one is present.

    A reply that is exactly one "```json ... ```" block is unwrapped as
    before. Otherwise the first fenced block found anywhere in the reply
    (tagged "json" in any case, or untagged) is returned. A reply with no
    fence is returned unchanged.
    """
    stripped = response_text.strip()
    if stripped.startswith("```json") and stripped.endswith("```"):
        return stripped[7:-3].strip()

    match = _FENCE_RE.search(stripped)
    if match:
        return match.group(1)
    return response_text


def process_dataset(data):
    """Run every record through the splitter and re-index the results.

    Args:
        data: Iterable-with-length of dicts that have "question" and
            "answer" keys.

    Returns:
        A list of ``{"idx", "question", "answer"}`` dicts, with ``idx``
        counting up from 1 across all emitted (sub-)questions.
    """
    processed_data = []
    idx = 1
    total = len(data)

    for position, item in enumerate(data, start=1):
        print(f"处理第 {position}/{total} 条数据...")

        question = item["question"]
        answer = item["answer"]
        split_data = split_complex_question(question, answer)

        # Anything that is not a list (e.g. the single-issue marker) means
        # the record is kept as-is.
        if isinstance(split_data, list):
            pairs = split_data
        else:
            pairs = [{"question": question, "answer": answer}]

        for pair in pairs:
            processed_data.append({
                "idx": idx,
                "question": pair["question"],
                "answer": pair["answer"],
            })
            idx += 1

        # Pause after every tenth request to go easy on the API.
        if position % 10 == 0:
            time.sleep(2)

    return processed_data


def save_processed_data(data, output_file):
    """Write *data* to *output_file* as indented, non-ASCII-escaped JSON."""
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main(input_file=DEFAULT_INPUT_FILE, output_file=DEFAULT_OUTPUT_FILE):
    """Load the dataset, split compound questions, and save the result.

    Args:
        input_file: Path of the source JSON file.
        output_file: Path the processed JSON is written to.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成,结果已保存到 {output_file}")


if __name__ == "__main__":
    main()