""" 0. 将问题从xls提取为json 1. 将问题进行拆分 2. 翻译成英文 3. 去重 4. 使用大模型进行难度评估和筛选 """ import json import time from openai import OpenAI import re client = OpenAI( api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d", base_url="https://vip.apiyi.com/v1" ) def load_qa_data(file_path): with open(file_path, 'r', encoding='utf-8') as f: data = json.load(f) return data # 对问题进行拆分 ''' 保留计算题的计算过程:- Fully preserve the step-by-step calculation process along with the final results 只保留计算题的结果:- Preserve final calculation results ''' def split_complex_question(question, answer): prompt = f""" Follow these instructions strictly to perform question decomposition: Input requirements: - Question text: {question} - Answer text: {answer} Output rules: 1. Single issue determination criteria: - Question contains only one clear technical inquiry point - Answer content cannot be divided into independent parts → Return: "It's a single issue." 2. Compound question decomposition criteria (must satisfy all): a) Question contains multiple technically independent sub-questions b) Answer contains independent solution paragraphs corresponding to sub-questions c) Each sub-question's answer does not depend on context from other sub-questions 3. Decomposition format standards: [ {{ "question": "[Complete sub-question 1] (including necessary shared parameters)", "answer": "[Corresponding complete answer]" }}, {{ "question": "[Complete sub-question 2] (including necessary shared parameters)", "answer": "[Corresponding complete answer]" }}, ...... ] Key control points: 1. Context integrity: - Each sub-question must include shared parameters from the original question 2. Answer integrity: - Fully preserve the step-by-step calculation process along with the final results - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23) 3. 3. Format prohibitions: - No explanatory text additions - No modifications to original technical terminology - Return data must not use Markdown and Latex formats (like \times, \mathrm) - Use scientific notation for data representation """ try: response = client.chat.completions.create( model="deepseek-chat", messages=[ {"role": "system", "content": "You are an expert in decomposing complex technical questions into independent sub-questions and providing corresponding complete answers with preserved context, precision, and technical terminology. "}, {"role": "user", "content": prompt} ], stream = False, temperature = 0 ) result = response.choices[0].message.content.strip() print(result) if "It's a single issue." in result: return 1 else: return json.loads(process_response(result)) except Exception as e: print(f"API调用错误: {e}") # 如果API调用失败,返回原问题 return [{"question": question, "answer": answer}] def process_response(response): """Extract and parse JSON from a response.""" json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response) json_str = json_match.group(1) if json_match else response.strip() json_str = re.sub(r'(\$[^\$]*\$)', lambda m: m.group(1).replace('\\', '\\\\'), json_str) json_str = json_str.replace('\\"', '"').replace("\\'", "'") return json_str def process_dataset(data): processed_data = [] total = len(data) for i, item in enumerate(data): print(f"处理第 {i+1}/{total} 条数据...") question = item["question"] answer = item["answer"] split_data = split_complex_question(question, answer) if isinstance(split_data, list): for q_data in split_data: processed_data.append({ "idx":item["idx"], "question": q_data["question"], "answer": q_data["answer"] }) else: processed_data.append({ "idx":item["idx"], "question": question, "answer": answer }) if (i+1) % 10 == 0: time.sleep(2) return processed_data def save_processed_data(data, output_file): with open(output_file, 'w', encoding='utf-8') as f: json.dump(data, f, ensure_ascii=False, indent=2) def main(): input_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821.json" output_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select_includes_process.json" data = load_qa_data(input_file) processed_data = process_dataset(data) save_processed_data(processed_data, output_file) print(f"处理完成,结果已保存到 {output_file}") if __name__ == "__main__": main()