Files
MatBench/layer2/PGEE/code/step1_single_question.py
2025-05-28 10:55:34 +08:00

142 lines
5.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import json
import time
from openai import OpenAI
import re
# Shared chat-completions client used by split_complex_question().
# SECURITY: the API key must not be committed to source control. With no
# `api_key` argument the OpenAI client reads the key from the
# OPENAI_API_KEY environment variable, so export it before running this
# script. Any key that was previously committed here should be revoked.
client = OpenAI(
    base_url="https://vip.apiyi.com/v1",
)
def load_qa_data(file_path):
    """Read the UTF-8 JSON file at *file_path* and return its decoded content."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return json.load(source)
# Split compound questions into independent sub-questions
'''
保留计算题的计算过程:- Fully preserve the step-by-step calculation process along with the final results
只保留计算题的结果:- Preserve final calculation results
'''
def split_complex_question(question, answer):
    """Ask the chat model to decompose a Q&A pair into independent sub-questions.

    Args:
        question: Original question text, interpolated into the prompt.
        answer: Original answer text, interpolated into the prompt.

    Returns:
        ``1`` when the model's reply contains "It's a single issue.".
        Otherwise a non-empty list of dicts, each holding at least the keys
        ``"question"`` and ``"answer"``. When the request or JSON decoding
        fails, or the decoded reply is not such a list, the list contains
        only the original question/answer pair.
    """
    # NOTE: the prompt lines are deliberately flush-left; they are part of the
    # string sent to the model. Backslashes are doubled so that "\times" is
    # sent literally instead of being read as a TAB escape followed by "imes".
    prompt = f"""
Follow these instructions strictly to perform question decomposition:
Input requirements:
- Question text: {question}
- Answer text: {answer}
Output rules:
1. Single issue determination criteria:
- Question contains only one clear technical inquiry point
- Answer content cannot be divided into independent parts
→ Return: "It's a single issue."
2. Compound question decomposition criteria (must satisfy all):
a) Question contains multiple technically independent sub-questions
b) Answer contains independent solution paragraphs corresponding to sub-questions
c) Each sub-question's answer does not depend on context from other sub-questions
3. Decomposition format standards:
[
{{
"question": "[Complete sub-question 1] (including necessary shared parameters)",
"answer": "[Corresponding complete answer]"
}},
{{
"question": "[Complete sub-question 2] (including necessary shared parameters)",
"answer": "[Corresponding complete answer]"
}},
......
]
Key control points:
1. Context integrity:
- Each sub-question must include shared parameters from the original question
2. Answer integrity:
- Fully preserve the step-by-step calculation process along with the final results
- Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
3.
3. Format prohibitions:
- No explanatory text additions
- No modifications to original technical terminology
- Return data must not use Markdown and Latex formats (like \\times, \\mathrm)
- Use scientific notation for data representation
"""
    # Used whenever the model's reply cannot be turned into usable sub-questions.
    fallback = [{"question": question, "answer": answer}]
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are an expert in decomposing complex technical questions into independent sub-questions and providing corresponding complete answers with preserved context, precision, and technical terminology. "},
                {"role": "user", "content": prompt}
            ],
            stream=False,
            temperature=0
        )
        result = response.choices[0].message.content.strip()
        print(result)
        if "It's a single issue." in result:
            return 1
        parsed = json.loads(process_response(result))
    except Exception as e:
        # Best-effort boundary: a failed request or an undecodable reply must
        # not abort the whole run, so keep the original pair instead.
        print(f"API调用错误: {e}")
        return fallback

    # An empty list would silently drop the question downstream, and entries
    # without both keys would raise KeyError there, so validate the shape here.
    usable = (
        isinstance(parsed, list)
        and len(parsed) > 0
        and all(
            isinstance(entry, dict) and "question" in entry and "answer" in entry
            for entry in parsed
        )
    )
    if not usable:
        print("Unexpected decomposition format; keeping the original question.")
        return fallback
    return parsed
def process_response(response):
    """Extract the JSON text from a model reply and repair common escaping slips.

    The content of the first ``` / ```json fenced block is used when one is
    present; otherwise the whole reply, stripped of surrounding whitespace.
    Backslashes inside ``$...$`` spans are doubled so that LaTeX-style
    commands survive JSON decoding.

    Args:
        response: Raw reply text.

    Returns:
        The JSON text as a string (it is NOT decoded here). Text that already
        decodes as JSON is returned as-is; otherwise ``\\"`` and ``\\'`` are
        replaced by the bare quote characters as a last-resort repair.
    """
    fenced = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response)
    json_str = fenced.group(1) if fenced else response.strip()
    json_str = re.sub(r'(\$[^\$]*\$)', lambda m: m.group(1).replace('\\', '\\\\'), json_str)
    try:
        # Valid JSON may legitimately contain \" inside string values;
        # stripping those backslashes would corrupt it, so leave it alone.
        json.loads(json_str)
    except ValueError:
        # Not decodable as-is: assume the quotes were over-escaped.
        return json_str.replace('\\"', '"').replace("\\'", "'")
    return json_str
def process_dataset(data):
    """Run every record through split_complex_question and flatten the results.

    Each input record must provide "idx", "question" and "answer". When the
    splitter returns a list, one output record is produced per entry (all
    carrying the source record's "idx"); any other return value keeps the
    original question/answer pair unchanged.
    """
    results = []
    count = len(data)
    for position, record in enumerate(data, start=1):
        print(f"处理第 {position}/{count} 条数据...")
        source_question = record["question"]
        source_answer = record["answer"]
        pieces = split_complex_question(source_question, source_answer)
        if not isinstance(pieces, list):
            # Non-list result (e.g. the "single issue" marker): keep the record as-is.
            pieces = [{"question": source_question, "answer": source_answer}]
        results.extend(
            {
                "idx": record["idx"],
                "question": piece["question"],
                "answer": piece["answer"],
            }
            for piece in pieces
        )
        # Pause briefly after every tenth record.
        if position % 10 == 0:
            time.sleep(2)
    return results
def save_processed_data(data, output_file):
    """Write *data* to *output_file* as UTF-8 JSON, 2-space indented, non-ASCII kept verbatim."""
    with open(output_file, mode='w', encoding='utf-8') as sink:
        json.dump(data, sink, indent=2, ensure_ascii=False)
def main(
    input_file="/home/ubuntu/50T/fsy/layer2/QA/code/821.json",
    output_file="/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select_includes_process.json",
):
    """Load the Q&A dataset, split compound questions, and save the result.

    Args:
        input_file: Path of the source JSON file. Defaults to the path the
            script was originally hard-wired to.
        output_file: Path the processed JSON is written to. Defaults to the
            path the script was originally hard-wired to.
    """
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"处理完成,结果已保存到 {output_file}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()