second commit

lzy
2025-05-28 10:55:34 +08:00
parent 0f80316f8b
commit ef9355f2f5
73 changed files with 485583 additions and 0 deletions

@@ -0,0 +1,141 @@
"""
0. 将问题从xls提取为json
1. 将问题进行拆分
2. 翻译成英文
3. 去重
4. 使用大模型进行难度评估和筛选
"""
import json
import time
from openai import OpenAI
import re
client = OpenAI(
    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
    base_url="https://vip.apiyi.com/v1"
)


def load_qa_data(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data
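
# Assumed shape of the input JSON (inferred from the fields accessed below); each
# record carries an "idx" that is propagated unchanged to its split sub-questions:
# [
#   {"idx": 1, "question": "...", "answer": "..."},
#   ...
# ]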


# Split compound questions into sub-questions
'''
Keep the working of calculation questions:      - Fully preserve the step-by-step calculation process along with the final results
Keep only the results of calculation questions: - Preserve final calculation results
'''
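# Note: switching between the two modes above is assumed to be done by hand, by
# swapping the first bullet under "2. Answer integrity" in the prompt below for the
# "Preserve final calculation results" line; nothing in the code toggles it.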


def split_complex_question(question, answer):
    prompt = f"""
Follow these instructions strictly to perform question decomposition:
Input requirements:
- Question text: {question}
- Answer text: {answer}
Output rules:
1. Single issue determination criteria:
   - Question contains only one clear technical inquiry point
   - Answer content cannot be divided into independent parts
   → Return: "It's a single issue."
2. Compound question decomposition criteria (must satisfy all):
   a) Question contains multiple technically independent sub-questions
   b) Answer contains independent solution paragraphs corresponding to sub-questions
   c) Each sub-question's answer does not depend on context from other sub-questions
3. Decomposition format standards:
[
    {{
        "question": "[Complete sub-question 1] (including necessary shared parameters)",
        "answer": "[Corresponding complete answer]"
    }},
    {{
        "question": "[Complete sub-question 2] (including necessary shared parameters)",
        "answer": "[Corresponding complete answer]"
    }},
    ......
]
Key control points:
1. Context integrity:
   - Each sub-question must include shared parameters from the original question
2. Answer integrity:
   - Fully preserve the step-by-step calculation process along with the final results
   - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
3. Format prohibitions:
   - No explanatory text additions
   - No modifications to original technical terminology
   - Returned data must not use Markdown or LaTeX formats (such as \\times, \\mathrm)
   - Use scientific notation for data representation
"""
    try:
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are an expert in decomposing complex technical questions into independent sub-questions and providing corresponding complete answers with preserved context, precision, and technical terminology."},
                {"role": "user", "content": prompt}
            ],
            stream=False,
            temperature=0
        )
        result = response.choices[0].message.content.strip()
        print(result)
        if "It's a single issue." in result:
            return 1
        else:
            return json.loads(process_response(result))
    except Exception as e:
        print(f"API call error: {e}")
        # If the API call fails, return the original question unchanged
        return [{"question": question, "answer": answer}]
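
# A hypothetical illustration (not taken from the dataset) of the decomposition the
# prompt asks for: a compound question such as "Calculate the molar mass of H2O and
# of CO2" with a two-part answer would be expected to come back roughly as
# [
#   {"question": "Calculate the molar mass of H2O", "answer": "18.02 g/mol ..."},
#   {"question": "Calculate the molar mass of CO2", "answer": "44.01 g/mol ..."}
# ]
# whereas a question with a single inquiry point should return "It's a single issue.",
# which split_complex_question maps to the sentinel value 1.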


def process_response(response):
    """Extract and parse JSON from a response."""
    json_match = re.search(r'```(?:json)?\s*([\s\S]*?)\s*```', response)
    json_str = json_match.group(1) if json_match else response.strip()
    # Escape backslashes inside $...$ math spans so json.loads does not choke on them
    json_str = re.sub(r'(\$[^\$]*\$)', lambda m: m.group(1).replace('\\', '\\\\'), json_str)
    json_str = json_str.replace('\\"', '"').replace("\\'", "'")
    return json_str
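
# A minimal sketch of what process_response is meant to handle: model replies are
# assumed to wrap the JSON array in a fenced block, e.g.
#   ```json
#   [{"question": "...", "answer": "..."}]
#   ```
# and the function returns the inner text so the caller can pass it to json.loads.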


def process_dataset(data):
    processed_data = []
    total = len(data)
    for i, item in enumerate(data):
        print(f"Processing item {i+1}/{total}...")
        question = item["question"]
        answer = item["answer"]
        split_data = split_complex_question(question, answer)
        if isinstance(split_data, list):
            for q_data in split_data:
                processed_data.append({
                    "idx": item["idx"],
                    "question": q_data["question"],
                    "answer": q_data["answer"]
                })
        else:
            processed_data.append({
                "idx": item["idx"],
                "question": question,
                "answer": answer
            })
        # Pause briefly every 10 items to avoid hammering the API
        if (i+1) % 10 == 0:
            time.sleep(2)
    return processed_data


def save_processed_data(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    input_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821.json"
    output_file = "/home/ubuntu/50T/fsy/layer2/QA/code/821_single_select_includes_process.json"
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"Processing finished; results saved to {output_file}")


if __name__ == "__main__":
    main()