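"""Split compound QA records into single-issue records with an LLM.

Loads a JSON list of question/answer items, asks a DeepSeek chat model whether
each question is a single issue or a compound one, decomposes compound
questions into self-contained sub-question/answer pairs, and writes the
flattened, re-indexed result to a JSON output file.
"""
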
import json
import time
from openai import OpenAI

# OpenAI-compatible client pointed at the proxy endpoint used in the original script
client = OpenAI(
    api_key="sk-oYh3Xrhg8oDY2gW02c966f31C84449Ad86F9Cd9dF6E64a8d",
    base_url="https://vip.apiyi.com/v1"
)


def load_qa_data(file_path):
    # Read the QA dataset (a JSON list of question/answer records) from disk
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data


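# Assumed input shape (inferred from how the records are consumed below; any
# extra fields in the real only_answer.json are simply ignored):
#
#   [
#     {"question": "...", "answer": "..."},
#     ...
#   ]
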

# Split a compound question into independent sub-questions with the LLM
def split_complex_question(question, answer):
    # Raw f-string so the LaTeX examples below (\times, \mathrm) are not
    # interpreted as escape sequences.
    prompt = rf"""
    Follow these instructions strictly to perform question decomposition:

    Input requirements:
    - Question text: {question}
    - Answer text: {answer}

    Output rules:
    1. Single issue determination criteria:
       - Question contains only one clear technical inquiry point
       - Answer content cannot be divided into independent parts
       → Return: "It's a single issue."
    2. Compound question decomposition criteria (must satisfy all):
       a) Question contains multiple technically independent sub-questions
       b) Answer contains independent solution paragraphs corresponding to the sub-questions
       c) Each sub-question's answer does not depend on context from other sub-questions
    3. Decomposition format standards:
    [
        {{
            "question": "[Complete sub-question 1] (including necessary shared parameters)",
            "answer": "[Corresponding complete answer]"
        }},
        {{
            "question": "[Complete sub-question 2] (including necessary shared parameters)",
            "answer": "[Corresponding complete answer]"
        }},
        ......
    ]

    Key control points:
    1. Context integrity:
       - Each sub-question must include shared parameters from the original question
    2. Answer integrity:
       - Preserve final calculation results
       - Maintain original units and precision (e.g., 6.02×10²³ cannot be simplified to 6.02e23)
    3. Format prohibitions:
       - No explanatory text additions
       - No modifications to original technical terminology
       - Returned data must not use Markdown or LaTeX formatting (such as \times, \mathrm)
       - Use scientific notation for data representation
    """

    try:
        # Ask the chat model to judge and, if needed, decompose the question
        response = client.chat.completions.create(
            model="deepseek-chat",
            messages=[
                {"role": "system", "content": "You are a helpful assistant"},
                {"role": "user", "content": prompt}
            ],
            stream=False
        )
        result = response.choices[0].message.content.strip()
        print(result)
        if "It's a single issue." in result:
            # Non-list sentinel: the caller keeps the original QA pair as-is
            return 1
        else:
            return json.loads(process_response(result))
    except Exception as e:
        print(f"API call error: {e}")
        # If the API call fails, fall back to the original question
        return [{"question": question, "answer": answer}]


def process_response(response_text):
    # Strip a ```json ... ``` code fence if the model wrapped its output in one
    if response_text.strip().startswith("```json") and response_text.strip().endswith("```"):
        json_text = response_text.strip()[7:-3].strip()
        return json_text
    else:
        return response_text

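# For example, a fenced model reply such as
#   ```json
#   [{"question": "...", "answer": "..."}]
#   ```
# is reduced to the bare JSON text before json.loads() parses it in
# split_complex_question().
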

def process_dataset(data):
    processed_data = []
    idx = 1
    total = len(data)

    for i, item in enumerate(data):
        print(f"Processing item {i+1}/{total}...")
        question = item["question"]
        answer = item["answer"]
        split_data = split_complex_question(question, answer)

        if isinstance(split_data, list):
            # Decomposed (or fallback) sub-questions come back as a list
            for q_data in split_data:
                processed_data.append({
                    "idx": idx,
                    "question": q_data["question"],
                    "answer": q_data["answer"]
                })
                idx += 1
        else:  # Single-issue sentinel or unexpected format: keep the original record
            processed_data.append({
                "idx": idx,
                "question": question,
                "answer": answer
            })
            idx += 1

        # Brief pause every 10 items to stay under API rate limits
        if (i + 1) % 10 == 0:
            time.sleep(2)

    return processed_data

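# Each record appended above has the form
#   {"idx": <sequential integer>, "question": "...", "answer": "..."}
# and the helper below writes the full list as pretty-printed JSON with
# non-ASCII text preserved (ensure_ascii=False).
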

def save_processed_data(data, output_file):
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def main():
    input_file = "/home/ubuntu/50T/fsy/benchmark/only_answer.json"
    output_file = "single_select.json"
    data = load_qa_data(input_file)
    processed_data = process_dataset(data)
    save_processed_data(processed_data, output_file)
    print(f"Processing complete; results saved to {output_file}")


if __name__ == "__main__":
    main()