# mars-mcp/generate_data/generate_sft_data/sft_utils.py

import jsonlines
import argparse
import generate_data.generate_sft_data.utils as utils
import glob
import json
from ase import io
import tempfile
import re
from pymatgen.io.vasp import Poscar
from pymatgen.io.cif import CifParser
import threading
import concurrent.futures

# Module-level lock intended for serializing file writes across threads.
# NOTE(review): nothing in this part of the file acquires it — confirm it is
# used by code elsewhere before relying on it.
file_lock = threading.Lock()


def generate_design_question(crystal_desc, cif_info, crystal_props, max_retries=3, initial_backoff=1.0):
    """Ask the LLM for exam questions whose expected answer is the given CIF file.

    The three inputs are substituted into a fixed prompt that asks the model to
    draft ten questions and return its two best ones inside a fenced
    ```json block.

    Args:
        crystal_desc: Crystal description text (str), substituted for ``{crystal_desc}``.
        cif_info: CIF text (str), substituted for ``{cif_info}``.
        crystal_props: Property summary text (str), substituted for ``{crystal_props}``.
        max_retries: Forwarded to ``utils.get_response_from_llm``.
        initial_backoff: Forwarded to ``utils.get_response_from_llm``.

    Returns:
        One of:
        * the string ``'apierror'`` or ``'unexpectederror'``, passed through
          unchanged when the LLM helper returns it;
        * the decoded JSON value when the fenced block can be parsed;
        * a dict with ``"error"`` and ``"raw_response"`` keys when no fenced
          block is found or it cannot be parsed.
    """
    instruction = """
{crystal_desc}

### 对应的晶体结构数据(CIF)如下：
{cif_info}

### 该晶体结构的物理化学性质为：
{crystal_props}

根据如上信息，我现在需要给材料科学的博士考试出题，问题要求博士们回答出上文中的完整CIF文件，如果是你你会如何出题？
也就是说，要求我们提出的问题的答案是上文中提及的完整CIF文件。当然，你的问题必须给定充足的该晶体结构的相关信息。
但是相关信息应该抽象和隐晦，避免过于直白，除明确的化学表达式外，尽量避免过多的精确信息，让博士考生们可以通过推理得到某些信息以增加问题的难度。
问题的语言一半是中文，一半是英文，以便更好地与模型进行交互。

请先生成10个问题示例，再挑选2个最好的问题示例并遵循如下格式输出：
```json
{
  "selected_questions": [
    {
      "question_id": 1,
      "question_text": "问题1的完整内容...",
    },
    {
      "question_id": 2,
      "question_text": "问题2的完整内容...",
    }
  ]
}
"""
    instruction = (
        instruction.replace("{crystal_desc}", crystal_desc)
        .replace("{cif_info}", cif_info)
        .replace("{crystal_props}", crystal_props)
    )
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": instruction},
    ]
    _response = utils.get_response_from_llm(messages, model_name="deepseek-v3", max_retries=max_retries, initial_backoff=initial_backoff)
    # The helper reports failures as sentinel strings; hand them to the caller untouched.
    if _response in ('apierror', 'unexpectederror'):
        return _response

    # Extract the fenced JSON section from the response.
    json_match = re.search(r'```json\s*(.*?)\s*```', _response, re.DOTALL)
    if json_match is None:
        # No fenced JSON found: return the raw response for inspection.
        return {"error": "No JSON format found in response", "raw_response": _response}

    json_str = json_match.group(1)
    # Candidates are tried from strictest to most lenient:
    #   1. the payload exactly as returned;
    #   2. with newline/CR/tab characters removed (raw control characters
    #      inside JSON strings are rejected by the strict parser);
    #   3. additionally with trailing commas before '}' / ']' removed — the
    #      format example in the prompt itself contains such commas, so the
    #      model is likely to copy them.
    without_ctrl = re.sub(r'[\n\r\t]', '', json_str)
    candidates = (
        json_str,
        without_ctrl,
        re.sub(r',\s*([}\]])', r'\1', without_ctrl),
    )
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {"error": "Failed to parse JSON response", "raw_response": _response}


def generate_props_question(crystal_desc, cif_info, crystal_props, max_retries=3, initial_backoff=1.0):
    """Ask the LLM for exam questions whose expected answer is the crystal's properties.

    The three inputs are substituted into a fixed prompt that asks the model to
    draft ten questions (each ideally containing a ``<placeholder>`` tag that
    stands for the CIF file) and return its two best ones inside a fenced
    ```json block.

    Args:
        crystal_desc: Crystal description text (str), substituted for ``{crystal_desc}``.
        cif_info: CIF text (str), substituted for ``{cif_info}``.
        crystal_props: Property summary text (str), substituted for ``{crystal_props}``.
        max_retries: Forwarded to ``utils.get_response_from_llm``.
        initial_backoff: Forwarded to ``utils.get_response_from_llm``.

    Returns:
        One of:
        * the string ``'apierror'`` or ``'unexpectederror'``, passed through
          unchanged when the LLM helper returns it;
        * the decoded JSON value when the fenced block can be parsed;
        * a dict with ``"error"`` and ``"raw_response"`` keys when no fenced
          block is found or it cannot be parsed.
    """
    instruction = """
{crystal_desc}

### 对应的晶体结构数据(CIF)如下：
{cif_info}

### 该晶体结构的物理化学性质为：
{crystal_props}

根据如上信息，我现在需要给材料科学的博士考试出题，问题要求博士们根据CIF文件回答出上文中的物理化学性质，如果是你你会如何出题？
也就是说，要求我们提出的问题的答案是上文中提及的物理化学性质。当然，你的问题必须尽量包含一个<placeholder>标签代表给定的CIF文件。
让博士考生们根据给定的CIF文件通过深入思考和推理去分析该种晶体材料在上文所提及的全部物理化学性质，并用JSON格式回答全部的物理化学性质。
问题的语言一半是中文，一半是英文，以便更好地与模型进行交互。

示例的问题：
1. <placeholder>\n，根据上文提供的CIF文件，请你xxx
2. 根据下文提供的CIF文件，请你xxx\n <<placeholder>>

请先生成10个问题示例，再挑选2个最好的问题示例并遵循如下格式输出：
```json
{
  "selected_questions": [
    {
      "question_id": 1,
      "question_text": "问题1的完整内容...",
    },
    {
      "question_id": 2,
      "question_text": "问题2的完整内容...",
    }
  ]
}
```
"""
    instruction = (
        instruction.replace("{crystal_desc}", crystal_desc)
        .replace("{cif_info}", cif_info)
        .replace("{crystal_props}", crystal_props)
    )
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": instruction},
    ]
    _response = utils.get_response_from_llm(messages, model_name="deepseek-v3", max_retries=max_retries, initial_backoff=initial_backoff)
    # The helper reports failures as sentinel strings; hand them to the caller untouched.
    if _response in ('apierror', 'unexpectederror'):
        return _response

    # Extract the fenced JSON section from the response.
    json_match = re.search(r'```json\s*(.*?)\s*```', _response, re.DOTALL)
    if json_match is None:
        # No fenced JSON found: return the raw response for inspection.
        return {"error": "No JSON format found in response", "raw_response": _response}

    json_str = json_match.group(1)
    # Candidates are tried from strictest to most lenient:
    #   1. the payload exactly as returned;
    #   2. with newline/CR/tab characters removed (raw control characters
    #      inside JSON strings are rejected by the strict parser);
    #   3. additionally with trailing commas before '}' / ']' removed — the
    #      format example in the prompt itself contains such commas, so the
    #      model is likely to copy them.
    without_ctrl = re.sub(r'[\n\r\t]', '', json_str)
    candidates = (
        json_str,
        without_ctrl,
        re.sub(r',\s*([}\]])', r'\1', without_ctrl),
    )
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {"error": "Failed to parse JSON response", "raw_response": _response}


def generate_papers_other_question(paper_info, max_retries=3, initial_backoff=1.0):
    """Ask the LLM for exam questions on a paper's reactions, structure, performance and applications.

    ``paper_info`` is substituted into a fixed prompt that asks the model to
    draft twelve questions and return its four best ones (each tagged with a
    ``question_type``) inside a fenced ```json block.

    Args:
        paper_info: Paper-derived material information (str), substituted for ``{paper_info}``.
        max_retries: Forwarded to ``utils.get_response_from_llm``.
        initial_backoff: Forwarded to ``utils.get_response_from_llm``.

    Returns:
        One of:
        * the string ``'apierror'`` or ``'unexpectederror'``, passed through
          unchanged when the LLM helper returns it;
        * the decoded JSON value when the fenced block can be parsed;
        * a dict with ``"error"`` and ``"raw_response"`` keys when no fenced
          block is found or it cannot be parsed.
    """
    instruction = """
{paper_info}

根据如上信息，我现在需要给材料科学的博士学生出题，问题要求考察博士对该材料的反应方程式、结构、性能和应用是否完全掌握，如果是你你会怎么出题？
你的问题里面应该包含该材料相关的合适的信息，且是自包含的（在只有问题的情况下问题中的关键信息不遗漏），但问题需要有难度和深度，需要博士生们深入思考和推理后才能作为准确的回答。
由于问题面向博士，因此，提出的问题需要一定的科研价值导向。涉及到反应方程式、关于结构、性能和应用等方面的具体试剂量等信息时，要求他们尽可能给出精确的数值（前提是这些数值在上文中存在）。


请先生成12个问题示例，12个问题的语言一半是中文，一半是英文，再挑选4个最好的问题示例并遵循如下格式输出：
```json
{
  "selected_questions": [
    {
      "question_id": 1,
      "question_text": "问题1的完整内容...",
      "question_type": "问题1的类型", # reaction_string; structure; performence; application
    },
    {
      "question_id": 2,
      "question_text": "问题2的完整内容...",
     "question_type": "问题1的类型", # reaction_string; structure; performence; application
    }, ...
  ]
}
"""
    instruction = instruction.replace("{paper_info}", paper_info)
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": instruction},
    ]
    _response = utils.get_response_from_llm(messages, model_name="deepseek-v3", max_retries=max_retries, initial_backoff=initial_backoff)
    # The helper reports failures as sentinel strings; hand them to the caller untouched.
    if _response in ('apierror', 'unexpectederror'):
        return _response

    # Extract the fenced JSON section from the response.
    json_match = re.search(r'```json\s*(.*?)\s*```', _response, re.DOTALL)
    if json_match is None:
        # No fenced JSON found: return the raw response for inspection.
        return {"error": "No JSON format found in response", "raw_response": _response}

    json_str = json_match.group(1)
    # Candidates are tried from strictest to most lenient:
    #   1. the payload exactly as returned;
    #   2. with newline/CR/tab characters removed (raw control characters
    #      inside JSON strings are rejected by the strict parser);
    #   3. additionally with trailing commas before '}' / ']' removed — the
    #      format example in the prompt itself contains such commas, so the
    #      model is likely to copy them.
    without_ctrl = re.sub(r'[\n\r\t]', '', json_str)
    candidates = (
        json_str,
        without_ctrl,
        re.sub(r',\s*([}\]])', r'\1', without_ctrl),
    )
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {"error": "Failed to parse JSON response", "raw_response": _response}


def generate_papers_synthesis_question(paper_info, max_retries=3, initial_backoff=1.0):
    """Ask the LLM for exam questions whose expected answer is the material's synthesis plan.

    ``paper_info`` is substituted into a fixed prompt that asks the model to
    draft six questions and return its two best ones inside a fenced
    ```json block.

    Args:
        paper_info: Paper-derived material information (str), substituted for ``{paper_info}``.
        max_retries: Forwarded to ``utils.get_response_from_llm``.
        initial_backoff: Forwarded to ``utils.get_response_from_llm``.

    Returns:
        One of:
        * the string ``'apierror'`` or ``'unexpectederror'``, passed through
          unchanged when the LLM helper returns it;
        * the decoded JSON value when the fenced block can be parsed;
        * a dict with ``"error"`` and ``"raw_response"`` keys when no fenced
          block is found or it cannot be parsed.
    """
    instruction = """
{paper_info}

根据如上信息，我现在需要给材料科学的博士学生出题，问题要求考察博士是否完全掌握该材料的合成方案，是否完全掌握给定材料的结构和性能到合成方案的精准映射关系，如果是你你会怎么出题？
你的问题里面应该包含该材料充分的结构和性能信息，问题需要有难度和深度，需要博士生们深入思考和推理后才能给出准确的合成方案并整理成JSON格式的格式化合成方案。
由于问题面向博士，因此，提出的问题需要一定的科研价值导向，并且要求博士在回答该材料的合成方案时给出精确的数值（包括试剂、前驱体、容器、温度等合成条件）。
问题中作为条件信息的部分需要尽可能的在问题中明确而不是隐晦（你要考虑到博士们拿到问题的时候并不知道上文中的信息，所以类似“基于给定的材料结构和性能信息”这种问法应该尽量避免）。

请先生成6个问题示例，6个问题的语言一半是中文，一半是英文，再挑选2个最好的问题示例并遵循如下格式输出：
```json
{
  "selected_questions": [
    {
      "question_id": 1,
      "question_text": "问题1的完整内容...",
    },
    {
      "question_id": 2,
      "question_text": "问题2的完整内容...",
    },
  ]
}
"""
    instruction = instruction.replace("{paper_info}", paper_info)
    messages = [
        {"role": "system", "content": ""},
        {"role": "user", "content": instruction},
    ]
    _response = utils.get_response_from_llm(messages, model_name="deepseek-v3", max_retries=max_retries, initial_backoff=initial_backoff)
    # The helper reports failures as sentinel strings; hand them to the caller untouched.
    if _response in ('apierror', 'unexpectederror'):
        return _response

    # Extract the fenced JSON section from the response.
    json_match = re.search(r'```json\s*(.*?)\s*```', _response, re.DOTALL)
    if json_match is None:
        # No fenced JSON found: return the raw response for inspection.
        return {"error": "No JSON format found in response", "raw_response": _response}

    json_str = json_match.group(1)
    # Candidates are tried from strictest to most lenient:
    #   1. the payload exactly as returned;
    #   2. with newline/CR/tab characters removed (raw control characters
    #      inside JSON strings are rejected by the strict parser);
    #   3. additionally with trailing commas before '}' / ']' removed — the
    #      format example in the prompt itself contains such commas, so the
    #      model is likely to copy them.
    without_ctrl = re.sub(r'[\n\r\t]', '', json_str)
    candidates = (
        json_str,
        without_ctrl,
        re.sub(r',\s*([}\]])', r'\1', without_ctrl),
    )
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return {"error": "Failed to parse JSON response", "raw_response": _response}


def generate_function_call(messages, tools, max_retries=3, initial_backoff=1.0):
    """Wrap the question in a tool-calling instruction and query the "qwq-32b" model.

    The content of the first message is treated as the question: it is
    embedded into a fixed instruction that tells the model to think the
    question through and issue several tool calls at once instead of
    answering directly.

    NOTE: ``messages[0]["content"]`` is overwritten in place with the wrapped
    instruction, so the caller's list is modified by this call.

    Args:
        messages: Chat messages; ``messages[0]["content"]`` must be the question text (str).
        tools: Tool definitions, forwarded unchanged to ``utils.get_response_from_qwq``.
        max_retries: Forwarded to ``utils.get_response_from_qwq``.
        initial_backoff: Forwarded to ``utils.get_response_from_qwq``.

    Returns:
        The ``(response, functions)`` pair produced by ``utils.get_response_from_qwq``.
    """
    instruction = """
# 问题
{question}

# 指令
在准确的回答上述问题之前，你只有现在这一次机会允许你调用工具以获取更多信息。
请尽可能深入思考上述问题，并尽可能的调用多个提供给你的工具查询该问题的相关信息，而不是直接回答该问题。
因此，你需要在回答中一次给出多个经过思考后的工具调用，以便更好地回答上述问题。
思考和回答时使用和问题相同的语言。
"""
    # In-place update of the caller's first message; kept as-is because
    # callers may rely on seeing the wrapped prompt afterwards.
    messages[0]["content"] = instruction.replace("{question}", messages[0]["content"])

    _response, functions = utils.get_response_from_qwq(messages, model_name="qwq-32b", tools=tools, max_retries=max_retries, initial_backoff=initial_backoff)
    return _response, functions


def generate_obs_response(messages, max_retries=3, initial_backoff=1.0):
    """Query DeepSeek-R1 with ``messages`` and return its reasoning and answer.

    Args:
        messages: Chat messages, forwarded unchanged to
            ``utils.get_response_from_deepseek_r1`` (called with ``prefix=False``).
        max_retries: Forwarded to ``utils.get_response_from_deepseek_r1``.
        initial_backoff: Forwarded to ``utils.get_response_from_deepseek_r1``.

    Returns:
        The ``(reasoning_content, response)`` pair produced by the helper.
    """
    _reasoning_content, response = utils.get_response_from_deepseek_r1(messages, prefix=False, max_retries=max_retries, initial_backoff=initial_backoff)
    return _reasoning_content, response