multi_mcp/sci_mcp/material_mcp/support/utils.py

"""
CIF Utilities Module

This module provides basic functions for handling CIF (Crystallographic Information File) files,
which are commonly used in materials science for representing crystal structures.
"""

import json
import logging
import os
from ase.io import read
import tempfile
from typing import Optional, Tuple
from ase import Atoms
from ...core.config import material_config

logger = logging.getLogger(__name__)

def read_cif_txt_file(file_path):
    """
    Load a CIF file from disk and hand back its text.

    Args:
        file_path: Location of the CIF file to read (decoded as UTF-8)

    Returns:
        The full file content as a string, or None when the file cannot be
        opened or read (the failure is logged, not raised)
    """
    try:
        with open(file_path, mode='r', encoding='utf-8') as cif_handle:
            cif_text = cif_handle.read()
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        logger.error(f"Error reading file {file_path}: {e}")
        return None
    return cif_text

def extract_cif_info(path: str, fields_name: list):
    """
    Extract selected field categories from the CIF description JSON file.

    Args:
        path: Path to the JSON file containing CIF information.
        fields_name: Category names to extract. Recognised categories are
            'basic_fields', 'energy_electronic_fields' and
            'metal_magentic_fields' (the correctly spelled
            'metal_magnetic_fields' is accepted as an alias). If 'all_fields'
            appears anywhere in the list, every category is extracted. A
            single category name may also be passed as a plain string.
            Unknown category names are ignored.

    Returns:
        Dictionary mapping each selected field to its value in the JSON
        document, or to '' when the document has no such field. The
        dictionary is empty when no known category is requested.
    """
    basic_fields = ['formula_pretty', 'chemsys', 'composition', 'elements', 'symmetry', 'nsites', 'volume', 'density']
    energy_electronic_fields = ['formation_energy_per_atom', 'energy_above_hull', 'is_stable', 'efermi', 'cbm', 'vbm', 'band_gap', 'is_gap_direct']
    metal_magentic_fields = ['is_metal', 'is_magnetic', "ordering", 'total_magnetization', 'num_magnetic_sites']

    # Explicit lookup table: only these names are valid categories. (Resolving
    # names through locals() would let a caller pull in unrelated variables
    # such as `path`, whose characters would then be treated as field names.)
    categories = {
        'basic_fields': basic_fields,
        'energy_electronic_fields': energy_electronic_fields,
        'metal_magentic_fields': metal_magentic_fields,  # historical spelling, kept for callers
        'metal_magnetic_fields': metal_magentic_fields,
    }

    # Normalise the request so that None, an empty list and a bare string
    # are all handled without raising.
    if fields_name is None:
        requested = []
    elif isinstance(fields_name, str):
        requested = [fields_name]
    else:
        requested = list(fields_name)

    if 'all_fields' in requested:
        selected_fields = basic_fields + energy_electronic_fields + metal_magentic_fields
    else:
        selected_fields = []
        for category in requested:
            selected_fields.extend(categories.get(category, []))

    # JSON text is UTF-8 by specification; do not rely on the locale default.
    with open(path, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    return {field_name: docs.get(field_name, '') for field_name in selected_fields}

def remove_symmetry_equiv_xyz(cif_content):
    """
    Remove the symmetry-operations loop from CIF file content.

    This is often useful when working with CIF files in certain visualization tools
    or when focusing on the basic structure without symmetry operations.

    A loop is treated as the symmetry-operations loop when one of its header
    tags contains '_symmetry_equiv_pos_as_xyz' (legacy name) or
    '_space_group_symop_operation_xyz' (current CIF core name). The 'loop_'
    keyword, the loop's header tags and its data rows are removed; removal
    stops at the next data item (a line starting with '_'), the next 'loop_'
    or the next 'data_' block, so items that follow the loop are preserved.
    Keyword and tag matching is case-insensitive.

    Args:
        cif_content: CIF file content string

    Returns:
        Cleaned CIF content string with symmetry operations removed
    """
    symmetry_tags = ('_symmetry_equiv_pos_as_xyz', '_space_group_symop_operation_xyz')

    lines = cif_content.split('\n')
    total = len(lines)
    output_lines = []

    i = 0
    while i < total:
        # Anything that does not open a loop is copied through untouched.
        if lines[i].strip().lower() != 'loop_':
            output_lines.append(lines[i])
            i += 1
            continue

        # Collect the loop header: the run of tag lines right after 'loop_'.
        j = i + 1
        while j < total and lines[j].strip().startswith('_'):
            j += 1
        header_tags = [lines[k].strip().lower() for k in range(i + 1, j)]

        is_symmetry_loop = any(
            symmetry_tag in tag for tag in header_tags for symmetry_tag in symmetry_tags
        )
        if not is_symmetry_loop:
            # Not the symmetry loop: keep 'loop_'; its tags and rows are
            # copied by the following iterations.
            output_lines.append(lines[i])
            i += 1
            continue

        # Skip the data rows. A row can never start with '_' (that would be a
        # tag), so the first tag, 'loop_' or 'data_' line ends the loop.
        while j < total:
            candidate = lines[j].strip().lower()
            if candidate == 'loop_' or candidate.startswith(('data_', '_')):
                break
            j += 1
        i = j

    return '\n'.join(output_lines)

# File extensions that identify a structure format directly.
_STRUCTURE_EXTENSION_FORMATS = {'cif': 'cif', 'xyz': 'xyz', 'vasp': 'vasp', 'poscar': 'vasp'}


def _guess_structure_format_from_content(content: str) -> str:
    """Guess 'cif', 'xyz' or 'vasp' from raw structure text; defaults to 'cif'."""
    # CIF files normally contain both a "data_" block header and "_cell_" items.
    if "data_" in content and "_cell_" in content:
        return "cif"

    lines = content.strip().split('\n')
    # XYZ files start with the atom count on the first line.
    if lines[0].strip().isdigit():
        return "xyz"
    # POSCAR/VASP: comment line, scale factor, then three 3-component lattice vectors.
    if len(lines) > 5 and all(len(line.split()) == 3 for line in lines[2:5]):
        return "vasp"
    return "cif"


def _read_structure_file(path: str, format_type: str) -> Tuple[str, str]:
    """Read a structure file as UTF-8 and resolve an "auto" format for it."""
    with open(path, 'r', encoding='utf-8') as f:
        content = f.read()

    if format_type == "auto":
        ext = os.path.splitext(path)[1].lower().lstrip('.')
        # A known extension wins; otherwise (e.g. a file simply named
        # "POSCAR") inspect the content instead of blindly assuming CIF.
        format_type = _STRUCTURE_EXTENSION_FORMATS.get(ext) or _guess_structure_format_from_content(content)
    return content, format_type


def read_structure_from_file_name_or_content_string(file_name_or_content_string: str, format_type: str = "auto") -> Tuple[str, str]:
    """
    Resolve a structure input that is either a file name or the content itself.

    The argument is tried, in order, as:
      1. a path to an existing file (used as given),
      2. a file name inside material_config.TEMP_ROOT,
      3. the structure content itself.

    When format_type is "auto", the format of a file is taken from its
    extension ('cif', 'xyz', 'vasp', 'poscar'); for a file with any other
    extension, and for direct content, it is guessed from the text, falling
    back to 'cif'. An explicit format_type is returned unchanged.

    Args:
        file_name_or_content_string: File name/path or structure content string
        format_type: Structure format; "auto" requests automatic detection

    Returns:
        tuple: (content string, resolved format type)
    """
    # 1. A path that already points at an existing file.
    if os.path.isfile(file_name_or_content_string):
        return _read_structure_file(file_name_or_content_string, format_type)

    # 2. A bare file name stored in the temporary directory.
    temp_path = os.path.join(material_config.TEMP_ROOT, file_name_or_content_string)
    if os.path.isfile(temp_path):
        return _read_structure_file(temp_path, format_type)

    # 3. Not a file: treat the argument as the structure content itself.
    content = file_name_or_content_string
    if format_type == "auto":
        format_type = _guess_structure_format_from_content(content)
    return content, format_type

def convert_structure(input_format: str='cif', content: str=None) -> Optional[Atoms]:
    """
    Convert structure text into an ASE Atoms object.

    The content is written to a temporary file whose suffix is
    ".<input_format>", and that file is parsed with ase.io.read. The
    temporary file is always removed afterwards, whether or not parsing
    succeeds.

    Args:
        input_format: Input format (cif, xyz, vasp, ...); used as the
            temporary file's extension
        content: Structure content string

    Returns:
        The object returned by ase.io.read, or None if the conversion fails
        (the error is logged rather than raised)
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=f".{input_format}", mode="w", delete=False) as tmp_file:
            # Record the path before writing so a failed write is still cleaned up.
            tmp_path = tmp_file.name
            tmp_file.write(content)

        return read(tmp_path)
    except Exception as e:
        logger.error(f"Failed to convert structure: {str(e)}")
        return None
    finally:
        # delete=False means the file outlives the `with` block; remove it on
        # every path so failed conversions do not leak temporary files.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                # A cleanup failure must not discard a successfully parsed structure.
                logger.warning(f"Could not remove temporary file {tmp_path}: {cleanup_error}")