213 lines
7.7 KiB
Python
Executable File
213 lines
7.7 KiB
Python
Executable File
"""
|
||
CIF Utilities Module
|
||
|
||
This module provides basic functions for handling CIF (Crystallographic Information File) files,
|
||
which are commonly used in materials science for representing crystal structures.
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import os
|
||
from ase.io import read
|
||
import tempfile
|
||
from typing import Optional, Tuple
|
||
from ase import Atoms
|
||
from ...core.config import material_config
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
def read_cif_txt_file(file_path):
    """
    Load a CIF file as text.

    Args:
        file_path: Path to the CIF file

    Returns:
        The file's full text (decoded as UTF-8), or None when the file
        could not be opened or read; the failure is logged, not raised.
    """
    content = None
    try:
        with open(file_path, mode='r', encoding='utf-8') as handle:
            content = handle.read()
    except Exception as e:
        # Best-effort read: report the problem and fall through to None.
        logger.error(f"Error reading file {file_path}: {e}")
    return content
|
||
|
||
def extract_cif_info(path: str, fields_name: list):
    """
    Extract specific fields from the CIF description JSON file.

    Args:
        path: Path to the JSON file containing CIF information
        fields_name: List of field categories to extract. Use 'all_fields' to
            extract every category. Other options are 'basic_fields',
            'energy_electronic_fields' and 'metal_magentic_fields' (the
            correctly spelled 'metal_magnetic_fields' is accepted as an
            alias). A single category may also be passed as a plain string.
            Unknown category names are ignored.

    Returns:
        Dictionary mapping each selected field name to its value in the JSON
        document, or to '' when the document has no such key. An empty
        selection yields an empty dictionary.
    """
    basic_fields = ['formula_pretty', 'chemsys', 'composition', 'elements', 'symmetry', 'nsites', 'volume', 'density']
    energy_electronic_fields = ['formation_energy_per_atom', 'energy_above_hull', 'is_stable', 'efermi', 'cbm', 'vbm', 'band_gap', 'is_gap_direct']
    metal_magentic_fields = ['is_metal', 'is_magnetic', "ordering", 'total_magnetization', 'num_magnetic_sites']

    # Explicit lookup table: only real category names resolve to fields.
    # (Looking names up in locals() let arbitrary local variable names such
    # as 'path' leak into the selection.)
    categories = {
        'basic_fields': basic_fields,
        'energy_electronic_fields': energy_electronic_fields,
        'metal_magentic_fields': metal_magentic_fields,  # historical spelling, kept for callers
        'metal_magnetic_fields': metal_magentic_fields,
    }

    # Tolerate a bare category name and a missing/empty selection.
    if isinstance(fields_name, str):
        fields_name = [fields_name]
    requested = list(fields_name or [])

    if 'all_fields' in requested:
        selected_fields = basic_fields + energy_electronic_fields + metal_magentic_fields
    else:
        selected_fields = []
        for category in requested:
            selected_fields.extend(categories.get(category, []))

    with open(path, 'r', encoding='utf-8') as f:
        docs = json.load(f)

    return {field_name: docs.get(field_name, '') for field_name in selected_fields}
|
||
|
||
def remove_symmetry_equiv_xyz(cif_content):
    """
    Remove symmetry operations section from CIF file content.

    This is often useful when working with CIF files in certain visualization tools
    or when focusing on the basic structure without symmetry operations.

    A ``loop_`` whose header tags include ``_symmetry_equiv_pos_as_xyz`` is
    dropped together with its value rows. Removal stops at the first line
    that begins another construct: a new ``loop_``, a ``data_`` block header,
    or any data item (a line starting with ``_``). Blank lines that directly
    follow the removed rows are removed with them. All other lines are kept
    unchanged.

    Args:
        cif_content: CIF file content string

    Returns:
        Cleaned CIF content string with symmetry operations removed
    """
    lines = cif_content.split('\n')
    total = len(lines)
    kept = []

    idx = 0
    while idx < total:
        if lines[idx].strip() != 'loop_':
            # Not the start of a loop: keep the line as is.
            kept.append(lines[idx])
            idx += 1
            continue

        # Collect the header tags that directly follow the loop_ keyword.
        header_end = idx + 1
        while header_end < total and lines[header_end].strip().startswith('_'):
            header_end += 1
        tags = [tag_line.strip() for tag_line in lines[idx + 1:header_end]]

        if not any('_symmetry_equiv_pos_as_xyz' in tag for tag in tags):
            # Some other loop: keep the loop_ line; its tags and rows are
            # kept by the following iterations.
            kept.append(lines[idx])
            idx += 1
            continue

        # Symmetry loop: skip the header, then skip value rows until the next
        # construct starts. Stopping at any tag line (not only _atom_site_*)
        # keeps standalone data items that follow the loop from being lost.
        idx = header_end
        while idx < total:
            stripped = lines[idx].strip()
            if stripped == 'loop_' or stripped.startswith(('_', 'data_')):
                break
            idx += 1

    return '\n'.join(kept)
|
||
|
||
def _structure_format_from_path(path: str) -> Optional[str]:
    """Map a file extension to a structure format name, or None if unrecognised."""
    ext = os.path.splitext(path)[1].lower().lstrip('.')
    return {'cif': 'cif', 'xyz': 'xyz', 'vasp': 'vasp', 'poscar': 'vasp'}.get(ext)


def _structure_format_from_content(content: str) -> str:
    """Guess the structure format from raw text, defaulting to 'cif'."""
    # CIF text normally contains a "data_" block header and "_cell_" items.
    if "data_" in content and "_cell_" in content:
        return "cif"

    rows = content.strip().split('\n')
    # XYZ text starts with a line holding only the atom count.
    if rows[0].strip().isdigit():
        return "xyz"
    # POSCAR/VASP text: comment line, scale line, then three lattice-vector
    # lines of three values each.
    if len(rows) > 5 and all(len(row.split()) == 3 for row in rows[2:5]):
        return "vasp"
    return "cif"


def read_structure_from_file_name_or_content_string(file_name_or_content_string: str, format_type: str = "auto") -> Tuple[str, str]:
    """
    Resolve a structure input that is either a file name or the content itself.

    The argument is tried, in order, as:
      1. a path to an existing file;
      2. a file name inside ``material_config.TEMP_ROOT``;
      3. the structure content itself.

    When ``format_type`` is "auto" and a file was found, the format is taken
    from the file extension (cif, xyz, vasp, poscar). If the extension is not
    recognised, or the input is raw content, the format is guessed from the
    text and falls back to "cif" when nothing matches.

    Args:
        file_name_or_content_string: File name/path or structure content string
        format_type: Structure format; "auto" requests automatic detection,
            any other value is returned unchanged

    Returns:
        tuple: (content string, resolved format type)
    """
    candidate = file_name_or_content_string

    # Locate the backing file, if any: first as given, then under TEMP_ROOT.
    source_path = None
    if os.path.isfile(candidate):
        source_path = candidate
    else:
        temp_path = os.path.join(material_config.TEMP_ROOT, candidate)
        if os.path.isfile(temp_path):
            source_path = temp_path

    if source_path is None:
        # Not a file: treat the argument as the structure text itself.
        content = candidate
        format_from_name = None
    else:
        with open(source_path, 'r', encoding='utf-8') as f:
            content = f.read()
        format_from_name = _structure_format_from_path(source_path)

    if format_type == "auto":
        # Prefer the extension; otherwise inspect the text instead of
        # assuming CIF, so files such as "POSCAR" or "mol.txt" are
        # still identified.
        format_type = format_from_name or _structure_format_from_content(content)

    return content, format_type
|
||
|
||
def convert_structure(input_format: str='cif', content: str=None) -> Optional[Atoms]:
    """
    Convert structure text into an ASE Atoms object.

    The text is written to a temporary file whose suffix is ``input_format``
    and then parsed with ``ase.io.read``. The temporary file is removed
    whether or not parsing succeeds.

    Args:
        input_format: Input format (cif, xyz, vasp, ...); used as the
            temporary file's extension
        content: Structure content string

    Returns:
        ASE Atoms object, or None if the conversion fails (the error is logged)
    """
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile(suffix=f".{input_format}", mode="w", delete=False) as tmp_file:
            # Record the path before writing so a failed write is cleaned up too.
            tmp_path = tmp_file.name
            tmp_file.write(content)

        return read(tmp_path)
    except Exception as e:
        logger.error(f"Failed to convert structure: {str(e)}")
        return None
    finally:
        # delete=False means nothing removes the file for us; do it on every
        # exit path so failed conversions do not leak temporary files.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError as cleanup_error:
                logger.warning(f"Could not remove temporary file {tmp_path}: {cleanup_error}")
|