"""
RDKit Tools Module

This module provides tools for molecular analysis, manipulation, and visualization
using the RDKit library. It includes functions for calculating molecular descriptors,
generating molecular fingerprints, analyzing molecular structures, and more.
"""
|
||
from __future__ import annotations
|
||
|
||
import logging
|
||
from typing import Dict, List, Union, Optional, Any, Tuple
|
||
|
||
import numpy as np
|
||
from rdkit import Chem
|
||
from rdkit.Chem import AllChem, Descriptors, rdMolDescriptors, Draw
|
||
from rdkit.Chem import rdmolops, rdDetermineBonds
|
||
from rdkit.DataStructs import ConvertToNumpyArray
|
||
from ...core.llm_tools import llm_tool
|
||
|
||
# Set up logging: configure the root logger when this module is imported,
# at INFO level, rendering each record as "timestamp - level - message".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
||
|
||
|
||
def _preprocess_smiles(smiles: str) -> str:
|
||
"""
|
||
Preprocess SMILES string by removing whitespace and special characters.
|
||
|
||
Args:
|
||
smiles: SMILES string to preprocess
|
||
|
||
Returns:
|
||
Preprocessed SMILES string
|
||
"""
|
||
return smiles.replace(" ", "").replace("\n", "").replace("\'", "").replace("\"", "").replace(".", "")
|
||
|
||
|
||
def _validate_molecule(smiles: str) -> Chem.Mol:
|
||
"""
|
||
Validate SMILES string and convert to RDKit molecule.
|
||
|
||
Args:
|
||
smiles: SMILES string to validate
|
||
|
||
Returns:
|
||
RDKit molecule object
|
||
|
||
Raises:
|
||
ValueError: If SMILES string is invalid
|
||
"""
|
||
mol = Chem.MolFromSmiles(smiles)
|
||
if mol is None:
|
||
raise ValueError("Invalid SMILES string.")
|
||
return mol
|
||
|
||
|
||
#------------------------------------------------------------------------------
|
||
# Molecular Descriptors
|
||
#------------------------------------------------------------------------------
|
||
|
||
@llm_tool(name="calculate_molecular_properties_rdkit",
          description="Calculate basic molecular properties for a chemical compound using RDKit library")
def calculate_molecular_properties_rdkit(smiles: str) -> str:
    """
    Calculate basic molecular properties for a chemical compound.

    The report covers basic information (formula, average molecular weight,
    monoisotopic exact mass, heavy-atom count), physicochemical properties
    (LogP, TPSA, hydrogen bond acceptors/donors) and structural features
    (rotatable bonds, ring count, aromatic ring count).

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.

    Returns:
        A formatted Markdown string with the calculated molecular properties,
        or a string starting with "Error:" if anything goes wrong.

    Examples:
        >>> calculate_molecular_properties_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns properties of Aspirin
    """
    try:
        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule
        mol = _validate_molecule(smiles)

        # Calculate properties
        formula = rdMolDescriptors.CalcMolFormula(mol)
        # "Molecular weight" in g/mol is the average molecular weight
        # (MolWt). ExactMolWt is the monoisotopic mass, which differs
        # noticeably for e.g. Cl/Br-containing compounds, so it is reported
        # separately under its own label.
        mol_weight = Descriptors.MolWt(mol)
        exact_mass = Descriptors.ExactMolWt(mol)
        tpsa = Descriptors.TPSA(mol)
        logp = Descriptors.MolLogP(mol)
        hba = Descriptors.NumHAcceptors(mol)
        hbd = Descriptors.NumHDonors(mol)
        rotatable_bonds = Descriptors.NumRotatableBonds(mol)
        heavy_atoms = mol.GetNumHeavyAtoms()
        ring_count = Descriptors.RingCount(mol)
        aromatic_rings = Descriptors.NumAromaticRings(mol)

        # Format output
        markdown = f"""## Basic Molecular Properties

**Input SMILES:** `{smiles}`

### Basic Information
- **Formula:** {formula}
- **Molecular Weight:** {mol_weight:.4f} g/mol
- **Exact Mass (monoisotopic):** {exact_mass:.4f} Da
- **Heavy Atom Count:** {heavy_atoms}

### Physicochemical Properties
- **LogP:** {logp:.2f}
- **Topological Polar Surface Area (TPSA):** {tpsa:.2f} Ų
- **H-Bond Acceptors:** {hba}
- **H-Bond Donors:** {hbd}

### Structural Features
- **Rotatable Bonds:** {rotatable_bonds}
- **Ring Count:** {ring_count}
- **Aromatic Rings:** {aromatic_rings}
"""
        return markdown

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="calculate_drug_likeness_rdkit",
          description="Calculate drug-likeness properties for a chemical compound using RDKit library")
def calculate_drug_likeness_rdkit(smiles: str) -> str:
    """
    Calculate drug-likeness properties for a chemical compound.

    This function evaluates whether a molecule satisfies three drug-likeness
    rule sets: Lipinski's Rule of Five (at most one violation allowed), the
    Ghose Filter and the Veber Filter. No PAINS substructure screening is
    performed here.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.

    Returns:
        A formatted Markdown string with the drug-likeness assessment results,
        or a string starting with "Error:" if anything goes wrong.

    Examples:
        >>> calculate_drug_likeness_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns drug-likeness assessment for Aspirin
    """
    try:
        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule
        mol = _validate_molecule(smiles)

        # Properties for Lipinski's Rule of Five.
        # The weight thresholds are defined on the average molecular weight
        # (MolWt), not on the monoisotopic mass returned by ExactMolWt.
        mol_weight = Descriptors.MolWt(mol)
        logp = Descriptors.MolLogP(mol)
        hba = Descriptors.NumHAcceptors(mol)
        hbd = Descriptors.NumHDonors(mol)

        # Properties for Ghose Filter
        molar_refractivity = Descriptors.MolMR(mol)
        # Count all atoms, including hydrogens that are only implicit.
        n_atoms = mol.GetNumAtoms(onlyExplicit=False)

        # Properties for Veber Filter
        tpsa = Descriptors.TPSA(mol)
        rotatable_bonds = Descriptors.NumRotatableBonds(mol)

        # Evaluate every criterion exactly once; both the verdicts and the
        # rendered tick marks are derived from these booleans so they can
        # never disagree.
        lip_mw_ok = mol_weight <= 500
        lip_logp_ok = logp <= 5
        lip_hba_ok = hba <= 10
        lip_hbd_ok = hbd <= 5
        lipinski_violations = sum(
            not ok for ok in (lip_mw_ok, lip_logp_ok, lip_hba_ok, lip_hbd_ok)
        )
        lipinski_compliant = lipinski_violations <= 1

        ghose_mw_ok = 160 <= mol_weight <= 480
        ghose_logp_ok = -0.4 <= logp <= 5.6
        ghose_mr_ok = 40 <= molar_refractivity <= 130
        ghose_atoms_ok = 20 <= n_atoms <= 70
        ghose_compliant = ghose_mw_ok and ghose_logp_ok and ghose_mr_ok and ghose_atoms_ok

        veber_rot_ok = rotatable_bonds <= 10
        veber_tpsa_ok = tpsa <= 140
        veber_compliant = veber_rot_ok and veber_tpsa_ok

        # Overall: Lipinski must hold, plus at least one of Ghose / Veber.
        overall_ok = lipinski_compliant and (ghose_compliant or veber_compliant)

        def mark(passed: bool) -> str:
            return '✓' if passed else '✗'

        def verdict(passed: bool) -> str:
            return 'Compliant' if passed else 'Non-compliant'

        rules_phrase = (
            'complies with most drug-likeness rules' if overall_ok
            else 'does not comply with major drug-likeness rules'
        )
        likelihood_phrase = 'is likely' if overall_ok else 'is unlikely'

        # Format output
        markdown = f"""## Drug-likeness Assessment

**Input SMILES:** `{smiles}`

### Lipinski's Rule of Five
- **Molecular Weight ≤ 500:** {mol_weight:.1f} g/mol {mark(lip_mw_ok)}
- **LogP ≤ 5:** {logp:.2f} {mark(lip_logp_ok)}
- **H-Bond Acceptors ≤ 10:** {hba} {mark(lip_hba_ok)}
- **H-Bond Donors ≤ 5:** {hbd} {mark(lip_hbd_ok)}
- **Number of Violations:** {lipinski_violations}
- **Conclusion:** {verdict(lipinski_compliant)} with Lipinski's Rule of Five

### Ghose Filter
- **Molecular Weight Range [160, 480]:** {mol_weight:.1f} g/mol {mark(ghose_mw_ok)}
- **LogP Range [-0.4, 5.6]:** {logp:.2f} {mark(ghose_logp_ok)}
- **Molar Refractivity Range [40, 130]:** {molar_refractivity:.2f} {mark(ghose_mr_ok)}
- **Atom Count Range [20, 70]:** {n_atoms} {mark(ghose_atoms_ok)}
- **Conclusion:** {verdict(ghose_compliant)} with Ghose Filter

### Veber Filter
- **Rotatable Bonds ≤ 10:** {rotatable_bonds} {mark(veber_rot_ok)}
- **Polar Surface Area ≤ 140 Ų:** {tpsa:.2f} Ų {mark(veber_tpsa_ok)}
- **Conclusion:** {verdict(veber_compliant)} with Veber Filter

### Overall Assessment
This compound {rules_phrase} and {likelihood_phrase} to be a good drug candidate.
"""
        return markdown

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="calculate_topological_descriptors_rdkit",
          description="Calculate topological descriptors for a chemical compound using RDKit library")
def calculate_topological_descriptors_rdkit(smiles: str) -> str:
    """
    Report connectivity-based (topological) descriptors for a compound.

    The SMILES is cleaned and parsed, then RDKit's valence connectivity
    indices (Chi0v-Chi4v), shape indices (Kappa1-Kappa3), Balaban J and
    Bertz CT are computed and rendered as Markdown. No 3D coordinates are
    generated or used.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.

    Returns:
        A formatted Markdown string with the calculated topological
        descriptors, or "Error: ..." if any step raises.

    Examples:
        >>> calculate_topological_descriptors_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns topological descriptors for Aspirin
    """
    try:
        cleaned = _preprocess_smiles(smiles)
        mol = _validate_molecule(cleaned)

        # Descriptor families are evaluated in the order they are printed.
        chi_functions = (
            Descriptors.Chi0v,
            Descriptors.Chi1v,
            Descriptors.Chi2v,
            Descriptors.Chi3v,
            Descriptors.Chi4v,
        )
        kappa_functions = (
            Descriptors.Kappa1,
            Descriptors.Kappa2,
            Descriptors.Kappa3,
        )
        chi_values = [compute(mol) for compute in chi_functions]
        kappa_values = [compute(mol) for compute in kappa_functions]
        balaban_j = Descriptors.BalabanJ(mol)
        bertz_ct = Descriptors.BertzCT(mol)

        # Assemble the report line by line.
        report = [
            "## Topological Descriptors",
            "",
            f"**Input SMILES:** `{cleaned}`",
            "",
            "### Connectivity Indices (Chi)",
        ]
        report.extend(
            f"- **Chi{order}v:** {value:.4f}"
            for order, value in enumerate(chi_values)
        )
        report.extend(["", "### Shape Indices (Kappa)"])
        report.extend(
            f"- **Kappa{order}:** {value:.4f}"
            for order, value in enumerate(kappa_values, start=1)
        )
        report.extend([
            "",
            "### Other Topological Indices",
            f"- **Balaban J Index:** {balaban_j:.4f}",
            f"- **Bertz CT Index:** {bertz_ct:.4f}",
            "",  # keeps the trailing newline of the report
        ])
        return "\n".join(report)

    except Exception as e:
        return f"Error: {e}"
|
||
|
||
|
||
#------------------------------------------------------------------------------
|
||
# Molecular Fingerprints
|
||
#------------------------------------------------------------------------------
|
||
|
||
@llm_tool(name="generate_molecular_fingerprints_rdkit",
          description="Generate different types of molecular fingerprints for a chemical compound using RDKit library")
def generate_molecular_fingerprints_rdkit(
    smiles: str,
    fingerprint_type: str = "morgan",
    radius: int = 2,
    n_bits: int = 1024
) -> str:
    """
    Generate different types of molecular fingerprints for a chemical compound.

    "morgan", "maccs" and "rdkit" produce bit vectors, for which the bit
    count, number of set bits, density and the first 100 bits are reported.
    "atompair" and "topological" produce count-based fingerprints, for which
    no bit pattern is shown.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.
        fingerprint_type: Type of fingerprint to generate (case-insensitive).
                Options: "morgan", "maccs", "atompair", "topological",
                "rdkit". Default: "morgan".
        radius: Radius for Morgan fingerprint (only used if fingerprint_type
                is "morgan"). Must be non-negative. Default: 2.
        n_bits: Number of bits in the fingerprint (only used for "morgan" and
                "rdkit"). Must be positive. Default: 1024.

    Returns:
        A formatted Markdown string with the generated fingerprint
        information, or a string starting with "Error:" on invalid input or
        failure.

    Examples:
        >>> generate_molecular_fingerprints_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O", "morgan", 2, 1024)
        # Returns Morgan fingerprint for Aspirin
    """
    try:
        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule
        mol = _validate_molecule(smiles)

        fp_type = fingerprint_type.lower()

        # Reject sizes that cannot yield a meaningful fingerprint before
        # calling into RDKit (a zero-length vector would also make the
        # density below undefined).
        if fp_type in ("morgan", "rdkit") and n_bits <= 0:
            return "Error: n_bits must be a positive integer."
        if fp_type == "morgan" and radius < 0:
            return "Error: radius must be a non-negative integer."

        # Generate fingerprint based on type
        if fp_type == "morgan":
            fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            fingerprint_name = f"Morgan (ECFP{radius*2})"
        elif fp_type == "maccs":
            fingerprint = AllChem.GetMACCSKeysFingerprint(mol)
            fingerprint_name = "MACCS Keys"
        elif fp_type == "atompair":
            fingerprint = AllChem.GetAtomPairFingerprint(mol)
            fingerprint_name = "Atom Pair"
        elif fp_type == "topological":
            fingerprint = AllChem.GetTopologicalTorsionFingerprint(mol)
            fingerprint_name = "Topological Torsion"
        elif fp_type == "rdkit":
            fingerprint = Chem.RDKFingerprint(mol, fpSize=n_bits)
            fingerprint_name = "RDKit"
        else:
            return f"Error: Unsupported fingerprint type '{fingerprint_type}'. Supported types: morgan, maccs, atompair, topological, rdkit."

        # Summarise the fingerprint. All report text is produced directly in
        # English so nothing depends on a later translation step.
        if fp_type in ("morgan", "maccs", "rdkit"):
            # Bit vectors expose their size and population count directly.
            num_bits = fingerprint.GetNumBits()
            num_on_bits = fingerprint.GetNumOnBits()
            bit_info = (
                f"- **Total Bits:** {num_bits}\n"
                f"- **On Bits:** {num_on_bits}\n"
                f"- **Density:** {num_on_bits/num_bits:.4f}"
            )

            # Show at most the first 100 bits to keep the report readable.
            binary = fingerprint.ToBitString()
            binary_display = binary[:100] + "..." if len(binary) > 100 else binary
        else:
            # Count-based fingerprints have no fixed bit string to display.
            bit_info = "- Non-bit vector fingerprint, contains count information"
            binary_display = "Not applicable for this fingerprint type"

        # Format output
        markdown = f"""## Molecular Fingerprints

**Input SMILES:** `{smiles}`
**Fingerprint Type:** {fingerprint_name}

### Fingerprint Information
{bit_info}

### Fingerprint Bit Pattern (First 100 bits)
```
{binary_display}
```

### Applications
- Molecular similarity searching
- Compound clustering
- Building QSAR/QSPR models
- Virtual screening
"""
        return markdown

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="calculate_molecular_similarity_rdkit",
          description="Calculate similarity between two molecules using fingerprints with RDKit library")
def calculate_molecular_similarity_rdkit(
    smiles1: str,
    smiles2: str,
    fingerprint_type: str = "morgan",
    radius: int = 2,
    n_bits: int = 1024,
    similarity_metric: str = "tanimoto"
) -> str:
    """
    Score the fingerprint similarity of two molecules.

    Both SMILES are cleaned and parsed, the same kind of fingerprint is
    generated for each molecule, and the chosen metric is applied to the
    pair. The result is reported together with each molecule's formula and
    a coarse verbal interpretation of the score.

    Args:
        smiles1: SMILES notation of the first molecule.
        smiles2: SMILES notation of the second molecule.
        fingerprint_type: Type of fingerprint to use (case-insensitive).
                Options: "morgan", "maccs", "rdkit". Default: "morgan".
        radius: Radius for Morgan fingerprint (only used if fingerprint_type
                is "morgan"). Default: 2.
        n_bits: Number of bits in the fingerprint. Default: 1024.
        similarity_metric: Similarity metric to use (case-insensitive).
                Options: "tanimoto", "dice", "cosine". Default: "tanimoto".

    Returns:
        A formatted Markdown string with the similarity calculation results,
        or "Error: ..." for unsupported options or any failure.

    Examples:
        >>> calculate_molecular_similarity_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O", "CC(=O)OCCC(=O)O")
        # Returns similarity between Aspirin and another molecule
    """
    try:
        # Clean both inputs first, then parse both.
        smiles1 = _preprocess_smiles(smiles1)
        smiles2 = _preprocess_smiles(smiles2)
        first_mol = _validate_molecule(smiles1)
        second_mol = _validate_molecule(smiles2)
        molecules = (first_mol, second_mol)

        # Build one fingerprint per molecule (first molecule first).
        fp_kind = fingerprint_type.lower()
        if fp_kind == "morgan":
            fp_first, fp_second = (
                AllChem.GetMorganFingerprintAsBitVect(m, radius, nBits=n_bits)
                for m in molecules
            )
            fingerprint_name = f"Morgan (ECFP{radius*2})"
        elif fp_kind == "maccs":
            fp_first, fp_second = (
                AllChem.GetMACCSKeysFingerprint(m) for m in molecules
            )
            fingerprint_name = "MACCS Keys"
        elif fp_kind == "rdkit":
            fp_first, fp_second = (
                Chem.RDKFingerprint(m, fpSize=n_bits) for m in molecules
            )
            fingerprint_name = "RDKit"
        else:
            return f"Error: Unsupported fingerprint type '{fingerprint_type}'. Supported types: morgan, maccs, rdkit."

        # Look up the metric: display name plus the RDKit scoring function.
        from rdkit import DataStructs
        metric_table = {
            "tanimoto": ("Tanimoto", DataStructs.TanimotoSimilarity),
            "dice": ("Dice", DataStructs.DiceSimilarity),
            "cosine": ("Cosine", DataStructs.CosineSimilarity),
        }
        chosen = metric_table.get(similarity_metric.lower())
        if chosen is None:
            return f"Error: Unsupported similarity metric '{similarity_metric}'. Supported metrics: tanimoto, dice, cosine."
        metric_name, score_pair = chosen
        similarity = score_pair(fp_first, fp_second)

        # Formulas for the summary section.
        mol1_formula = rdMolDescriptors.CalcMolFormula(first_mol)
        mol2_formula = rdMolDescriptors.CalcMolFormula(second_mol)

        # Translate the score into the verbal band used in the conclusion.
        if similarity < 0.2:
            band = 'very low'
        elif similarity < 0.4:
            band = 'low'
        elif similarity < 0.6:
            band = 'moderate'
        elif similarity < 0.8:
            band = 'high'
        else:
            band = 'very high'

        report = [
            "## Molecular Similarity Calculation",
            "",
            "### Molecule Information",
            f"- **Molecule 1 SMILES:** `{smiles1}`",
            f"- **Molecule 1 Formula:** {mol1_formula}",
            f"- **Molecule 2 SMILES:** `{smiles2}`",
            f"- **Molecule 2 Formula:** {mol2_formula}",
            "",
            "### Similarity Results",
            f"- **Fingerprint Type:** {fingerprint_name}",
            f"- **Similarity Metric:** {metric_name}",
            f"- **Similarity Score:** {similarity:.4f} (Range: 0-1)",
            "",
            "### Similarity Interpretation",
            "- **0.0-0.2:** Very low similarity",
            "- **0.2-0.4:** Low similarity",
            "- **0.4-0.6:** Moderate similarity",
            "- **0.6-0.8:** High similarity",
            "- **0.8-1.0:** Very high similarity",
            "",
            f"**Conclusion:** These two molecules have **{band}** structural similarity.",
            "",  # keeps the trailing newline of the report
        ]
        return "\n".join(report)

    except Exception as e:
        return f"Error: {e}"
|
||
|
||
|
||
#------------------------------------------------------------------------------
|
||
# Molecular Structure Analysis
|
||
#------------------------------------------------------------------------------
|
||
|
||
@llm_tool(name="analyze_molecular_structure_rdkit",
          description="Analyze the structure of a molecule including atoms, bonds, rings, and functional groups using RDKit library")
def analyze_molecular_structure_rdkit(smiles: str) -> str:
    """
    Analyze the structure of a molecule including atoms, bonds, rings, and functional groups.

    The report contains an element-count table, any non-zero formal charges,
    bond counts by type (single/double/triple/aromatic, plus any other bond
    type that occurs), ring statistics, and the number of matches for a fixed
    set of functional-group SMARTS patterns.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.

    Returns:
        A formatted Markdown string with the structural analysis results,
        or a string starting with "Error:" if anything goes wrong.

    Examples:
        >>> analyze_molecular_structure_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns structural analysis for Aspirin
    """
    try:
        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule
        mol = _validate_molecule(smiles)

        # Get basic molecule information
        formula = rdMolDescriptors.CalcMolFormula(mol)

        # Analyze atoms: element counts and non-zero formal charges by index.
        atom_counts = {}
        formal_charges = {}
        for atom in mol.GetAtoms():
            symbol = atom.GetSymbol()
            atom_counts[symbol] = atom_counts.get(symbol, 0) + 1

            charge = atom.GetFormalCharge()
            if charge != 0:
                formal_charges[atom.GetIdx()] = (symbol, charge)

        # Analyze bonds. Counts are accumulated with .get() so that a bond
        # type outside the four standard ones (e.g. a dative bond) is tallied
        # instead of raising KeyError and aborting the whole analysis.
        bond_counts = {}
        for bond in mol.GetBonds():
            bond_type = bond.GetBondType()
            bond_counts[bond_type] = bond_counts.get(bond_type, 0) + 1

        bond_enum = Chem.rdchem.BondType
        standard_bonds = (
            ("Single", bond_enum.SINGLE),
            ("Double", bond_enum.DOUBLE),
            ("Triple", bond_enum.TRIPLE),
            ("Aromatic", bond_enum.AROMATIC),
        )
        standard_types = [bond_type for _, bond_type in standard_bonds]
        other_bonds = sorted(
            (str(bond_type), count)
            for bond_type, count in bond_counts.items()
            if bond_type not in standard_types
        )

        # Analyze rings: size distribution and fully aromatic rings.
        rings = mol.GetRingInfo().AtomRings()
        ring_sizes = {}
        aromatic_rings = 0
        for ring in rings:
            size = len(ring)
            ring_sizes[size] = ring_sizes.get(size, 0) + 1

            # A ring counts as aromatic only if every member atom is aromatic.
            if all(mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring):
                aromatic_rings += 1

        # Analyze functional groups (simplified approach) using SMARTS
        # patterns. The labels are emitted verbatim in the report.
        smarts_patterns = {
            "醇 (Alcohol)": "[OX2H;!$([OX2H][CX3]=O)]",  # excludes the hydroxyl of carboxylic acids
            "醛 (Aldehyde)": "[CX3H1](=O)[#6]",
            "酮 (Ketone)": "[#6][CX3](=O)[#6]",
            "羧酸 (Carboxylic Acid)": "[CX3](=O)[OX2H1]",
            "酯 (Ester)": "[#6][CX3](=O)[OX2][#6]",
            "醚 (Ether)": "[OD2]([CX4])([CX4])",  # oxygen bonded to two sp3 carbons; excludes ester oxygens
            "胺 (Amine)": "[NX3;H2,H1,H0;!$(NC=O)]",
            "酰胺 (Amide)": "[NX3][CX3](=[OX1])[#6]",
            "硝基 (Nitro)": "[$([NX3](=O)=O),$([NX3+](=O)[O-])]",
            "磺酸 (Sulfonic Acid)": "[$([#16X4](=[OX1])(=[OX1])([#6])[OX2H,OX1H0-]),$([#16X4+2]([OX1-])([OX1-])([#6])[OX2H,OX1H0-])]",
            "磷酸 (Phosphoric Acid)": "[$([#15X4](=[OX1])(=[OX1])([OX2H,OX1H0-])[OX2H,OX1H0-]),$([#15X4+](=[OX1])([OX1-])([OX2H,OX1H0-])[OX2H,OX1H0-])]",
            "卤素 (Halogen)": "[F,Cl,Br,I]"
        }

        functional_groups = []
        for name, smarts in smarts_patterns.items():
            pattern = Chem.MolFromSmarts(smarts)
            if not pattern:
                continue
            # One search gives both "is it present" and "how many".
            matches = mol.GetSubstructMatches(pattern)
            if matches:
                functional_groups.append((name, len(matches)))

        # Format output, one report line per list entry.
        lines = [
            "## Molecular Structure Analysis",
            "",
            f"**Input SMILES:** `{smiles}`",
            f"**Formula:** {formula}",
            "",
            "### Atom Composition",
            "| Element | Count |",
            "|---------|-------|",
        ]
        lines.extend(
            f"| {symbol} | {count} |" for symbol, count in sorted(atom_counts.items())
        )

        if formal_charges:
            lines.extend(["", "### Formal Charges"])
            for idx, (symbol, charge) in sorted(formal_charges.items()):
                sign = "+" if charge > 0 else "-"
                lines.append(f"- Atom {idx} ({symbol}): {sign}{abs(charge)}")

        lines.extend(["", "### Bond Types"])
        lines.extend(
            f"- {label} bonds: {bond_counts.get(bond_type, 0)}"
            for label, bond_type in standard_bonds
        )
        # Only present when the molecule actually contains such bonds.
        lines.extend(
            f"- Other ({type_name}) bonds: {count}" for type_name, count in other_bonds
        )

        if ring_sizes:
            lines.extend([
                "",
                "### Ring Systems",
                f"- Total rings: {len(rings)}",
                f"- Aromatic rings: {aromatic_rings}",
                "- Ring size distribution:",
            ])
            lines.extend(
                f"  - {size}-membered rings: {count}"
                for size, count in sorted(ring_sizes.items())
            )

        if functional_groups:
            lines.extend(["", "### Functional Groups"])
            lines.extend(f"- {name}: {count}" for name, count in functional_groups)

        return "\n".join(lines) + "\n"

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="generate_molecular_conformer_rdkit",
          description="Generate a 3D conformer for a molecule and calculate 3D descriptors using RDKit library")
def generate_molecular_conformer_rdkit(smiles: str, num_conformers: int = 1) -> str:
    """
    Generate a 3D conformer for a molecule and calculate 3D descriptors.

    Hydrogens are added, the requested number of conformers is embedded with
    a fixed random seed, each conformer is passed through MMFF optimization,
    and 3D descriptors (principal moments of inertia, radius of gyration,
    spherocity index, plane of best fit) are computed for the first
    conformer. Ratios of moments are reported as "N/A" when the denominator
    is zero.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.
        num_conformers: Number of conformers to generate. Must be at least 1.
                Default: 1.

    Returns:
        A formatted Markdown string with the 3D conformer generation results
        and calculated 3D descriptors, or a string starting with "Error:" on
        invalid input or failure.

    Examples:
        >>> generate_molecular_conformer_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns 3D conformer information for Aspirin
    """
    try:
        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule
        mol = _validate_molecule(smiles)

        # A non-positive count would embed nothing and then be misreported
        # as a molecule that is "too complex", so reject it explicitly.
        num_conformers = int(num_conformers)
        if num_conformers < 1:
            return "Error: num_conformers must be a positive integer."

        # Add hydrogens
        mol = Chem.AddHs(mol)

        # Generate 3D conformers (fixed seed for reproducible embedding)
        conf_ids = AllChem.EmbedMultipleConfs(
            mol,
            numConfs=num_conformers,
            randomSeed=42,
            useExpTorsionAnglePrefs=True,
            useBasicKnowledge=True
        )

        if len(conf_ids) == 0:
            return "Error: Failed to generate conformers. The molecule may be too complex or have structural issues."

        # Optimize conformers using MMFF.
        # NOTE(review): the optimizer's return code is not inspected, so a
        # conformer that could not be optimized is used as embedded.
        for conf_id in conf_ids:
            AllChem.MMFFOptimizeMolecule(mol, confId=conf_id)

        # Calculate 3D descriptors for the first conformer
        conf_id = conf_ids[0]

        # Principal moments of inertia
        pmi1 = rdMolDescriptors.CalcPMI1(mol, confId=conf_id)
        pmi2 = rdMolDescriptors.CalcPMI2(mol, confId=conf_id)
        pmi3 = rdMolDescriptors.CalcPMI3(mol, confId=conf_id)

        # Radius of gyration
        rg = rdMolDescriptors.CalcRadiusOfGyration(mol, confId=conf_id)

        # Spherocity index
        spherocity = rdMolDescriptors.CalcSpherocityIndex(mol, confId=conf_id)

        # Plane of best fit
        pbf = rdMolDescriptors.CalcPBF(mol, confId=conf_id)

        # A moment of inertia can be zero (e.g. all atoms on one axis), so
        # the ratios are guarded instead of letting ZeroDivisionError turn
        # the whole report into an error.
        flatness = pmi2 / pmi1 if pmi1 else None
        elongation = pmi3 / pmi2 if pmi2 else None

        def show_ratio(value) -> str:
            return "N/A" if value is None else f"{value:.4f}"

        # Same precedence as before: spherical, then flat, then elongated.
        if spherocity > 0.8:
            shape_type = 'Spherical'
        elif flatness is not None and flatness < 1.5:
            shape_type = 'Flat'
        elif elongation is not None and elongation > 2.0:
            shape_type = 'Elongated'
        else:
            shape_type = 'Intermediate shape'

        # Format output
        markdown = f"""## Molecular 3D Conformer Generation

**Input SMILES:** `{smiles}`
**Number of Conformers Generated:** {len(conf_ids)}

### 3D Descriptors (First Conformer)
- **Principal Moment of Inertia (PMI1):** {pmi1:.4f}
- **Principal Moment of Inertia (PMI2):** {pmi2:.4f}
- **Principal Moment of Inertia (PMI3):** {pmi3:.4f}
- **Radius of Gyration:** {rg:.4f} Å
- **Spherocity Index:** {spherocity:.4f}
- **Plane of Best Fit:** {pbf:.4f}

### Shape Analysis
- **Flatness (PMI2/PMI1):** {show_ratio(flatness)}
- **Elongation (PMI3/PMI2):** {show_ratio(elongation)}
- **Shape Type:** {shape_type}
"""
        return markdown

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="identify_scaffolds_rdkit",
          description="Identify and analyze molecular scaffolds in a compound using RDKit library")
def identify_scaffolds_rdkit(smiles: str) -> str:
    """
    Extract Murcko scaffold information for a compound.

    The SMILES is cleaned and parsed, then three scaffold representations
    are produced with RDKit's MurckoScaffold module: the scaffold of the
    parsed molecule, the scaffold SMILES computed straight from the input
    string, and the generic version of the scaffold. Atom and ring counts
    are reported alongside.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
                without any other characters.

    Returns:
        A formatted Markdown string with the scaffold analysis results,
        or "Error: ..." if any step raises.

    Examples:
        >>> identify_scaffolds_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O")
        # Returns scaffold analysis for Aspirin
    """
    try:
        smiles = _preprocess_smiles(smiles)
        mol = _validate_molecule(smiles)

        from rdkit.Chem.Scaffolds import MurckoScaffold

        # Scaffold of the parsed molecule, as a Mol and as SMILES.
        scaffold = MurckoScaffold.GetScaffoldForMol(mol)
        if scaffold:
            scaffold_smiles = Chem.MolToSmiles(scaffold)
        else:
            scaffold_smiles = "N/A"

        # Scaffold SMILES obtained directly from the input string.
        framework = MurckoScaffold.MurckoScaffoldSmiles(smiles=smiles)

        # Generic form of the scaffold.
        generic_scaffold = MurckoScaffold.MakeScaffoldGeneric(scaffold)
        if generic_scaffold:
            generic_scaffold_smiles = Chem.MolToSmiles(generic_scaffold)
        else:
            generic_scaffold_smiles = "N/A"

        # Counts for the feature section.
        total_atoms = mol.GetNumAtoms()
        if scaffold:
            scaffold_atoms = scaffold.GetNumAtoms()
            scaffold_rings = scaffold.GetRingInfo().NumRings()
        else:
            scaffold_atoms = 0
            scaffold_rings = 0

        report = [
            "## Molecular Scaffold Analysis",
            "",
            f"**Input SMILES:** `{smiles}`",
            "",
            "### Scaffold Information",
            f"- **Murcko Scaffold SMILES:** `{scaffold_smiles}`",
            f"- **Framework SMILES:** `{framework}`",
            f"- **Generic Scaffold SMILES:** `{generic_scaffold_smiles}`",
            "",
            "### Scaffold Features",
            f"- **Original Molecule Atom Count:** {total_atoms}",
            f"- **Scaffold Atom Count:** {scaffold_atoms}",
            f"- **Scaffold Ring Count:** {scaffold_rings}",
            "",  # keeps the trailing newline of the report
        ]
        return "\n".join(report)

    except Exception as e:
        return f"Error: {e}"
|
||
|
||
|
||
#------------------------------------------------------------------------------
|
||
# Molecular Modification and Conversion
|
||
#------------------------------------------------------------------------------
|
||
|
||
@llm_tool(name="convert_between_chemical_formats_rdkit",
          description="Convert between different chemical structure formats using RDKit library")
def convert_between_chemical_formats_rdkit(
    input_string: str,
    input_format: str = "smiles",
    output_format: str = "inchi"
) -> str:
    """
    Convert between different chemical structure formats.

    The input is parsed into an RDKit molecule according to input_format and
    then written out in output_format. The report also includes the
    molecule's formula and average molecular weight.

    Args:
        input_string: The chemical structure string to convert.
        input_format: The format of the input string (case-insensitive).
                Options: "smiles", "inchi", "smarts", "mol". Default: "smiles".
        output_format: The desired output format (case-insensitive).
                Options: "smiles", "inchi", "inchikey", "canonical_smiles",
                "mol". Default: "inchi".

    Returns:
        A formatted Markdown string with the conversion results, or a string
        starting with "Error:" for unsupported formats, unparsable input, an
        empty conversion result, or any other failure.

    Examples:
        >>> convert_between_chemical_formats_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O", "smiles", "inchi")
        # Returns InChI for Aspirin
    """
    try:
        source_format = input_format.lower()
        target_format = output_format.lower()

        # Create molecule based on input format
        if source_format == "smiles":
            mol = Chem.MolFromSmiles(input_string)
        elif source_format == "inchi":
            mol = Chem.MolFromInchi(input_string)
        elif source_format == "smarts":
            mol = Chem.MolFromSmarts(input_string)
        elif source_format == "mol":
            mol = Chem.MolFromMolBlock(input_string)
        else:
            return f"Error: Unsupported input format '{input_format}'. Supported formats: smiles, inchi, smarts, mol."

        if mol is None:
            return f"Error: Could not parse input string as {input_format}."

        # Convert to output format
        if target_format == "smiles":
            output_string = Chem.MolToSmiles(mol)
        elif target_format == "canonical_smiles":
            output_string = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
        elif target_format == "inchi":
            output_string = Chem.MolToInchi(mol)
        elif target_format == "inchikey":
            inchi = Chem.MolToInchi(mol)
            # Only derive a key from a real InChI; an empty one means the
            # InChI generation itself failed.
            output_string = Chem.InchiToInchiKey(inchi) if inchi else ""
        elif target_format == "mol":
            output_string = Chem.MolToMolBlock(mol)
        else:
            return f"Error: Unsupported output format '{output_format}'. Supported formats: smiles, canonical_smiles, inchi, inchikey, mol."

        # An empty result is a failed conversion, not a successful one.
        if not output_string:
            return f"Error: Could not convert the molecule to {output_format}."

        formula = rdMolDescriptors.CalcMolFormula(mol)
        # "Molecular weight" in g/mol is the average molecular weight
        # (MolWt); ExactMolWt would give the monoisotopic mass instead.
        mol_weight = Descriptors.MolWt(mol)

        # Format output
        markdown = f"""## Chemical Structure Format Conversion

**Input ({input_format}):** `{input_string}`

**Output ({output_format}):** `{output_string}`

### Molecule Information
- **Formula:** {formula}
- **Molecular Weight:** {mol_weight:.4f} g/mol
"""
        return markdown

    except Exception as e:
        # Tool boundary: failures are reported to the caller as text.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="standardize_molecule_rdkit",
          description="Standardize a molecule by normalizing functional groups and charges using RDKit library")
def standardize_molecule_rdkit(smiles: str) -> str:
    """
    Standardize a molecule by stripping counter-ions/solvents and neutralizing charges.

    The following steps are applied, in order:

    1. Fragment removal: only the fragment with the most atoms is kept.
    2. Charge neutralization via RDKit's ``rdMolStandardize.Uncharger``.
       Charges that cannot be removed by adding or removing hydrogens
       (for example quaternary ammonium or nitro groups) are left in place.
    3. Canonical isomeric SMILES generation.

    Tautomer normalization is NOT performed.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
            without any other characters. Dot-separated fragments
            (e.g. salts) are supported.

    Returns:
        A formatted Markdown string with the standardization results, or a
        string starting with ``"Error: "`` if the input cannot be processed.

    Examples:
        >>> standardize_molecule_rdkit("CC(=O)[O-].[Na+]")
        # Returns a report whose standardized SMILES is acetic acid
    """
    try:
        # Imported locally so this function does not depend on a module-level
        # import being present; an ImportError is reported like any other error.
        from rdkit.Chem.MolStandardize import rdMolStandardize

        # _preprocess_smiles deletes every '.', which would fuse separate
        # fragments into one (different) molecule. Clean each fragment on its
        # own and re-join so interior fragment separators survive, while stray
        # leading/trailing periods (empty pieces) are still dropped.
        cleaned_parts = (_preprocess_smiles(part) for part in smiles.split("."))
        smiles = ".".join(part for part in cleaned_parts if part)

        # Validate molecule (raises ValueError on unparsable input)
        mol = _validate_molecule(smiles)

        # Canonical form of the input, reported alongside the result
        original_smiles = Chem.MolToSmiles(mol)

        # 1. Remove fragments (keep the fragment with the most atoms)
        frags = Chem.GetMolFrags(mol, asMols=True)
        if len(frags) > 1:
            mol = max(frags, key=lambda frag: frag.GetNumAtoms())

        # 2. Neutralize charges where chemically possible. The Uncharger
        #    adjusts hydrogen counts on real atoms instead of splicing query
        #    fragments into the molecule.
        mol = rdMolStandardize.Uncharger().uncharge(mol)

        # 3. Canonicalize SMILES
        standardized_smiles = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)

        # Format output (the trailing empty entry yields the final newline)
        report_lines = [
            "## Molecule Standardization",
            "",
            f"**Input SMILES:** `{original_smiles}`",
            f"**Standardized SMILES:** `{standardized_smiles}`",
            "",
            "### Applied Standardization Steps",
            "- Removed fragments (kept largest fragment)",
            "- Neutralized charges (when possible)",
            "- SMILES canonicalization",
            "",
            "### Molecule Information",
            f"- **Formula:** {rdMolDescriptors.CalcMolFormula(mol)}",
            f"- **Molecular Weight:** {Descriptors.ExactMolWt(mol):.4f} g/mol",
            f"- **Formal Charge:** {Chem.GetFormalCharge(mol)}",
            "",
        ]
        return "\n".join(report_lines)

    except Exception as e:
        # Tool boundary: errors are returned as text rather than raised.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="enumerate_stereoisomers_rdkit",
          description="Enumerate possible stereoisomers of a molecule using RDKit library")
def enumerate_stereoisomers_rdkit(smiles: str, max_isomers: int = 10) -> str:
    """
    Enumerate possible stereoisomers of a molecule.

    The function reports the chiral centers found by RDKit, counts how many
    stereocenters and candidate double bonds are left unspecified in the
    input, and generates stereoisomers for the unassigned positions.

    The stereobond count is a heuristic: any non-ring double bond without
    stereo information whose two atoms each have at least one other
    neighbor is counted, so it can over-count when both substituents on one
    end are identical. The "theoretical total" is therefore an upper bound.

    Args:
        smiles: SMILES notation of the chemical compound. Input SMILES directly
            without any other characters.
        max_isomers: Maximum number of isomers to generate. Default: 10.

    Returns:
        A formatted Markdown string with the stereoisomer enumeration results,
        or a string starting with ``"Error: "`` if the input cannot be processed.

    Examples:
        >>> enumerate_stereoisomers_rdkit("CC(O)C=CC")
        # Returns stereoisomers of 3-penten-2-ol
    """
    try:
        # Import the submodule explicitly: it is not guaranteed to be
        # reachable as an attribute of `Chem` unless something else has
        # already imported it.
        from rdkit.Chem.EnumerateStereoisomers import (
            EnumerateStereoisomers,
            StereoEnumerationOptions,
        )

        # Preprocess input
        smiles = _preprocess_smiles(smiles)

        # Validate molecule (raises ValueError on unparsable input)
        mol = _validate_molecule(smiles)

        # Canonical form of the input, reported in the output
        original_smiles = Chem.MolToSmiles(mol)

        # FindMolChiralCenters yields (atom_index, label) pairs where the
        # label is 'R', 'S', or '?' for an unassigned center.
        chiral_centers = Chem.FindMolChiralCenters(mol, includeUnassigned=True)
        unspec_chiral = sum(1 for _, label in chiral_centers if label == "?")

        # Count double bonds that may be stereogenic but carry no stereo
        # annotation: acyclic, and both ends have another neighbor.
        unspec_bonds = 0
        for bond in mol.GetBonds():
            if bond.GetBondType() != Chem.BondType.DOUBLE:
                continue
            if bond.GetStereo() != Chem.BondStereo.STEREONONE:
                continue
            if bond.IsInRing():
                continue
            if bond.GetBeginAtom().GetDegree() > 1 and bond.GetEndAtom().GetDegree() > 1:
                unspec_bonds += 1

        # Upper bound on the number of stereoisomers
        total_possible = 2 ** (unspec_chiral + unspec_bonds)

        # Enumerate stereoisomers for the unassigned positions only
        opts = StereoEnumerationOptions(
            tryEmbedding=True,
            unique=True,
            maxIsomers=max_isomers,
            onlyUnassigned=True,
        )
        isomer_smiles = [
            Chem.MolToSmiles(iso, isomericSmiles=True)
            for iso in EnumerateStereoisomers(mol, options=opts)
        ]

        # Format output (the trailing empty entry yields the final newline)
        report_lines = [
            "## Stereoisomer Enumeration",
            "",
            f"**Input SMILES:** `{original_smiles}`",
            "",
            "### Stereochemistry Analysis",
            f"- **Number of Chiral Centers:** {len(chiral_centers)}",
            f"- **Unspecified Chiral Centers:** {unspec_chiral}",
            f"- **Unspecified Stereobonds:** {unspec_bonds}",
            f"- **Theoretical Total Possible Stereoisomers:** {total_possible}",
            "",
            f"### Generated Stereoisomers (Maximum {max_isomers})",
            "",
        ]
        markdown = "\n".join(report_lines)
        markdown += "".join(
            f"{i}. `{smi}`\n" for i, smi in enumerate(isomer_smiles, 1)
        )

        if len(isomer_smiles) < total_possible:
            markdown += f"\n**Note:** Only showing {len(isomer_smiles)} stereoisomers out of {total_possible} possible isomers."

        return markdown

    except Exception as e:
        # Tool boundary: errors are returned as text rather than raised.
        return f"Error: {e}"
|
||
|
||
|
||
@llm_tool(name="perform_substructure_search_rdkit",
          description="Search for a substructure pattern in a molecule using RDKit library")
def perform_substructure_search_rdkit(smiles: str, pattern: str) -> str:
    """
    Search for a substructure pattern in a molecule.

    The pattern is parsed as SMARTS first and, only if that fails, as SMILES.
    Every match is reported as the tuple of atom indices (in the target
    molecule) that the pattern atoms map onto.

    Args:
        smiles: SMILES notation of the chemical compound to search in.
        pattern: SMARTS or SMILES pattern to search for.

    Returns:
        A formatted Markdown string with the number of matches and, when there
        are any, the matched atom indices; or a string starting with
        ``"Error: "`` if the molecule or the pattern cannot be parsed.

    Examples:
        >>> perform_substructure_search_rdkit("CC(=O)OC1=CC=CC=C1C(=O)O", "C(=O)O")
        # Returns matches of the C(=O)O pattern in Aspirin
    """
    try:
        # Clean and parse the target molecule (ValueError on invalid SMILES)
        target_smiles = _preprocess_smiles(smiles)
        target = _validate_molecule(target_smiles)

        # Try each parser in order of preference; stop at the first success
        query = None
        for parse in (Chem.MolFromSmarts, Chem.MolFromSmiles):
            query = parse(pattern)
            if query is not None:
                break
        else:
            return f"Error: Could not parse pattern '{pattern}' as SMARTS or SMILES."

        hits = target.GetSubstructMatches(query)

        # Assemble the report piece by piece, then join once
        pieces = [
            "## Substructure Search\n",
            "\n",
            f"**Target Molecule SMILES:** `{target_smiles}`\n",
            f"**Search Pattern:** `{pattern}`\n",
            "\n",
            "### Search Results\n",
            f"- **Number of Matches Found:** {len(hits)}\n",
        ]

        if len(hits) > 0:
            pieces.append("\n### Matched Atom Indices\n")
            for number, atom_ids in enumerate(hits, 1):
                joined_ids = ", ".join(map(str, atom_ids))
                pieces.append(f"{number}. Atom indices: {joined_ids}\n")

        return "".join(pieces)

    except Exception as e:
        # Tool boundary: errors are returned as text rather than raised.
        return f"Error: {e}"
|