Source code for meeko.chemtempgen

import gemmi
import json
import pathlib
import copy
import urllib.request
import time
import tempfile
import re
import atexit
import os

from meeko.utils.rdkitutils import covalent_radius

from rdkit import Chem
from rdkit.Chem import rdmolops

from rdkit import RDLogger
import sys, logging

logger = logging.getLogger(__name__)


# Constants
list_of_AD_elements_as_AtomicNum = list(covalent_radius.keys())
metal_AtomicNums = {12, 20, 25, 26, 30}  # Mg: 12, Ca: 20, Mn: 25, Fe: 26, Zn: 30

# Utility Functions

[docs]
def mol_contains_unexpected_element(mol: Chem.Mol, allowed_elements: list[str] = list_of_AD_elements_as_AtomicNum) -> bool:
    """
    Check if mol contains unexpected elements
    
    Parameters
    ---------
    mol : Chem.Mol
        Input molecule
    allowed_elements : list[str]
        List of allowed elements (atomic numbers)
    
    Returns
    -------
    bool
        True if mol contains unexpected elements, False otherwise
    """
    for atom in mol.GetAtoms():
        if atom.GetAtomicNum() not in allowed_elements:
            return True
    return False




[docs]
def get_atom_idx_by_names(mol: Chem.Mol, wanted_names: set[str] = set()) -> set[int]:
    """
    Get atom idx by atom names
    
    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
    wanted_names : set[str]
        Set of atom names to search for
    
    Returns
    -------
    set[int]
        Set of atom indices that match the wanted names
    """
    if not wanted_names:
        return set()
    
    wanted_atoms_by_names = {atom for atom in mol.GetAtoms() if atom.GetProp('atom_id') in wanted_names}
    names_not_found = wanted_names.difference({atom.GetProp('atom_id') for atom in wanted_atoms_by_names})
    if names_not_found: 
        logger.warning(f"Molecule doesn't contain the requested atoms with names: {names_not_found} -> continue with found names... ")
    return {atom.GetIdx() for atom in wanted_atoms_by_names}




[docs]
def get_atom_idx_by_patterns(mol: Chem.Mol, allowed_smarts: str, 
                             wanted_smarts_loc: dict[str, set[int]] = None,
                             allow_multiple: bool=False) -> set[int]:
    """
    Get atom idx by patterns
    
    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
    allowed_smarts : str
        Allowed SMARTS pattern
    wanted_smarts_loc : dict[str, set[int]]
        Dictionary mapping SMARTS patterns to sets of atom indices
    allow_multiple : bool
        Flag to allow multiple matches
    
    Returns
    -------
    set[int]
        Set of atom indices that match the wanted patterns
    """
    if wanted_smarts_loc is None:
        return set()
    
    wanted_atoms_idx = set()
    
    tmol = Chem.MolFromSmarts(allowed_smarts)
    match_allowed = mol.GetSubstructMatches(tmol)
    if not match_allowed:
        logger.warning(f"Molecule doesn't contain allowed_smarts: {allowed_smarts} -> no pattern-based action will be made. ")
        return set()
    if len(match_allowed) > 1 and not allow_multiple: 
        logger.warning(f"Molecule contain multiple copies of allowed_smarts: {allowed_smarts} -> no pattern-based action will be made. ")
        return set()
    if len(match_allowed) > 1 and allow_multiple:
        match_allowed = {item for sublist in match_allowed for item in sublist}
    else:
        match_allowed = match_allowed[0]
    
    atoms_in_mol = [atom for atom in mol.GetAtoms()]
    for wanted_smarts in wanted_smarts_loc: 
        lmol = Chem.MolFromSmarts(wanted_smarts)
        match_wanted = mol.GetSubstructMatches(lmol)
        if not match_wanted:
            logger.warning(f"Molecule doesn't contain wanted_smarts: {wanted_smarts} -> continue with next pattern... ")
            continue
        for match_copy in match_wanted:
            match_in_copy = (idx for idx in match_copy if match_copy.index(idx) in wanted_smarts_loc[wanted_smarts])
            match_wanted_atoms = {atoms_in_mol[idx] for idx in match_in_copy if idx in match_allowed}
            if match_wanted_atoms: 
                wanted_atoms_idx.update(atom.GetIdx() for atom in match_wanted_atoms)
    
    return wanted_atoms_idx



# Mol Editing Functions

[docs]
def embed(mol: Chem.Mol, allowed_smarts: str, 
          leaving_names: set[str] = None, leaving_smarts_loc: dict[str, set[int]] = None, 
          alsoHs: bool = True) -> Chem.Mol:
    """
    Remove atoms from the molecule based the union of
    (a) leaving_names: list of atom IDs (names), and
    (b) leaving_smarts_loc: dict to map substructure SMARTS patterns with 
    tuple of 0-based indicies for atoms to delete (restricted by allowed_smarts)

    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
    allowed_smarts : str
        Allowed SMARTS pattern
    leaving_names : set[str]
        Set of atom names to remove
    leaving_smarts_loc : dict[str, set[int]]
        Dictionary mapping SMARTS patterns to sets of atom indices
    alsoHs : bool
        Flag to also remove H atoms bonded to the leaving atoms
    
    Returns
    -------
    Chem.Mol
        Modified molecule with specified atoms removed
    """
    if leaving_names is None and leaving_smarts_loc is None:
        return mol

    leaving_atoms_idx = set()

    if leaving_names:
        leaving_atoms_idx.update(get_atom_idx_by_names(mol, leaving_names))

    if leaving_smarts_loc:
        leaving_atoms_idx.update(get_atom_idx_by_patterns(mol, allowed_smarts, leaving_smarts_loc))

    if leaving_atoms_idx and alsoHs:
        atoms_in_mol = [atom for atom in mol.GetAtoms()]
        leaving_Hs = (ne for atom_idx in leaving_atoms_idx.copy() for ne in atoms_in_mol[atom_idx].GetNeighbors() if ne.GetAtomicNum() == 1)
        leaving_atoms_idx.update(atom.GetIdx() for atom in leaving_Hs)

    if not leaving_atoms_idx:
        logger.warning(f"No matched atoms to delete -> embed returning original mol...")
        return mol
    
    rwmol = Chem.RWMol(mol)
    for atom_idx in sorted(leaving_atoms_idx, reverse=True): 
        rwmol.RemoveAtom(atom_idx)
    rwmol.UpdatePropertyCache()
    return rwmol.GetMol()




[docs]
def cap(mol: Chem.Mol, allowed_smarts: str, 
        capping_names: set[str] = None, capping_smarts_loc: dict[str, set[int]] = None, 
        protonate: bool = False) -> Chem.Mol:
    """
    Add hydrogens to atoms with implicit hydrogens based on the union of
    (a) capping_names: list of atom IDs (names), and
    (b) capping_smarts_loc: dict to map substructure SMARTS patterns with 
    tuple of 0-based indicies for atoms.
    
    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
    allowed_smarts : str
        Allowed SMARTS pattern
    capping_names : set[str]
        Set of atom names to cap
    capping_smarts_loc : dict[str, set[int]]
        Dictionary mapping SMARTS patterns to sets of atom indices
    protonate : bool
        Flag to add a proton to the atom's formal charge
    
    Returns
    -------
    Chem.Mol
        Modified molecule with specified atoms capped
    """
    if capping_names is None and capping_smarts_loc is None:
        return mol
    
    capping_atoms_idx = set()
    
    if capping_names:
        capping_atoms_idx.update(get_atom_idx_by_names(mol, capping_names))

    if capping_smarts_loc:
        capping_atoms_idx.update(get_atom_idx_by_patterns(mol, allowed_smarts, capping_smarts_loc, allow_multiple = True))

    if not capping_atoms_idx:
        logger.warning(f"No matched atoms to cap -> cap returning original mol...")
        return mol
    
    def get_max_Hid(mol: Chem.Mol) -> int:
        all_Hids = (atom.GetProp('atom_id') for atom in mol.GetAtoms() if atom.GetAtomicNum()==1)
        regular_ids = {Hid for Hid in all_Hids if Hid[0]=='H' and Hid[1:].isdigit()}
        if len(regular_ids) > 0:
            return max(int(x[1:]) for x in regular_ids)
        else:
            return 0
    
    rwmol = Chem.RWMol(mol)
    new_Hid = get_max_Hid(mol) + 1
    atoms_in_mol = [atom for atom in mol.GetAtoms()]
    for atom_idx in capping_atoms_idx:
        parent_atom  = atoms_in_mol[atom_idx]
        if protonate: 
            parent_atom.SetFormalCharge(parent_atom.GetFormalCharge() + 1)
        needed_Hs = parent_atom.GetNumImplicitHs()
        if needed_Hs == 0:
            logger.warning(f"Atom # {atom_idx} ({parent_atom.GetProp('atom_id')}) in mol doesn't have implicit Hs -> continue with next atom... ")
        else:
            new_atom = Chem.Atom("H")
            new_atom.SetProp('atom_id', f"H{new_Hid}")
            new_Hid += 1
            new_idx = rwmol.AddAtom(new_atom)
            rwmol.AddBond(atom_idx, new_idx, Chem.BondType.SINGLE)
    rwmol.UpdatePropertyCache()
    return rwmol.GetMol()




[docs]
def deprotonate(mol: Chem.Mol, acidic_proton_loc: dict[str, int] = None) -> Chem.Mol:
    """
    Remove acidic protons from the molecule based on acidic_proton_loc
    
    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
    acidic_proton_loc : dict[str, int]
        Dictionary mapping SMARTS patterns to indices of acidic protons to remove
        keys: smarts pattern of a fragment
        value: the index (order in smarts) of the leaving proton
    
    Returns
    -------
    Chem.Mol
        Modified molecule with specified protons removed
    """
    if acidic_proton_loc is None:
        return mol

    # deprotonate all matched protons
    acidic_protons_idx = set()
    for smarts_pattern, idx in acidic_proton_loc.items():
        qmol = Chem.MolFromSmarts(smarts_pattern)
        acidic_protons_idx.update(match[idx] for match in mol.GetSubstructMatches(qmol))
    
    if not acidic_protons_idx:
        logger.warning(f"Molecule doesn't contain matching atoms for acidic_proton_loc:" + 
                        f"{acidic_proton_loc}" + 
                        f"-> deprotonate returning original mol... ")
        return mol
    
    rwmol = Chem.RWMol(mol)
    for atom_idx in sorted(acidic_protons_idx, reverse=True):
        rwmol.RemoveAtom(atom_idx)
        neighbors = mol.GetAtomWithIdx(atom_idx).GetNeighbors()
        if neighbors:
            neighbor_atom = rwmol.GetAtomWithIdx(neighbors[0].GetIdx())
            neighbor_atom.SetFormalCharge(neighbor_atom.GetFormalCharge() - 1)
    
    rwmol.UpdatePropertyCache()
    return rwmol.GetMol()




[docs]
def recharge(rwmol: Chem.RWMol) -> Chem.RWMol: 
    """
    Recharges the molecule by adjusting the formal charges of metal complexes and their coordinated atoms

    Parameters
    ----------
    rwmol : Chem.RWMol
        Input molecule
    
    Returns
    -------
    Chem.RWMol
        Modified molecule with adjusted formal charges
    """
    # Recharging metal complexes
    metal_atoms = set(atom for atom in rwmol.GetAtoms() if atom.GetAtomicNum() in metal_AtomicNums)
    coord_valence = {v: k for k, v_set in {
            1: {1, 9, 17, 35, 53}, # monovalent: H, halogens
            2: {8, 16}, # divalent: O, S
            3: {5, 7, 15}, # trivalent: B, N, P (phosphine)
            4: {6, 14}, # tetravalent: C, Si
            }.items() for v in v_set}
    bond_order_mapping = {
        Chem.BondType.SINGLE: 1,
        Chem.BondType.DOUBLE: 2,
        Chem.BondType.TRIPLE: 3,
        Chem.BondType.AROMATIC: 1.5  
    }

    if not metal_atoms: 
        return rwmol
    else:
        # Method a: If charge is ignored at metal center
        # -> charge up nonmetal coordinated atoms 
        # (metal ox state will be interpreted from valence of nonmetal coordinated atom)
        metal_to_nonmetal_neighbors = {metal: {atom for atom in metal.GetNeighbors() 
                                               if atom.GetAtomicNum() not in metal_AtomicNums} for metal in metal_atoms}
        nonmetal_coord_atoms = set(atom for v_set in metal_to_nonmetal_neighbors.values() for atom in v_set)

        if any(atom.GetFormalCharge() == 0 for atom in metal_atoms): 
            logger.warning(f"Molecule contains metal with unspecified charge state -> charging nonmetal coordinated atoms...")
            logger.warning(f"All metals will be neutralized and the nonmetal coordinated atoms will be charged according to their explicit valence. ")
            for atom in metal_atoms: 
                atom.SetFormalCharge(0)
            for atom in nonmetal_coord_atoms: 
                bond_sum = sum(bond_order_mapping.get(bond.GetBondType(), 0) for bond in atom.GetBonds())
                atom.SetFormalCharge(bond_sum - coord_valence[atom.GetAtomicNum()])
  
        # Method b: If charge (ox) state is specified for all metal centers
        # -> ionize (break bonds w/) and charge down nonmetal coordinated atoms
        # (metal ox state will be from input charge state)
        else: 
            logger.warning(f"Molecule contains metal specified charge state -> ionizing metal-nonmetal coordinate bonds...")
            logger.warning(f"All metal-nonmetal coordinate bonds will be polarized. ")
            metal_bonds = {bond.GetIdx(): (bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) 
                               for metal in metal_atoms for bond in metal.GetBonds()}

            for bond_idx in sorted(metal_bonds, reverse=True):
                rwmol.RemoveBond(metal_bonds[bond_idx][0], metal_bonds[bond_idx][1])

            for atom in nonmetal_coord_atoms: 
                bond_sum = sum(bond_order_mapping.get(bond.GetBondType(), 0) for bond in atom.GetBonds())
                atom.SetFormalCharge(bond_sum - coord_valence[atom.GetAtomicNum()])

            # to avoid fragmentation, rebuild all metal-nonmetal bonds
            for metal in metal_atoms: 
                for nei in metal_to_nonmetal_neighbors[metal]:
                    rwmol.AddBond(metal.GetIdx(), nei.GetIdx(), Chem.BondType.SINGLE)
                    metal.SetFormalCharge(metal.GetFormalCharge() - 1)
                    nei.SetFormalCharge(nei.GetFormalCharge() + 1)

        logger.warning(f"Total charge of the molecule after recharging: {sum(atom.GetFormalCharge() for atom in rwmol.GetAtoms())}")

    return rwmol



# Attribute Formatters

[docs]
def get_smiles_with_atom_names(mol: Chem.Mol) -> tuple[str, list[str]]:
    """
    Generate SMILES with atom names in the order of SMILES output
    
    Parameters
    ----------
    mol : Chem.Mol
        Input molecule
        
    Returns
    -------
    tuple[str, list[str]]
        Tuple containing the SMILES string and a list of atom names in the order of SMILES output
    """
    # allHsExplicit may expose the implicit Hs of linker atoms to Smiles; the implicit Hs don't have names
    smiles_exh = Chem.MolToSmiles(mol, allHsExplicit=True)

    smiles_atom_output_order = mol.GetProp('_smilesAtomOutputOrder')
    delimiters = ('[', ']', ',')
    for delimiter in delimiters:
        smiles_atom_output_order = smiles_atom_output_order.replace(delimiter, ' ')
    smiles_output_order = (int(x) for x in smiles_atom_output_order.split())

    atom_id_dict = {atom.GetIdx(): atom.GetProp('atom_id') for atom in mol.GetAtoms()}
    atom_name = [atom_id_dict[atom_i] for atom_i in smiles_output_order]

    return smiles_exh, atom_name




[docs]
def get_pretty_smiles(smi: str) -> str: 
    """
    Convert Smiles with allHsExplicit to pretty Smiles to be put on chem templates
    
    Parameters
    ----------
    smi : str
        Input SMILES string
        
    Returns
    -------
    str
        Pretty SMILES string with implicit Hs removed
    """
    # collect the inside square brackets contents
    contents = set(re.findall(r'\[([^\]]+)\]', smi))

    def is_nonmetal_element(symbol: str) -> bool:
        """Check if a string represents a valid chemical element."""
        try:
            return Chem.GetPeriodicTable().GetAtomicNumber(symbol) not in metal_AtomicNums
        # rdkit throws RuntimeError if invalid
        except RuntimeError:
            return False

    for content in contents:
        # keep [H] for explicit Hs
        if content == 'H': 
            continue
        # drop H in the content to hide implicit Hs
        H_stripped = content.split('H')[0]
        # drop [ ] if the content is an uncharged nonmetal element symbol 
        if is_nonmetal_element(content) or is_nonmetal_element(H_stripped):
            smi = smi.replace(f"[{content}]", f"{H_stripped}" if 'H' in content else f"{content}")
    return smi



[docs]
class ChemicalComponent_LoggingControler:
    """Context manager to control logging level for RDKit and Meeko during the creation of chemical components."""

    def __init__(self):
        """
        Initialize the logging controller.
        
        Sets the original logging level for RDKit and Meeko, and prepares a stream handler.
        """
        self.original_level = logger.level
        self.rdkit_logger = RDLogger.logger()
        self.default_rdkit_level = RDLogger.WARNING
        self.handler = None

    def __enter__(self):
        """Enter the context manager, setting the logging levels."""
        self.rdkit_logger.setLevel(RDLogger.CRITICAL)
        logger.setLevel(logging.WARNING)
        self.handler = logging.StreamHandler(sys.stdout)
        logger.addHandler(self.handler)
        return self
    
    def __exit__(self, exc_type, exc_value, traceback):
        """Exit the context manager, restoring the original logging levels."""
        self.rdkit_logger.setLevel(self.default_rdkit_level)
        logger.setLevel(self.original_level)
        logger.removeHandler(self.handler)




[docs]
class ChemicalComponent:
    """
    Class representing a chemical component with its properties and methods for manipulation.
    
    Attributes
    ----------
    rdkit_mol : Chem.Mol
        The RDKit molecule object representing the chemical component.
    resname : str
        The residue name of the chemical component.
    parent : str
        The name of its parent chemical component (default is the same as resname).
    smiles_exh : str
        The SMILES representation of the chemical component with explicit hydrogens.
    atom_name : list[str]
        A list of atom names in the order of SMILES output.
    link_labels : dict
        A dictionary mapping atom indices to link labels (default is empty).
    """
    
    def __init__(self, rdkit_mol: Chem.Mol, resname: str, smiles_exh: str, atom_name: list[str]):
        """
        Initialize a ChemicalComponent instance.
        
        Parameters
        ----------
        rdkit_mol : Chem.Mol
            The RDKit molecule object representing the chemical component.
        resname : str
            The residue name of the chemical component.
        smiles_exh : str
            The SMILES representation of the chemical component with explicit hydrogens.
        atom_name : list[str]
            A list of atom names in the order of SMILES output.
        """
        self.rdkit_mol = rdkit_mol
        self.resname = resname
        self.parent = resname # default parent to itself
        self.smiles_exh = smiles_exh
        self.atom_name = atom_name
        self.link_labels = {} # default to empty dict (free molecular form)

    def __eq__(self, other):
        """Check equality of two ChemicalComponent instances based on their SMILES and atom names."""
        if isinstance(other, ChemicalComponent):
            return self.smiles_exh == other.smiles_exh and self.atom_name == other.atom_name
        return False


[docs]
    @classmethod
    def from_cif(cls, source_cif: str, resname: str):
        """
        Create ChemicalComponent from a chemical component dict file and a searchable residue name in file.
        
        Parameters
        ----------
        source_cif : str
            Path to the CIF file containing the chemical component data.
        resname : str
            The residue name to search for in the CIF file.
        
        Returns
        -------
        ChemicalComponent
            An instance of the ChemicalComponent class.
        """
        # Locate block by resname
        doc = gemmi.cif.read(source_cif)
        block = doc.find_block(resname)
        
        # Populate atom table
        atom_category = '_chem_comp_atom.'
        atom_attributes = ['atom_id', # atom names
                           'type_symbol', # atom elements
                           'charge', # (atomic) formal charges
                           'pdbx_leaving_atom_flag', # flags for leaving atoms after polymerization
                           ]
        atom_table = block.find(atom_category, atom_attributes)
        atom_cols = {attr: atom_table.find_column(f"{atom_category}{attr}") for attr in atom_attributes}

        # Summon rdkit atoms into empty RWMol
        rwmol = Chem.RWMol()
        atom_elements = atom_cols['type_symbol']

        for idx, element in enumerate(atom_elements):
            if len(element)==2:
                element = element[0] + element[1].lower()
            rdkit_atom = Chem.Atom(element)
            for attr in atom_attributes:
                rdkit_atom.SetProp(attr, atom_cols[attr][idx])
                # strip double quotes in names
                raw_name = rdkit_atom.GetProp('atom_id')
                rdkit_atom.SetProp('atom_id', raw_name.strip('"'))
            target_charge = atom_cols['charge'][idx]
            if target_charge!='0':
                rdkit_atom.SetFormalCharge(int(target_charge)) # this needs to be int for rdkit
            rwmol.AddAtom(rdkit_atom)

        # Check if rwmol contains unexpected elements
        if mol_contains_unexpected_element(rwmol):
            logger.warning(f"Molecule contains unexpected elements -> template for {resname} will be None. ")
            return None

        # Map atom_id (atom names) with rdkit idx
        name_to_idx_mapping = {atom.GetProp('atom_id'): idx for (idx, atom) in enumerate(rwmol.GetAtoms())}

        # Populate bond table
        bond_category = '_chem_comp_bond.'
        bond_attributes = ['atom_id_1', # atom name 1
                           'atom_id_2', # atom name 2
                           'value_order', # bond order
                           ]
        bond_table = block.find(bond_category, bond_attributes)
        
        # If bond table is empty (single-atom residues), skip bond creation
        if bond_table: 
            bond_cols = {attr: bond_table.find_column(f"{bond_category}{attr}") for attr in bond_attributes}

            # Connect atoms by bonds
            bond_type_mapping = {
                'SING': Chem.BondType.SINGLE,
                'DOUB': Chem.BondType.DOUBLE,
                'TRIP': Chem.BondType.TRIPLE,
                'AROM': Chem.BondType.AROMATIC
            }
            bond_types = bond_cols['value_order']

            for bond_i, bond_type in enumerate(bond_types):
                rwmol.AddBond(name_to_idx_mapping[bond_cols['atom_id_1'][bond_i].strip('"')], 
                            name_to_idx_mapping[bond_cols['atom_id_2'][bond_i].strip('"')], 
                            bond_type_mapping.get(bond_type, Chem.BondType.UNSPECIFIED))

            # Try recharging mol (for metals)
            try:
                rwmol = recharge(rwmol)
            except Exception as e:
                logger.error(f"Failed to recharge rdkitmol. Error: {e} -> template for {resname} will be None. ")
                return None

        # Finish eidting mol 
        try:    
            rwmol.UpdatePropertyCache()
        except Exception as e:
            logger.error(f"Failed to create rdkitmol from cif. Error: {e} -> template for {resname} will be None. ")
            return None
        
        # Check implicit Hs
        total_implicit_hydrogens = sum(atom.GetNumImplicitHs() for atom in rwmol.GetAtoms())
        if total_implicit_hydrogens > 0:
            logger.error(f"rdkitmol from cif has implicit hydrogens. -> template for {resname} will be None. ")
            return None

        rdkit_mol = rwmol.GetMol()
            
        # Get Smiles with explicit Hs and ordered atom names
        smiles_exh, atom_name = get_smiles_with_atom_names(rdkit_mol)
        
        return cls(rdkit_mol, resname, smiles_exh, atom_name)




[docs]
    def make_canonical(self, acidic_proton_loc):
        """
        Deprotonate acidic groups til the canonical (most deprotonated) state.
        
        Parameters
        ----------
        acidic_proton_loc : dict[str, int]
            Dictionary mapping SMARTS patterns to indices of acidic protons to remove
            keys: smarts pattern of a fragment
            value: the index (order in smarts) of the leaving proton
        
        Returns
        -------
        ChemicalComponent
            The modified ChemicalComponent instance with the canonicalized molecule.
        """
        self.rdkit_mol = deprotonate(self.rdkit_mol, acidic_proton_loc = acidic_proton_loc)
        return self



[docs]
    def make_embedded(self, allowed_smarts, leaving_names = None, leaving_smarts_loc = None):
        """
        Remove leaving atoms from the molecule by atom names and/or patterns.
        
        Parameters
        ----------
        allowed_smarts : str
            Allowed SMARTS pattern
        leaving_names : set[str]
            Set of atom names to remove
        leaving_smarts_loc : dict[str, set[int]]
            Dictionary mapping SMARTS patterns to sets of atom indices
        
        Returns
        -------
        ChemicalComponent
            The modified ChemicalComponent instance with the embedded molecule.
        """
        self.rdkit_mol = embed(self.rdkit_mol, allowed_smarts = allowed_smarts, 
                               leaving_names = leaving_names, leaving_smarts_loc = leaving_smarts_loc)
        return self

        

[docs]
    def make_capped(self, allowed_smarts, capping_names = None, capping_smarts_loc = None, protonate = None):
        """
        Build and name explicit hydrogens for atoms with implicit Hs by atom names and/or patterns.
        
        Parameters
        ----------
        allowed_smarts : str
            Allowed SMARTS pattern
        capping_names : set[str]
            Set of atom names to cap
        capping_smarts_loc : dict[str, set[int]]
            Dictionary mapping SMARTS patterns to sets of atom indices
        protonate : bool
            Flag to add a proton to the atom's formal charge

        Returns
        -------
        ChemicalComponent
            The modified ChemicalComponent instance with the capped molecule.
        """
        self.rdkit_mol = cap(self.rdkit_mol, allowed_smarts = allowed_smarts, 
                             capping_names = capping_names, capping_smarts_loc = capping_smarts_loc,
                             protonate = protonate)
        return self

        

[docs]
    def make_pretty_smiles(self):
        """Build and name explicit hydrogens for atoms with implicit Hs by atom names and/or patterns."""
        self.smiles_exh, self.atom_name = get_smiles_with_atom_names(self.rdkit_mol)
        self.smiles_exh = get_pretty_smiles(self.smiles_exh)
        return self



[docs]
    def make_link_labels_from_patterns(self, pattern_to_label_mapping = {}):
        """
        Map patterns to link labels based on a given mapping.
        
        Parameters
        ----------
        pattern_to_label_mapping : dict[str, str]
            Dictionary mapping patterns to link labels
            keys: pattern to search for in the molecule
            values: label to assign to the atom matching the pattern
        """
        if not pattern_to_label_mapping:
            return self

        for pattern in pattern_to_label_mapping:
            atom_idx = get_atom_idx_by_patterns(self.rdkit_mol, allowed_smarts = Chem.MolToSmarts(self.rdkit_mol), 
                                                wanted_smarts_loc = {pattern: {0}})
            atoms_in_mol = [atom for atom in self.rdkit_mol.GetAtoms()]
            if not atom_idx:
                logger.warning(f"Molecule doesn't contain pattern: {pattern} -> linker label for {pattern_to_label_mapping[pattern]} will not be made. ")
            elif len(atom_idx) > 1:
                logger.warning(f"Molecule contain multiple copies of pattern: {pattern} -> linker label for {pattern_to_label_mapping[pattern]} will not be made. ")
            else:
                atom_idx = next(iter(atom_idx))
                name = atoms_in_mol[atom_idx].GetProp('atom_id')
                self.link_labels.update({str(self.atom_name.index(name)): pattern_to_label_mapping[pattern]})

        return self

    

[docs]
    def make_link_labels_from_names(self, name_to_label_mapping = {}):
        """
        Map atom names to link labels based on a given mapping.

        Parameters
        ----------
        name_to_label_mapping : dict[str, str]
            Dictionary mapping atom names to link labels
            keys: atom names to search for in the molecule
            values: label to assign to the atom matching the name

        Returns
        -------
        ChemicalComponent
            The modified ChemicalComponent instance with updated link labels.
        """
        if not name_to_label_mapping:
            return self

        for atom in self.rdkit_mol.GetAtoms():
            if atom.GetProp('atom_id') in name_to_label_mapping:
                if atom.GetNumImplicitHs() > 0:
                    name = atom.GetProp('atom_id')
                    self.link_labels.update({str(self.atom_name.index(name)): name_to_label_mapping[name]})

        return self

    

[docs]
    def ResidueTemplate_check(self) -> bool:
        """
        Check if the chemical component is a valid residue template.

        This is the same ResidueTemplate.check from meeko.polymer
        """ 
        ps = Chem.SmilesParserParams()
        ps.removeHs = False
        mol = Chem.MolFromSmiles(self.smiles_exh, ps)
        have_implicit_hs = {atom.GetIdx() for atom in mol.GetAtoms() if atom.GetTotalNumHs() > 0}
        
        if self.link_labels: 
            int_labels = {int(atom_id) for atom_id in self.link_labels}
        else:
            int_labels = set()

        if int_labels != have_implicit_hs:
            raise ValueError(
                f"expected any atom with non-real Hs ({have_implicit_hs}) to tagged in link_labels ({int_labels})"
            )
        
        if not self.atom_name: 
            return
        
        if len(self.atom_name) != mol.GetNumAtoms():
            raise ValueError(f"{len(self.atom_name)=} differs from {mol.GetNumAtoms()=}")
        return




# Export/Writer Function

[docs]
def export_chem_templates_to_json(cc_list: list[ChemicalComponent], json_fname: str=""):
    """
    Export list of chem templates to json
    
    Parameters
    ----------
    cc_list : list[ChemicalComponent]
        List of ChemicalComponent instances to export
    json_fname : str
        Filename for the output JSON file (default is empty string, which prints to console)
        
    Returns
    -------
    str
        JSON string representation of the chemical templates
    """
    basenames = [cc.parent for cc in cc_list]
    ambiguous_dict = {basename: [] for basename in basenames}
    for cc in cc_list:
        if cc.resname not in ambiguous_dict[cc.parent]:
            ambiguous_dict[cc.parent].append(cc.resname)

    data_to_export = {"ambiguous": {basename: basename+".ambiguous" for basename in basenames}}

    residue_templates = {}
    for cc in cc_list:
        residue_templates[cc.resname] = {
            "smiles": cc.smiles_exh,
            "atom_name": cc.resname+".atom_names",
        }
        if cc.link_labels:
            residue_templates[cc.resname]["link_labels"] = cc.resname+".link_labels"
        else:
            residue_templates[cc.resname]["link_labels"] = {}

    data_to_export.update({"residue_templates": residue_templates})

    json_str = json.dumps(data_to_export, indent = 4)

    # format ambiguous resnames to one line
    for basename in data_to_export["ambiguous"]:
        single_line_resnames = json.dumps(ambiguous_dict[basename], separators=(', ', ': '))
        json_str = json_str.replace(json.dumps(data_to_export["ambiguous"][basename], indent = 4), single_line_resnames)

    # format link_labels and atom_name to one line
    for cc in cc_list:
        single_line_atom_name = json.dumps(cc.atom_name, separators=(', ', ': '))
        json_str = json_str.replace(json.dumps(data_to_export["residue_templates"][cc.resname]["atom_name"], indent = 4), single_line_atom_name)
        if cc.link_labels:
            single_line_link_labels = json.dumps(cc.link_labels, separators=(', ', ': '))
            json_str = json_str.replace(json.dumps(data_to_export["residue_templates"][cc.resname]["link_labels"], indent = 4), single_line_link_labels)

    if json_fname:
        with open(pathlib.Path(json_fname), 'w') as f:
            f.write(json_str)
        logger.info(f"{json_fname} <-- Json File for New Chemical Templates")
    else:
        logger.info(" New Template Built ".center(60, "*"))
        logger.info(json_str)
        logger.info("*"*60)
        return json_str




[docs]
def fetch_from_pdb(resname: str, max_retries = 5, backoff_factor = 2) -> str: 
    """
    Fetch a CIF file from the RCSB PDB website for a given residue name.

    Parameters
    ----------
    resname : str
        The residue name to fetch from the PDB.
    max_retries : int
        Maximum number of retries for downloading the file (default is 5).
    backoff_factor : int
        Factor to increase the wait time between retries (default is 2).
    
    Returns
    -------
    str
        Path to the downloaded CIF file.

    Raises
    -------
    RuntimeError
        If the ligand is not available from rcsb.org or if the maximum number of retries is reached.
    """
    url = f"https://files.rcsb.org/ligands/download/{resname}.cif"
    file_path = f"{resname}.cif"
    for retry in range(max_retries):
        try:
            with urllib.request.urlopen(url) as response:
                content = response.read()
            logger.info(f"File downloaded successfully: {file_path}")
            with tempfile.NamedTemporaryFile(delete=False, suffix=".cif") as temp_file:
                temp_file.write(content)
            atexit.register(
                lambda: os.remove(temp_file.name) 
                if temp_file.name and os.path.exists(temp_file.name) 
                else None
            )
            return temp_file.name 
            
        except Exception as e:
            if isinstance(e, urllib.error.HTTPError) and e.getcode() == 404:
                raise RuntimeError(f"Ligand {resname} not available from rcsb.org") from e
            elif retry < max_retries - 1: 
                wait_time = backoff_factor ** retry  
                logger.info(f"Download failed for {resname}. Error: {e}. Retrying in {wait_time} seconds...")
                time.sleep(wait_time)
            else:
                err = f"Max retries reached. Could not download CIF file for {resname}. Error: {e}"
                raise RuntimeError(err) from e


# Default chemical groups for deprotonate
acidic_proton_loc_canonical = {
        # any carboxylic acid, sulfuric/sulfonic acid/ester, phosphoric/phosphinic acid/ester
        '[H][O]['+atom+'](=O)': 0 for atom in ('CX3', 'SX4', 'SX3', 'PX4', 'PX3')
    } | {
        # any thio carboxylic/sulfuric acid
        '[H][O]['+atom+'](=S)': 0 for atom in ('CX3', 'SX4')
    } | {
        '[H][SX2][a]': 0, # thiophenol
    }

# Standard pipelines

[docs]
def build_noncovalent_CC(basename: str) -> ChemicalComponent: 
    """
    Build a noncovalent chemical component from a CIF file.

    Parameters
    ----------
    basename : str
        The name of the chemical component to build.
    
    Returns
    -------
    ChemicalComponent
        The constructed ChemicalComponent instance.
    """
    with ChemicalComponent_LoggingControler(): 
        cc_from_cif = ChemicalComponent.from_cif(fetch_from_pdb(basename), basename)
        if cc_from_cif is None:
            return None

        cc = copy.deepcopy(cc_from_cif)
        logger.info(f"*** using CCD ligand {basename} to construct residue {cc.resname} ***")

        cc = cc.make_canonical(acidic_proton_loc = acidic_proton_loc_canonical)
        if len(rdmolops.GetMolFrags(cc.rdkit_mol))>1:
            err = f"Template Generation failed for {cc.resname}. Error: Molecule breaks into fragments during the deleterious editing. "
            raise RuntimeError(err)

        cc = cc.make_pretty_smiles()

        # Check
        try:
            cc.ResidueTemplate_check()
        except Exception as e:
            err = f"Template {cc.resname} Failed to pass ResidueTemplate check. Error: {e}"
            raise RuntimeError(err)
            
        logger.info(f"*** finish making {cc.resname} ***")
    return cc




[docs]
def add_variants(cc_orig: ChemicalComponent, cc_list: list[ChemicalComponent] = [], 
                 embed_allowed_smarts: str = None, 
                 cap_allowed_smarts: str = None, cap_protonate: bool = False, 
                 pattern_to_label_mapping_standard = dict[str, str], 
                 variant_dict = dict[str, tuple]) -> list[ChemicalComponent]:
    """
    Add variants to a chemical component based on the provided variant dictionary.

    Parameters
    ----------
    cc_orig : ChemicalComponent
        The original chemical component to which variants will be added.
    cc_list : list[ChemicalComponent]
        List of existing chemical components to check for redundancy (default is empty list).
    embed_allowed_smarts : str
        Allowed SMARTS pattern for embedding (default is None).
    cap_allowed_smarts : str
        Allowed SMARTS pattern for capping (default is None).
    cap_protonate : bool
        Flag to add a proton to the atom's formal charge (default is False).
    pattern_to_label_mapping_standard : dict[str, str]
        Dictionary mapping patterns to link labels (default is empty dictionary).
    variant_dict : dict[str, tuple]
        Dictionary mapping suffixes to tuples of embedding and capping SMARTS patterns.
        keys: suffixes for the variants
        values: tuples containing the embedding and capping SMARTS patterns (embedding smarts, capping smarts)
    
    Returns
    -------
    list[ChemicalComponent]
        List of ChemicalComponent instances with the added variants.
    """
    for suffix in variant_dict:
        cc = copy.deepcopy(cc_orig)
        cc.resname += suffix
        logger.info(f"*** using CCD residue {cc.parent} to construct {cc.resname} ***")

        cc = (
            cc
            .make_canonical(acidic_proton_loc = acidic_proton_loc_canonical) 
            .make_embedded(allowed_smarts = embed_allowed_smarts, 
                        leaving_smarts_loc = variant_dict[suffix][0])
            )
        if len(rdmolops.GetMolFrags(cc.rdkit_mol))>1:
            logger.warning(f"Molecule breaks into fragments during the deleterious editing of {cc.resname} -> skipping the vaiant... ")
            continue

        cc = (
            cc
            .make_capped(allowed_smarts = cap_allowed_smarts, 
                        capping_smarts_loc = variant_dict[suffix][1],
                        protonate = cap_protonate) 
            .make_pretty_smiles()
            .make_link_labels_from_patterns(pattern_to_label_mapping = pattern_to_label_mapping_standard)
            )

        try:
            cc.ResidueTemplate_check()
        except Exception as e:
            err = f"Template {cc.resname} Failed to pass ResidueTemplate check. Error: {e}"
            logger.error(err)
            continue
            
        # Check redundancy
        if any(cc == other_variant for other_variant in cc_list):
            logger.error(f"Template Failed to pass redundancy check -> skipping the template... ")
            continue

        cc_list.append(cc)
        logger.info(f"*** finish making {cc.resname} ***")

    return cc_list


# Default recipes

[docs]
class AA_recipe: 
    """Class representing the recipe for amino acids"""

    embed_allowed_smarts = "[NX3]([H])([H])[CX4][CX3](=O)[O]"
    cap_allowed_smarts = "[NX3][CX4][CX3](=O)"
    cap_protonate = True
    pattern_to_label_mapping_standard = {'[NX3h1]': 'N-term', '[CX3h1]': 'C-term'}

    variant_dict = {
            "":  ({"[NX3]([H])([H])[CX4][CX3](=O)[O]": {1, 6}}, None), # embedded amino acid
            "_N": ({"[NX3]([H])([H])[CX4][CX3](=O)[O]": {6}}, {"[NX3][CX4][CX3](=O)": {0}}), # N-term amino acid
            "_C": ({"[NX3]([H])([H])[CX4][CX3](=O)[O]": {1}}, None), # C-term amino acid
        }



[docs]
class NA_recipe: 
    """Class representing the recipe for nucleic acids"""

    embed_allowed_smarts = "[O][PX4](=O)([O])[OX2][CX4][CX4]1[OX2][CX4][CX4][CX4]1[OX2][H]"
    cap_allowed_smarts = "[OX2][CX4][CX4]1[OX2][CX4][CX4][CX4]1[OX2]"
    cap_protonate = False
    pattern_to_label_mapping_standard = {'[PX4h1]': '5-prime', '[O+0X2h1]': '3-prime'}
    variant_dict = {
            "_":  ({"[O][PX4](=O)([O])[OX2][CX4]": {0} ,"[CX4]1[OX2][CX4][CX4][CX4]1[OX2][H]": {6}}, None), # embedded nucleotide 
            "_3": ({"[O][PX4](=O)([O])[OX2][CX4]": {0}}, None), # 3' end nucleotide 
            "_5p": ({"[CX4]1[OX2][CX4][CX4][CX4]1[OX2][H]": {6}}, None), # 5' end nucleotide (extra phosphate than canonical X5)
            "_5": ({"[O][PX4](=O)([O])[OX2][CX4]": {0,1,2,3}, "[CX4]1[OX2][CX4][CX4][CX4]1[OX2][H]": {6}}, {"[OX2][CX4][CX4]1[OX2][CX4][CX4][CX4]1[OX2]": {0}}), # 5' end nucleoside (canonical X5 in Amber)
        }



[docs]
def build_linked_CCs(basename: str, embed_allowed_smarts: str = None, 
                     cap_allowed_smarts: str = None, cap_protonate: bool = False, 
                     pattern_to_label_mapping_standard = dict[str, str], 
                     variant_dict = dict[str, tuple]) -> list[ChemicalComponent]: 
    """
    Build a linked chemical component from a CIF file.

    Parameters
    ----------
    basename : str
        The name of the chemical component to build.
    embed_allowed_smarts : str
        Allowed SMARTS pattern for embedding (default is None).
    cap_allowed_smarts : str
        Allowed SMARTS pattern for capping (default is None).
    cap_protonate : bool
        Flag to add a proton to the atom's formal charge (default is False).
    pattern_to_label_mapping_standard : dict[str, str]
        Dictionary mapping patterns to link labels (default is empty dictionary).
    variant_dict : dict[str, tuple]
        Dictionary mapping suffixes to tuples of embedding and capping SMARTS patterns.
        keys: suffixes for the variants
        values: tuples containing the embedding and capping SMARTS patterns (embedding smarts, capping smarts)
    
    Returns
    -------
    list[ChemicalComponent]
        List of ChemicalComponent instances with the added variants.
    """
    with ChemicalComponent_LoggingControler(): 
        cc_from_cif = ChemicalComponent.from_cif(fetch_from_pdb(basename), basename)
        if cc_from_cif is None:
            return None
        
        # when no specific recipe is provided
        if not embed_allowed_smarts:
            # interpret possible embedding type
            AA = cc_from_cif.rdkit_mol.GetSubstructMatch(Chem.MolFromSmarts(AA_recipe.embed_allowed_smarts))
            NA = cc_from_cif.rdkit_mol.GetSubstructMatch(Chem.MolFromSmarts(NA_recipe.embed_allowed_smarts))

            if not AA and not NA: 
                logger.warning(f"Molecule doesn't contain the standard backbone of nucleotides or amino acids. -> no templates will be made. ")
                return None
            
        else: 
            # check if custom embedding is editable
            editable = cc_from_cif.rdkit_mol.GetSubstructMatch(Chem.MolFromSmarts(embed_allowed_smarts))
            if not editable:
                logger.warning(f"Molecule doesn't contain embed_allowed_smarts: {embed_allowed_smarts} -> no templates will be made. ")
                return None
            
            if not any(item is None for item in (cap_allowed_smarts, pattern_to_label_mapping_standard, variant_dict)):
                return add_variants(cc_orig = cc_from_cif, cc_list = [], 
                            embed_allowed_smarts = embed_allowed_smarts, 
                            cap_allowed_smarts = cap_allowed_smarts, cap_protonate = cap_protonate, 
                            pattern_to_label_mapping_standard = pattern_to_label_mapping_standard, 
                            variant_dict = variant_dict)
        
        cc_variants = []
        if AA: 
            cc_variants = add_variants(cc_orig = cc_from_cif, cc_list = cc_variants, 
                            embed_allowed_smarts = AA_recipe.embed_allowed_smarts, 
                            cap_allowed_smarts = AA_recipe.cap_allowed_smarts, cap_protonate = AA_recipe.cap_protonate, 
                            pattern_to_label_mapping_standard = AA_recipe.pattern_to_label_mapping_standard, 
                            variant_dict = AA_recipe.variant_dict)
        if NA:
            cc_variants = add_variants(cc_orig = cc_from_cif, cc_list = cc_variants, 
                embed_allowed_smarts = NA_recipe.embed_allowed_smarts, 
                cap_allowed_smarts = NA_recipe.cap_allowed_smarts, cap_protonate = NA_recipe.cap_protonate, 
                pattern_to_label_mapping_standard = NA_recipe.pattern_to_label_mapping_standard, 
                variant_dict = NA_recipe.variant_dict)
            
    return cc_variants


# Not implemented: 
# XXX read from prepared? enumerate in stepwise? all protonation state variants, alter charge and update smiles/idx