Source code for meeko.covalentbuilder

from collections import namedtuple
import logging
from rdkit import Chem
from rdkit.Chem import AllChem, rdGeometry
import prody
import warnings

logger = logging.getLogger(__name__)

from collections import defaultdict
def nested_dict():
    return defaultdict(nested_dict)



# class CovLigandPrepared(object):

#     """ container class to store the prepared ligand molecule with associated information"""
#     def __init__(self, mol, res_id, at_names, smarts, indices, smarts_indices):
#         self.mol = mol
#         self.res_id = res_id
#         self.at_names = at_names
#         self.smarts = smarts
#         self.indices = indices
#         self.smarts_indices = smarts_indices

#     @property
#     def label(self):
#         """ DOCUMENTATION GOES HERE """
#         pass

#     @property
#     def label(self):
#         """ DOCUMENTATION GOES HERE """
#         pass





CovLigandPrepared = namedtuple("CovalentLigandPrepared", ["mol", "res_id", "at_names", "smarts", "smarts_indices", "indices", "label"])



[docs]
class CovalentBuilder(object):
    """
    Class to perform structural alignments of ligands containing the target side chain attached to run tethered covalent dockings. The class is instantiated for a given target, with a list of one or more residues, then ligands can be processed sequentially ( CovalentBuilder.process() )

    Attributes
    ----------
    rec : ProDy molecule
        ProDy molecule object for the target residue
    residues : dict
        Dictionary with the structure { res_id : (at1_coord, at2_coord), ...}
    """

    def __init__(self, receptor_mol, residue_string):
        """
        Initializes the CovalentBuilder object with a ProDy molecule and a residue string.
        
        Parameters
        ----------
        receptor_mol : ProDy molecule
            ProDy molecule object for the target residue.
        residue_string : str
            String specifying residues to process. Examples:
                ":LYS:"              (all LYS, default atom names: CA,CB)
                "A:LYS:204:CA,CB"
                "A:HIS::ND1,CE1"     (all HIS on chain A)
                (ProDy regular expressions could be also injected here).
        """
        self.rec = receptor_mol
        selection_tuple = self.parse_residue_string(residue_string, force_CA_CB=True)
        # selection tuple: (chain, res, num, atname1, atname2)
        # only 'res' and 'atname[1|2]' are required, the rest is optional,
        # e.g: (None, res, None, 'CA', 'CB')
        out = self._generate_prody_selection(selection_tuple)
        self._compact_selection(out)

    def _generate_prody_selection(self, residue):
        """
        Generate the string to perform a Prody.Selection. 
        
        Parameters
        ----------
        residue : tuple
            Tuple containing the residue information (chain, res, num, atname1, atname2).

        Returns
        -------
        tuple
            Tuple containing the ProDy selection and the atom names (atname1, atname2).

        Raises
        -------
        ValueError
            If no residue is found with the specified selection.
        """
        # sel_string = 'chid %s AND resname %s AND resnum %s AND (name %s or name %s)'
        sel_string = []
        # out = []
        chid, res_type, res_num, atname1, atname2 = residue
        if not chid == "":
            sel_string.append("chid %s" % chid)
        sel_string.append('resname %s' % res_type)
        if not res_num == "":
            sel_string.append("resnum %s" % res_num)
        sel_string.append("(name %s or name %s)" % (atname1, atname2))
        sel_string = " and ".join(sel_string)
        logger.info("CovalentBuilder> searching for residue: %s" % sel_string)
        found = self.rec.select( sel_string )
        if found is None:
            logger.error("ERROR: no residue found with the following specification: chain[%s] residue[%s] number[%s] atom names [%s,%s]"% (
                chid, res_type, res_num, atname1, atname2))
            raise ValueError
        return (found, atname1, atname2)
        # returnsout

    def _compact_selection(self, sel_info, allow_missing=False):
        """
        Process a ProDy selection and return a dictionary with the structure { res_id : (at1_coord, at2_coord), ...}.

        Parameters
        ----------
        sel_info : tuple
            Tuple containing the ProDy selection and the atom names (atname1, atname2).
        allow_missing : bool, optional
            If True, allow missing atoms in the selection. Default is False.

        Raises
        -------
        ValueError
            If allow_missing is False and one or more atoms are missing in the selection.
        RuntimeWarning
            If one or more atoms are missing in the selection and allow_missing is True.    
        """
        self.residues = {}
        # for (sel, at1, at2) in sel_info:
        sel, at1, at2 = sel_info
        pairs = (at1, at2)
        chains = sel.getChids()
        res = sel.getResnames()
        num = sel.getResnums()
        names = sel.getNames()
        coords = sel.getCoords()
        for i in range(len(chains)):
            res_id = (chains[i], res[i], num[i])
            if not res_id in self.residues:
                self.residues[res_id] = [None, None]
            # preserve the order of the coordinates
            # as specified by the atom names order
            idx = pairs.index(names[i])
            self.residues[res_id][idx] = {'coords':coords[i], 'atname':names[i]}
        res_id_list = list(self.residues.keys())
        for res_id in res_id_list:
            if None in self.residues[res_id]:
                c,r,n = res_id
                f = [ x['atname'] if not x is None else "None" for x in  self.residues[res_id]]
                warnings.warn("One or more atoms are missing in residue %s:%s%d (requested: %s,%s | found: %s,%s)" % (c,r,n,at1,at2, f[0], f[1]), RuntimeWarning)
                if not allow_missing:
                    raise ValueError
                del self.residues[res_id]
        # from pprint import pprint as pp
        # pp(self.residues)


[docs]
    def process(self, ligand, smarts=None, smarts_indices=None, indices=None, first_only=False):
        """
        Process the ligand for the residue(s) specified for the current receptor. 

        Parameters
        ----------
        ligand : RDKit molecule
            RDKit molecule object for the ligand to be processed.
        smarts : str, optional
            SMARTS pattern to search for in the ligand. Default is None.
        smarts_indices : tuple, optional
            Tuple containing the indices of the atoms in the SMARTS pattern. Default is None.
        indices : list, optional
            List of tuples containing the indices of the atoms in the ligand. Default is None.
        first_only : bool, optional
            If True, only the first match of the SMARTS pattern will be used. Default is False.
        
        Yields
        -------
        CovLigandPrepared
            A named tuple containing the prepared ligand molecule, residue information, atom names, SMARTS pattern, indices, and label.
        
        Raises
        -------
        ValueError
            If neither smarts nor indices are specified.
        RuntimeWarning
            If the SMARTS pattern doesn't match any atoms in the ligand.
        """
        if (smarts is None) and (indices is None):
            raise ValueError("Specify at least one criterion, either SMARTS pattern or atom indices (2)")
        # if SMARTS are specified, use that to define (or override) indices
        if not smarts is None:
            indices = self.find_smarts(ligand, smarts, smarts_indices, first_only)
        if len(indices) == 0:
            warnings.warn("SMARTS pattern didn't match any atoms", RuntimeWarning)
        #print("CovalentBuilder> Generating %d ouput alignments (%d residues)" % (len(indices)*len(self.residues), len(self.residues) ))
        # perform alignments
        for i, idx_pair in enumerate(indices):
            for res_info, res_coord in self.residues.items():
                if len(indices)>1:
                    counter = "_%d" % (i+1)
                else:
                    counter = ""
                chain, res, num, = res_info
                at_names = [ x['atname'] for x in res_coord ]
                coord = [ x['coords'] for x in res_coord ]
                label = "%s:%s%s%s" % (chain, res, num, counter)
                mol = self.transform(ligand, idx_pair, coord)
                yield CovLigandPrepared(mol, res_info, at_names, smarts, smarts_indices, idx_pair,label)



[docs]
    def find_smarts(self, mol, smarts, smarts_indices, first_only):
        """
        Find occurrences of the SMARTS indices atoms in the requested SMARTS. 
        
        Parameters
        ----------
        mol : RDKit molecule
            RDKit molecule object for the ligand to be processed.
        smarts : str
            SMARTS pattern to search for in the ligand.
        smarts_indices : tuple
            Tuple containing the indices of the atoms in the SMARTS pattern.
        first_only : bool
            If True, only the first match of the SMARTS pattern will be used. Default is False.

        Returns
        -------
        list
            List of tuples containing the indices of the atoms in the ligand.

        Raises
        -------
        ValueError
            If the SMARTS index exceeds the number of atoms in the SMARTS pattern.  
        RuntimeWarning
            If the specified ligand pattern returned more than one match.
        """
        indices = []
        patt = Chem.MolFromSmarts(smarts)
        smarts_size = patt.GetNumAtoms()
        if smarts_indices[0] >= smarts_size or smarts_indices[1] >= smarts_size:
            raise ValueError("SMARTS index exceeds number of atoms in SMARTS (%d)" % (smarts_size))
        found = mol.GetSubstructMatches(patt)
        #print("CovalentBuilder> ligand patterns found: ", found, "[ use only first: %s ]" % first_only)
        if len(found)>1 and first_only:
            warnings.warn("The specified ligand pattern returned more than one match: [%d] (potential ambiguity?)" % len(found), RuntimeWarning)
        for f in found:
            #print("CovalentBuilder> processing:", f, "with ", smarts_indices)
            indices.append([f[x] for x in smarts_indices])
            #print("CovalentBuilder> ligand indices stored:", indices)
            if first_only:
                return indices
        return indices



[docs]
    def transform(self, ligand, index_pair, coord):
        """
        Generate translatead and aligned molecules for each of the indices requested and for all the residues defined in the class constructor. SOURCE: https://sourceforge.net/p/rdkit/mailman/message/36750909/

        Parameters
        ----------
        ligand : RDKit molecule
            RDKit molecule object for the ligand to be processed.
        index_pair : tuple
            Tuple containing the indices of the atoms in the ligand.
        coord : list
            List of tuples containing the coordinates of the atoms in the target residue.
        
        Returns
        -------
        RDKit molecule
            RDKit molecule object for the transformed ligand.
        """
        # make a copy of the ligand
        # TODO: maybe define new conformers?
        mol = Chem.Mol(ligand)
        target = Chem.MolFromSmiles("CC")
        # add hydrogens
        target = Chem.AddHs(target)
        # generate 3D coords
        AllChem.EmbedMolecule(target)
        # get the first conformer
        conf = target.GetConformer()
        # set coordinates to the ones of the actual target residue
        c1 = rdGeometry.Point3D( coord[0][0], coord[0][1], coord[0][2] )
        c2 = rdGeometry.Point3D( coord[1][0], coord[1][1], coord[1][2] )
        conf.SetAtomPosition(0, c1)
        conf.SetAtomPosition(1, c2)
        # perform alignment
        Chem.rdMolAlign.AlignMol(mol, target, -1, -1,[(index_pair[0],0), (index_pair[1], 1)] )
        return mol



[docs]
    @classmethod
    def parse_residue_string(cls, string, force_CA_CB=True):
        """
        Parse the residue string and return a tuple with the residue information.
        The string can be in the format "CHAIN:RES:NUM" or "CHAIN:RES:NUM:ATOM1,ATOM2".
        
        Parameters
        ----------
        string : str
            String specifying residues to process. 
        force_CA_CB : bool, optional
            If True, force the atom names to be CA and CB. Default is True.
        
        Returns
        -------
        tuple
            Tuple containing the residue information (chain, res, num, atname1, atname2).
        
        Raises
        -------
        ValueError
            If the residue string is not in the expected format.
        RuntimeError
            If the atom names are not CA and CB and force_CA_CB is True.
        """
        chain, res, num, *atom_names = string.split(":")
        #print("PARSED c:%s r:%s n:%s AAA:%s" % (chain, res, num, atom_names))
        if not num == "":
            if not num.isdigit():
                raise ValueError('NUM must be an integer in "CHAIN:RES:NUM" or "CHAIN:RES:NUM:ATOM1,ATOM2"')
        # parse atom names
        #  TODO clean all specifications
        if atom_names == []:
            atom_names = "CA,CB"
        else:
            atom_names = atom_names[0]
        # trigger error if comma not found
        a1, a2 = atom_names.split(",")
        # check that the legacy_format flag is not used
        if force_CA_CB and (not a1 == 'CA' or not a2 == 'CB'):
            msg  = "Atom names expected to be CA,CB but got %s,%s instead.\n" % (a1, a2)
            raise RuntimeError(msg)
        residue = (chain, res, num, a1, a2)
        return residue