from collections import namedtuple
import logging
from rdkit import Chem
from rdkit.Chem import AllChem, rdGeometry
import prody
import warnings
logger = logging.getLogger(__name__)
from collections import defaultdict
def nested_dict():
return defaultdict(nested_dict)
# class CovLigandPrepared(object):
# """ container class to store the prepared ligand molecule with associated information"""
# def __init__(self, mol, res_id, at_names, smarts, indices, smarts_indices):
# self.mol = mol
# self.res_id = res_id
# self.at_names = at_names
# self.smarts = smarts
# self.indices = indices
# self.smarts_indices = smarts_indices
# @property
# def label(self):
# """ DOCUMENTATION GOES HERE """
# pass
# @property
# def label(self):
# """ DOCUMENTATION GOES HERE """
# pass
CovLigandPrepared = namedtuple("CovalentLigandPrepared", ["mol", "res_id", "at_names", "smarts", "smarts_indices", "indices", "label"])
[docs]
class CovalentBuilder(object):
"""
Class to perform structural alignments of ligands containing the target side chain attached to run tethered covalent dockings. The class is instantiated for a given target, with a list of one or more residues, then ligands can be processed sequentially ( CovalentBuilder.process() )
Attributes
----------
rec : ProDy molecule
ProDy molecule object for the target residue
residues : dict
Dictionary with the structure { res_id : (at1_coord, at2_coord), ...}
"""
def __init__(self, receptor_mol, residue_string):
"""
Initializes the CovalentBuilder object with a ProDy molecule and a residue string.
Parameters
----------
receptor_mol : ProDy molecule
ProDy molecule object for the target residue.
residue_string : str
String specifying residues to process. Examples:
":LYS:" (all LYS, default atom names: CA,CB)
"A:LYS:204:CA,CB"
"A:HIS::ND1,CE1" (all HIS on chain A)
(ProDy regular expressions could be also injected here).
"""
self.rec = receptor_mol
selection_tuple = self.parse_residue_string(residue_string, force_CA_CB=True)
# selection tuple: (chain, res, num, atname1, atname2)
# only 'res' and 'atname[1|2]' are required, the rest is optional,
# e.g: (None, res, None, 'CA', 'CB')
out = self._generate_prody_selection(selection_tuple)
self._compact_selection(out)
def _generate_prody_selection(self, residue):
"""
Generate the string to perform a Prody.Selection.
Parameters
----------
residue : tuple
Tuple containing the residue information (chain, res, num, atname1, atname2).
Returns
-------
tuple
Tuple containing the ProDy selection and the atom names (atname1, atname2).
Raises
-------
ValueError
If no residue is found with the specified selection.
"""
# sel_string = 'chid %s AND resname %s AND resnum %s AND (name %s or name %s)'
sel_string = []
# out = []
chid, res_type, res_num, atname1, atname2 = residue
if not chid == "":
sel_string.append("chid %s" % chid)
sel_string.append('resname %s' % res_type)
if not res_num == "":
sel_string.append("resnum %s" % res_num)
sel_string.append("(name %s or name %s)" % (atname1, atname2))
sel_string = " and ".join(sel_string)
logger.info("CovalentBuilder> searching for residue: %s" % sel_string)
found = self.rec.select( sel_string )
if found is None:
logger.error("ERROR: no residue found with the following specification: chain[%s] residue[%s] number[%s] atom names [%s,%s]"% (
chid, res_type, res_num, atname1, atname2))
raise ValueError
return (found, atname1, atname2)
# returnsout
def _compact_selection(self, sel_info, allow_missing=False):
"""
Process a ProDy selection and return a dictionary with the structure { res_id : (at1_coord, at2_coord), ...}.
Parameters
----------
sel_info : tuple
Tuple containing the ProDy selection and the atom names (atname1, atname2).
allow_missing : bool, optional
If True, allow missing atoms in the selection. Default is False.
Raises
-------
ValueError
If allow_missing is False and one or more atoms are missing in the selection.
RuntimeWarning
If one or more atoms are missing in the selection and allow_missing is True.
"""
self.residues = {}
# for (sel, at1, at2) in sel_info:
sel, at1, at2 = sel_info
pairs = (at1, at2)
chains = sel.getChids()
res = sel.getResnames()
num = sel.getResnums()
names = sel.getNames()
coords = sel.getCoords()
for i in range(len(chains)):
res_id = (chains[i], res[i], num[i])
if not res_id in self.residues:
self.residues[res_id] = [None, None]
# preserve the order of the coordinates
# as specified by the atom names order
idx = pairs.index(names[i])
self.residues[res_id][idx] = {'coords':coords[i], 'atname':names[i]}
res_id_list = list(self.residues.keys())
for res_id in res_id_list:
if None in self.residues[res_id]:
c,r,n = res_id
f = [ x['atname'] if not x is None else "None" for x in self.residues[res_id]]
warnings.warn("One or more atoms are missing in residue %s:%s%d (requested: %s,%s | found: %s,%s)" % (c,r,n,at1,at2, f[0], f[1]), RuntimeWarning)
if not allow_missing:
raise ValueError
del self.residues[res_id]
# from pprint import pprint as pp
# pp(self.residues)
[docs]
def process(self, ligand, smarts=None, smarts_indices=None, indices=None, first_only=False):
"""
Process the ligand for the residue(s) specified for the current receptor.
Parameters
----------
ligand : RDKit molecule
RDKit molecule object for the ligand to be processed.
smarts : str, optional
SMARTS pattern to search for in the ligand. Default is None.
smarts_indices : tuple, optional
Tuple containing the indices of the atoms in the SMARTS pattern. Default is None.
indices : list, optional
List of tuples containing the indices of the atoms in the ligand. Default is None.
first_only : bool, optional
If True, only the first match of the SMARTS pattern will be used. Default is False.
Yields
-------
CovLigandPrepared
A named tuple containing the prepared ligand molecule, residue information, atom names, SMARTS pattern, indices, and label.
Raises
-------
ValueError
If neither smarts nor indices are specified.
RuntimeWarning
If the SMARTS pattern doesn't match any atoms in the ligand.
"""
if (smarts is None) and (indices is None):
raise ValueError("Specify at least one criterion, either SMARTS pattern or atom indices (2)")
# if SMARTS are specified, use that to define (or override) indices
if not smarts is None:
indices = self.find_smarts(ligand, smarts, smarts_indices, first_only)
if len(indices) == 0:
warnings.warn("SMARTS pattern didn't match any atoms", RuntimeWarning)
#print("CovalentBuilder> Generating %d ouput alignments (%d residues)" % (len(indices)*len(self.residues), len(self.residues) ))
# perform alignments
for i, idx_pair in enumerate(indices):
for res_info, res_coord in self.residues.items():
if len(indices)>1:
counter = "_%d" % (i+1)
else:
counter = ""
chain, res, num, = res_info
at_names = [ x['atname'] for x in res_coord ]
coord = [ x['coords'] for x in res_coord ]
label = "%s:%s%s%s" % (chain, res, num, counter)
mol = self.transform(ligand, idx_pair, coord)
yield CovLigandPrepared(mol, res_info, at_names, smarts, smarts_indices, idx_pair,label)
[docs]
def find_smarts(self, mol, smarts, smarts_indices, first_only):
"""
Find occurrences of the SMARTS indices atoms in the requested SMARTS.
Parameters
----------
mol : RDKit molecule
RDKit molecule object for the ligand to be processed.
smarts : str
SMARTS pattern to search for in the ligand.
smarts_indices : tuple
Tuple containing the indices of the atoms in the SMARTS pattern.
first_only : bool
If True, only the first match of the SMARTS pattern will be used. Default is False.
Returns
-------
list
List of tuples containing the indices of the atoms in the ligand.
Raises
-------
ValueError
If the SMARTS index exceeds the number of atoms in the SMARTS pattern.
RuntimeWarning
If the specified ligand pattern returned more than one match.
"""
indices = []
patt = Chem.MolFromSmarts(smarts)
smarts_size = patt.GetNumAtoms()
if smarts_indices[0] >= smarts_size or smarts_indices[1] >= smarts_size:
raise ValueError("SMARTS index exceeds number of atoms in SMARTS (%d)" % (smarts_size))
found = mol.GetSubstructMatches(patt)
#print("CovalentBuilder> ligand patterns found: ", found, "[ use only first: %s ]" % first_only)
if len(found)>1 and first_only:
warnings.warn("The specified ligand pattern returned more than one match: [%d] (potential ambiguity?)" % len(found), RuntimeWarning)
for f in found:
#print("CovalentBuilder> processing:", f, "with ", smarts_indices)
indices.append([f[x] for x in smarts_indices])
#print("CovalentBuilder> ligand indices stored:", indices)
if first_only:
return indices
return indices
[docs]
@classmethod
def parse_residue_string(cls, string, force_CA_CB=True):
"""
Parse the residue string and return a tuple with the residue information.
The string can be in the format "CHAIN:RES:NUM" or "CHAIN:RES:NUM:ATOM1,ATOM2".
Parameters
----------
string : str
String specifying residues to process.
force_CA_CB : bool, optional
If True, force the atom names to be CA and CB. Default is True.
Returns
-------
tuple
Tuple containing the residue information (chain, res, num, atname1, atname2).
Raises
-------
ValueError
If the residue string is not in the expected format.
RuntimeError
If the atom names are not CA and CB and force_CA_CB is True.
"""
chain, res, num, *atom_names = string.split(":")
#print("PARSED c:%s r:%s n:%s AAA:%s" % (chain, res, num, atom_names))
if not num == "":
if not num.isdigit():
raise ValueError('NUM must be an integer in "CHAIN:RES:NUM" or "CHAIN:RES:NUM:ATOM1,ATOM2"')
# parse atom names
# TODO clean all specifications
if atom_names == []:
atom_names = "CA,CB"
else:
atom_names = atom_names[0]
# trigger error if comma not found
a1, a2 = atom_names.split(",")
# check that the legacy_format flag is not used
if force_CA_CB and (not a1 == 'CA' or not a2 == 'CB'):
msg = "Atom names expected to be CA,CB but got %s,%s instead.\n" % (a1, a2)
raise RuntimeError(msg)
residue = (chain, res, num, a1, a2)
return residue