#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Meeko
#
from rdkit import Chem
from rdkit.Geometry import Point3D
from rdkit.Chem import AllChem
from meeko.utils.rdkitutils import set_h_isotope_atom_coords
from io import StringIO
import json
def clean_extend(existing_dict, new_row):
"""extend existing_dict with new_row, which is a dict"""
nr_rows = []
for key in existing_dict:
nr_rows.append(len(existing_dict[key]))
if key not in new_row:
existing_dict[key].append(None)
if len(nr_rows) == 0: # existing_dict is empty
nr_rows = 0
elif len(set(nr_rows)) != 1:
msg = "existing_dict has different nr of items for different attributes"
raise ValueError(msg)
else:
nr_rows = set(nr_rows).pop()
for key, value in new_row.items():
if key not in existing_dict:
existing_dict[key] = [None] * nr_rows
existing_dict[key].append(value)
return
[docs]
class RDKitMolCreate:
"""
Utilities for constructing RDKit molecules from PDBQT docking results.
"""
ambiguous_flexres_choices = {
"HIS": ["HIE", "HID", "HIP"],
"ASP": ["ASP", "ASH"],
"GLU": ["GLU", "GLH"],
"CYS": ["CYS", "CYM"],
"LYS": ["LYS", "LYN"],
"ARG": ["ARG", "ARG_mgltools"],
"ASN": ["ASN", "ASN_mgltools"],
"GLN": ["GLN", "GLN_mgltools"],
}
flexres = {
"CYS": {
"smiles": "CCS",
"atom_names_in_smiles_order": ["CA", "CB", "SG"],
"h_to_parent_index": {"HG": 2},
},
"CYM": {
"smiles": "CC[S-]",
"atom_names_in_smiles_order": ["CA", "CB", "SG"],
"h_to_parent_index": {},
},
"ASP": {
"smiles": "CCC(=O)[O-]",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "OD1", "OD2"],
"h_to_parent_index": {},
},
"ASH": {
"smiles": "CCC(=O)O",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "OD1", "OD2"],
"h_to_parent_index": {"HD2": 4},
},
"GLU": {
"smiles": "CCCC(=O)[O-]",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "OE1", "OE2"],
"h_to_parent_index": {},
},
"GLH": {
"smiles": "CCCC(=O)O",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "OE1", "OE2"],
"h_to_parent_index": {"HE2": 5},
},
"PHE": {
"smiles": "CCc1ccccc1",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD1", "CE1", "CZ", "CE2", "CD2"],
"h_to_parent_index": {},
},
"HIE" : {
"smiles": "CCc1c[nH]cn1",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD2", "NE2", "CE1", "ND1"],
"h_to_parent_index": {"HE2": 4},
},
"HID" : {
"smiles": "CCc1cnc[nH]1",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD2", "NE2", "CE1", "ND1"],
"h_to_parent_index": {"HD1": 6},
},
"HIP" : {
"smiles": "CCc1c[nH+]c[nH]1",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD2", "NE2", "CE1", "ND1"],
"h_to_parent_index": {"HE2": 4, "HD1": 6},
},
"ILE": {
"smiles": "CC(C)CC",
"atom_names_in_smiles_order": ["CA", "CB", "CG2", "CG1", "CD1"],
"h_to_parent_index": {},
},
"LYS": {
"smiles": "CCCCC[NH3+]",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "CE", "NZ"],
"h_to_parent_index": {"HZ1": 5, "HZ2": 5, "HZ3": 5},
},
"LYN": {
"smiles": "CCCCCN",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "CE", "NZ"],
"h_to_parent_index": {"HZ2": 5, "HZ3": 5},
},
"LEU": {
"smiles": "CCC(C)C",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD1", "CD2"],
"h_to_parent_index": {},
},
"MET": {
"smiles": "CCCSC",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "SD", "CE"],
"h_to_parent_index": {},
},
"ASN": {
"smiles": "CCC(=O)N",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "OD1", "ND2"],
"h_to_parent_index": {"HD21": 4, "HD22": 4},
},
"ASN_mgltools": {
"smiles": "CCC(=O)N",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "OD1", "ND2"],
"h_to_parent_index": {"1HD2": 4, "2HD2": 4},
},
"GLN": {
"smiles": "CCCC(=O)N",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "OE1", "NE2"],
"h_to_parent_index": {"HE21": 5, "HE22": 5},
},
"GLN_mgltools": {
"smiles": "CCCC(=O)N",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "OE1", "NE2"],
"h_to_parent_index": {"1HE2": 5, "2HE2": 5},
},
"ARG": {
"smiles": "CCCCNC(N)=[NH2+]",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2"],
"h_to_parent_index": {"HE": 4, "HH11": 6, "HH12": 6, "HH21": 7, "HH22": 7},
},
"ARG_mgltools": {
"smiles": "CCCCNC(N)=[NH2+]",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2"],
"h_to_parent_index": {"HE": 4, "1HH1": 6, "2HH1": 6, "1HH2": 7, "2HH2": 7},
},
"SER": {
"smiles": "CCO",
"atom_names_in_smiles_order": ["CA", "CB", "OG"],
"h_to_parent_index": {"HG": 2},
},
"THR": {
"smiles": "CC(C)O",
"atom_names_in_smiles_order": ["CA", "CB", "CG2", "OG1"],
"h_to_parent_index": {"HG1": 3},
},
"VAL": {
"smiles": "CC(C)C",
"atom_names_in_smiles_order": ["CA", "CB", "CG1", "CG2"],
"h_to_parent_index": {},
},
"TRP": {
"smiles": "CCc1c[nH]c2c1cccc2",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD1", "NE1", "CE2", "CD2", "CE3", "CZ3", "CH2", "CZ2"],
"h_to_parent_index": {"HE1": 4},
},
"TYR": {
"smiles": "CCc1ccc(cc1)O",
"atom_names_in_smiles_order": ["CA", "CB", "CG", "CD1", "CE1", "CZ", "CE2", "CD2", "OH"],
"h_to_parent_index": {"HH": 8},
},
}
[docs]
@classmethod
def from_pdbqt_mol(
cls,
pdbqt_mol,
only_cluster_leads=False,
keep_flexres=False,
):
"""
Convert a PDBQT molecule into a list of RDKit molecules.
Parameters
----------
pdbqt_mol : PDBQT
The PDBQT molecule to convert.
only_cluster_leads : bool, optional
If True, only cluster leads are converted. Default is False.
keep_flexres : bool, optional
If True, flexible residues are kept. Default is False.
Returns
-------
list
A list of RDKit molecules. Each molecule corresponds to a pose in the PDBQT molecule.
Raises
------
RuntimeError
If no cluster leads are found in the PDBQT molecule and only_cluster_leads is True.
ValueError
If some atoms of a ligand are parsed as sidechain but not all.
"""
# todo: add pseudo-water (W atoms, variable nr each pose)
if only_cluster_leads and len(pdbqt_mol._pose_data["cluster_leads_sorted"]) == 0:
raise RuntimeError("no cluster_leads in pdbqt_mol but only_cluster_leads=True")
mol_list = []
for mol_index in pdbqt_mol._atom_annotations["mol_index"]:
flexres_id = pdbqt_mol._pose_data["mol_index_to_flexible_residue"][mol_index]
if flexres_id is not None and not keep_flexres:
continue
smiles = pdbqt_mol._pose_data['smiles'][mol_index]
index_map = pdbqt_mol._pose_data['smiles_index_map'][mol_index]
h_parent = pdbqt_mol._pose_data['smiles_h_parent'][mol_index]
atom_idx = pdbqt_mol._atom_annotations["mol_index"][mol_index]
atom_is_flex = [i in pdbqt_mol._atom_annotations["flexible_residue"] for i in atom_idx]
if any(atom_is_flex) and all(atom_is_flex):
is_sidechain = True
elif any(atom_is_flex):
raise ValueError("some (but not all!) atoms of a ligand were parsed as sidechain")
else:
is_sidechain = False
if smiles is None: # probably a flexible sidechain, but can be another ligand
residue_names = set()
atom_names = []
for atom in pdbqt_mol.atoms(atom_idx):
residue_names.add(atom[4])
atom_names.append(atom[2])
if len(residue_names) == 1:
resname = residue_names.pop()
smiles, index_map, h_parent = cls.guess_flexres_smiles(resname, atom_names)
if smiles is None: # failed guessing smiles for possible flexres
mol_list.append(None)
continue
if only_cluster_leads:
pose_ids = pdbqt_mol._pose_data["cluster_leads_sorted"]
else:
pose_ids = range(pdbqt_mol._pose_data["n_poses"])
mol = Chem.MolFromSmiles(smiles)
mol.SetProp("meeko", json.dumps({"is_sidechain": is_sidechain}))
coordinates_all_poses = []
for i in pose_ids:
pdbqt_mol._current_pose = i
coordinates = pdbqt_mol.positions(atom_idx)
mol = cls.add_pose_to_mol(mol, coordinates, index_map)
coordinates_all_poses.append(coordinates)
# add Hs only after all poses are added as conformers
# because Chem.AddHs() will affect all conformers at once
mol = cls.add_hydrogens(mol, coordinates_all_poses, h_parent)
mol_list.append(mol)
return mol_list
[docs]
@classmethod
def guess_flexres_smiles(cls, resname, atom_names):
"""
Determine a SMILES string for flexres based on atom names,
as well as the equivalent of smile_index_map and smiles_h_parent
which are written to PDBQT remarks for regular small molecules.
Parameters
----------
resname : str
The residue name of the flexible residue.
atom_names : list of str
The names of the atoms in the flexible residue.
Returns
-------
tuple[str, list, list]
A tuple containing:
- smiles: SMILES string starting at C-alpha (excludes most of the backbone)
- index_map: list of pairs of integers, first in pair is index in the smiles,
second is index of corresponding atom in atom_names
- h_parent: list of pairs of integers, first in pair is index of a heavy atom
in the smiles, second is index of a hydrogen in atom_names.
The hydrogen is bonded to the heavy atom.
"""
if len(set(atom_names)) != len(atom_names):
return None, None, None
candidate_resnames = cls.ambiguous_flexres_choices.get(resname, [resname])
for resname in candidate_resnames:
is_match = False
if resname not in cls.flexres:
continue
atom_names_in_smiles_order = cls.flexres[resname]["atom_names_in_smiles_order"]
h_to_parent_index = cls.flexres[resname]["h_to_parent_index"]
expected_names = atom_names_in_smiles_order + list(h_to_parent_index.keys())
if len(atom_names) != len(expected_names):
continue
nr_matched_atom_names = sum([int(n in atom_names) for n in expected_names])
if nr_matched_atom_names == len(expected_names):
is_match = True
break
if not is_match:
return None, None, None
else:
smiles = cls.flexres[resname]["smiles"]
index_map = []
for smiles_index, name in enumerate(atom_names_in_smiles_order):
index_map.append(smiles_index + 1)
index_map.append(atom_names.index(name) + 1)
h_parent = []
for name, smiles_index in h_to_parent_index.items():
h_parent.append(smiles_index + 1)
h_parent.append(atom_names.index(name) + 1)
return smiles, index_map, h_parent
[docs]
@classmethod
def add_pose_to_mol(cls, mol, ligand_coordinates, index_map):
"""
Add given coordinates to given molecule as new conformer.
Parameters
----------
mol : Chem.Mol
The RDKit molecule to which the coordinates will be added.
ligand_coordinates : list[list[float]]
2D array of shape (nr_atom, 3).
index_map : list[int]
list of nr_atom pairs of integers, 1-indexed.
In each pair, the first int is the index in mol, and
the second int is the index in ligand_coordinates (PDBQT).
Returns
-------
mol : Chem.Mol
The RDKit molecule with the new conformer added.
Raises
------
RuntimeError
Will raise error if number of coordinates provided does not
match the number of atoms there should be coordinates for.
"""
n_atoms = mol.GetNumAtoms()
n_mappings = int(len(index_map) / 2)
conf = Chem.Conformer(n_atoms)
if n_atoms < n_mappings:
raise RuntimeError(
"Number of atom is rdmol {n_atoms} mismatches"
"number of pairs in index map {n_at}!".format(
n_coords=n_atoms, n_at=n_mappings))
coord_is_set = [False] * n_atoms
for i in range(n_mappings):
pdbqt_index = int(index_map[i * 2 + 1]) - 1
mol_index = int(index_map[i * 2]) - 1
x, y, z = [float(coord) for coord in ligand_coordinates[pdbqt_index]]
conf.SetAtomPosition(mol_index, Point3D(x, y, z))
coord_is_set[mol_index] = True
# some hydrogens (isotopes) may have no coordinate set yet
h_isotope_pos_assignment = set_h_isotope_atom_coords(mol, conf = conf)
if h_isotope_pos_assignment:
for idx in h_isotope_pos_assignment:
conf.SetAtomPosition(idx, h_isotope_pos_assignment[idx])
coord_is_set[idx] = True
mol.AddConformer(conf, assignId=True)
for i, is_set in enumerate(coord_is_set):
if not is_set:
raise RuntimeError(f"Unable to set position for atom # {i} from docked pose in the created RDKit mol. ")
return mol
[docs]
@staticmethod
def add_hydrogens(mol, coordinates_list, h_parent):
"""
Add hydrogen atoms to ligand RDKit mol, adjust the positions of
polar hydrogens to match pdbqt.
Parameters
----------
mol : Chem.Mol
The RDKit molecule to which the hydrogens will be added.
coordinates_list : list[list[float]]
2D array of shape (nr_atom, 3).
h_parent : list[int]
list of pairs of integers, 1-indexed.
In each pair, the first int is the index in mol, and
the second int is the index in coordinates_list
Returns
-------
mol : Chem.Mol
The RDKit molecule with the new hydrogens added.
"""
mol = Chem.AddHs(mol, addCoords=True)
conformers = list(mol.GetConformers())
num_hydrogens = int(len(h_parent) / 2)
for conformer_idx, atom_coordinates in enumerate(coordinates_list):
conf = conformers[conformer_idx]
used_h = []
for i in range(num_hydrogens):
parent_rdkit_index = h_parent[2 * i] - 1
h_pdbqt_index = h_parent[2 * i + 1] - 1
x, y, z = [
float(coord) for coord in atom_coordinates[h_pdbqt_index]
]
parent_atom = mol.GetAtomWithIdx(parent_rdkit_index)
candidate_hydrogens = [
atom.GetIdx() for atom in parent_atom.GetNeighbors()
if atom.GetAtomicNum() == 1
]
for h_rdkit_index in candidate_hydrogens:
if h_rdkit_index not in used_h:
break
used_h.append(h_rdkit_index)
conf.SetAtomPosition(h_rdkit_index, Point3D(x, y, z))
return mol
[docs]
@staticmethod
def combine_rdkit_mols(mol_list):
"""
Combines list of rdkit molecules into a single one using Chem.CombineMols.
Parameters
----------
mol_list : list[Chem.Mol]
List of RDKit molecules to combine.
Returns
-------
Chem.Mol
Combined RDKit molecule with all conformers from the input list.
If all input molecules are None, returns None.
"""
combined_mol = None
props = {}
for mol in mol_list:
if mol is None:
continue
if mol.HasProp("meeko"):
data = json.loads(mol.GetProp("meeko"))
clean_extend(props, data)
if combined_mol is None: # first iteration
combined_mol = mol
else:
combined_mol = Chem.CombineMols(combined_mol, mol)
if len(props) > 0:
combined_mol.SetProp("meeko", json.dumps(props))
return combined_mol
@classmethod
def _verify_flexres(cls):
"""
Verify that the flexres dictionary is consistent and does not contain
duplicate atom names.
Raises
------
RuntimeError
If there are duplicate atom names in the flexres dictionary.
"""
for resname in cls.flexres:
atom_names_in_smiles_order = cls.flexres[resname]["atom_names_in_smiles_order"]
h_to_parent_index = cls.flexres[resname]["h_to_parent_index"]
expected_names = atom_names_in_smiles_order + list(h_to_parent_index.keys())
if len(expected_names) != len(set(expected_names)):
raise RuntimeError("repeated atom names in cls.flexres[%s]" % resname)
[docs]
@classmethod
def add_sandbox_coordinates(cls, dlgstring, rdmol, index_map, h_parent, groupname=None):
"""
Parse coordinates from a DLG file and append to RDKit mol.
The coordinates are sorted by energy, and the best pose is added first.
The function also adds the pose index to the RDKit molecule.
The coordinates are added as conformers to the RDKit molecule.
The function also adds the pose index to the RDKit molecule.
Parameters
----------
dlgstring : str
The DLG file content as a string.
rdmol : Chem.Mol
The RDKit molecule to which the coordinates will be added.
index_map : list[int]
List of pairs of integers, 1-indexed.
In each pair, the first int is the index in mol, and
the second int is the index in coordinates_list.
h_parent : list[int]
List of pairs of integers, 1-indexed.
In each pair, the first int is the index in mol, and
the second int is the index in coordinates_list.
groupname : str, optional
The name of the group to filter the coordinates. If None, all coordinates are used.
Default is None.
Returns
-------
tuple[Chem.Mol, dict]
A tuple containing:
- rdmol: The RDKit molecule with the new conformers added.
- energy: A dictionary containing the intermolecular and intramolecular energies
for each pose, as well as the pose index.
"""
# this function does not deal with implicit H, at least not yet
index_map = [i + 1 for i in index_map] # 1-indexing like in PDBQT
h_parent = [i + 1 for i in h_parent] # 1-indexing like in PDBQT
coordinates = []
energy = {"inter": [], "intra": [], "dlg_pose_idx": []}
is_atom_block = False
for line in dlgstring.split('\n'):
if line.startswith("Pose:") or line.startswith("Extra Pose:"):
if line.startswith("Pose:"):
pose_idx = int(line.split()[1])
elif line.startswith("Extra Pose:"):
pose_idx = int(line.split()[2])
if len(coordinates) > 0:
if len(coordinates[-1]) == 0:
# if pose info was missing, just delete data
energy["dlg_pose_idx"].pop(-1)
coordinates.pop(-1)
energy["dlg_pose_idx"].append(pose_idx)
coordinates.append([])
elif line.startswith("DOCKED: USER (1) Final Intermolecular Energy ="):
energy["inter"].append(float(line.split()[7]))
elif line.startswith("DOCKED: USER (2) Final Total Internal Energy ="):
energy["intra"].append(float(line.split()[8]))
elif line.startswith("DOCKED: @<TRIPOS>ATOM"):
is_atom_block = True
elif line.startswith("DOCKED: @<TRIPOS>BOND"):
is_atom_block = False
elif is_atom_block:
fields = line.split()
name = fields[8]
if groupname is None or name == groupname:
x, y, z = float(fields[3]), float(fields[4]), float(fields[5])
coordinates[-1].append([x, y, z])
if not (len(coordinates) == len(energy["inter"]) == len(energy["intra"])):
msg = "parsed energies differs from number of coordinates\n"
msg += "len(coordinates) = %d\n" % len(coordinates)
msg += "len(intra) = %d\n" % len(energy["intra"])
msg += "len(inter) = %d\n" % len(energy["inter"])
raise RuntimeError(msg)
scores = [energy["inter"][i] + energy["intra"][i] for i in range(len(coordinates))]
idxsort = [pair[0] for pair in sorted(enumerate(scores), key=lambda pair: pair[1])]
sorted_coordinates = []
for index in idxsort:
cls.add_pose_to_mol(rdmol, coordinates[index], index_map)
sorted_coordinates.append(coordinates[index])
rdmol = cls.add_hydrogens(rdmol, sorted_coordinates, h_parent)
for key in energy:
energy[key] = [energy[key][i] for i in idxsort]
return rdmol, energy
[docs]
@staticmethod
def write_sd_string(pdbqt_mol, only_cluster_leads=False, keep_flexres=False):
"""
Write a multi-conformer SDF string from a PDBQT molecule.
Parameters
----------
pdbqt_mol : PDBQT
The PDBQT molecule to convert.
only_cluster_leads : bool, optional
If True, only cluster leads are converted. Default is False.
keep_flexres : bool, optional
If True, flexible residues are kept. Default is False.
Returns
-------
tuple[str, list]
A tuple containing:
- output_string: The multi-conformer SDF string.
- failures: A list of indices of failed conversions.
"""
sio = StringIO()
f = Chem.SDWriter(sio)
mol_list = RDKitMolCreate.from_pdbqt_mol(pdbqt_mol, only_cluster_leads, keep_flexres)
failures = [i for i, mol in enumerate(mol_list) if mol is None]
combined_mol = RDKitMolCreate.combine_rdkit_mols(mol_list)
if combined_mol is None:
return "", failures
keys_map_mol_to_pdbqt = {
"free_energy": "free_energies",
"intermolecular_energy": "intermolecular_energies",
"internal_energy": "internal_energies",
"cluster_size": "cluster_size",
"cluster_id": "cluster_id",
"rank_in_cluster": "rank_in_cluster",
}
nr_poses = pdbqt_mol._pose_data["n_poses"]
if only_cluster_leads:
pose_idxs = pdbqt_mol._pose_data["cluster_leads_sorted"]
else:
pose_idxs = list(range(nr_poses))
available_properties = {}
for key_in_mol, key_in_pdbqt in keys_map_mol_to_pdbqt.items():
if len(pdbqt_mol._pose_data[key_in_pdbqt]) == nr_poses:
available_properties[key_in_mol] = key_in_pdbqt
mol_level_data = json.loads(combined_mol.GetProp("meeko"))
if pdbqt_mol.name is not None:
combined_mol.SetProp("_Name", pdbqt_mol.name)
for conformer in combined_mol.GetConformers():
i = conformer.GetId()
j = pose_idxs[i]
conformer_data = json.loads(json.dumps(mol_level_data))
for (key_in_mol, key_in_pdbqt) in available_properties.items():
if key_in_mol in conformer_data:
msg = "key %s conflict between combined_mol and write_sd_string" % key_in_mol
raise NotImplementedError(msg)
conformer_data[key_in_mol] = pdbqt_mol._pose_data[key_in_pdbqt][j]
if len(conformer_data): combined_mol.SetProp("meeko", json.dumps(conformer_data))
f.write(combined_mol, i)
f.close()
output_string = sio.getvalue()
return output_string, failures