import pathlib
import json
import logging
import traceback
from importlib.resources import files
import warnings
eol="\n"
from sys import exc_info
from typing import Union
from typing import Optional
from typing import Any
import rdkit.Chem
from rdkit import Chem
from rdkit.Chem import rdFMCS
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolInterchange
from rdkit.Geometry import Point3D
from .molsetup import RDKitMoleculeSetup
from .molsetup import MoleculeSetup
from .utils.jsonutils import BaseJSONParsable
from .utils.jsonutils import serialize_optional
from .utils.jsonutils import rdkit_mol_from_json
from .utils.jsonutils import convert_to_int_keyed_dict
from .utils.rdkitutils import mini_periodic_table
from .utils.rdkitutils import react_and_map
from .utils.rdkitutils import AtomField
from .utils.rdkitutils import build_one_rdkit_mol_per_altloc
from .utils.rdkitutils import _aux_altloc_mol_build
from .utils.rdkitutils import covalent_radius
from .utils.pdbutils import PDBAtomInfo
from .utils.rdkitutils import getPdbInfoNoNull
from .chemtempgen import export_chem_templates_to_json
from .chemtempgen import build_noncovalent_CC
from .chemtempgen import build_linked_CCs
import numpy as np
data_path = files("meeko") / "data"
periodic_table = Chem.GetPeriodicTable()
try:
import prody
except ImportError as _prody_import_error:
ALLOWED_PRODY_TYPES = None
AtomGroup = None
Selection = None
def prody_to_rdkit(*args):
raise ImportError(_prody_import_error)
else:
from .utils.prodyutils import prody_to_rdkit, ALLOWED_PRODY_TYPES
from prody.atomic.atomgroup import AtomGroup
from prody.atomic.selection import Selection
logger = logging.getLogger(__name__)
rdkit_logger = logging.getLogger("rdkit")
residues_rotamers = {
"SER": [("C", "CA", "CB", "OG")],
"THR": [("C", "CA", "CB", "CG2")],
"CYS": [("C", "CA", "CB", "SG")],
"VAL": [("C", "CA", "CB", "CG1")],
"HIS": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
"ASN": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "ND2")],
"ASP": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
"ILE": [("C", "CA", "CB", "CG2"), ("CA", "CB", "CG2", "CD1")],
"LEU": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD1")],
"PHE": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
"TYR": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
"TRP": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
"GLU": [
("C", "CA", "CB", "CG"),
("CA", "CB", "CG", "CD"),
("CB", "CG", "CD", "OE1"),
],
"GLN": [
("C", "CA", "CB", "CG"),
("CA", "CB", "CG", "CD"),
("CB", "CG", "CD", "OE1"),
],
"MET": [
("C", "CA", "CB", "CG"),
("CA", "CB", "CG", "SD"),
("CB", "CG", "SD", "CE"),
],
"ARG": [
("C", "CA", "CB", "CG"),
("CA", "CB", "CG", "CD"),
("CB", "CG", "CD", "NE"),
("CG", "CD", "NE", "CZ"),
],
"LYS": [
("C", "CA", "CB", "CG"),
("CA", "CB", "CG", "CD"),
("CB", "CG", "CD", "CE"),
("CG", "CD", "CE", "NZ"),
],
}
[docs]
def find_graph_paths(graph, start_node, end_nodes, current_path=(), paths_found=()):
"""
(DFS Path Search) Recursively find all paths from a start node to any of the specified end nodes in a directed graph.
Parameters
----------
graph : dict[int, list[int]]
A dictionary representing a graph. Keys are node IDs (integers), and values are lists of neighboring node IDs.
start_node : int
The node from which to begin path traversal.
end_nodes : iterable of int
A set or list of node IDs that are valid end points for a path.
current_path : tuple[int], optional
The current traversal path (used during recursion). Default is empty tuple.
paths_found : list[list[int]], optional
The list of complete paths found so far. Default is empty tuple, which will be converted internally.
Returns
-------
list[list[int]]
A list of paths. Each path is represented as a list of node IDs from start_node to an end_node.
"""
current_path = current_path + (start_node,)
paths_found = list(paths_found)
for node in graph[start_node]:
if node in current_path:
continue
if node in end_nodes:
paths_found.append(list(current_path) + [node])
more_paths = find_graph_paths(graph, node, end_nodes, current_path)
paths_found.extend(more_paths)
return paths_found
[docs]
def find_inter_mols_bonds(mols_dict):
"""
Finds inter-residue bonds between the RDKit Mols in mols_dict based on distances compared to built-in covalent radii (.utils.rdkitutils.covalent_radius).
Parameters
----------
mols_dict : dict[str, (Chem.Mol, str)]
A dictionary to store raw input mols and input residue names by residue IDs.
The keys are residue IDs in the format <chain>:<resnum> such as "A:42", and the values are tuples of an RDKit Mols and input resname.
Returns
-------
dict[(str, str), list[tuple[int, int]]]
A dictionary representing inter-residue bonds.
The keys are tuples of residue IDs in the format <chain>:<resnum>, and the values are lists of tuples of atom indices in the two residues that are bonded.
"""
allowance = 1.2 # vina uses 1.1 but covalent radii are shorter here
max_possible_covalent_radius = (
2 * allowance * max([r for k, r in covalent_radius.items()])
)
cubes_min = []
cubes_max = []
for key, (mol, _) in mols_dict.items():
positions = mol.GetConformer().GetPositions()
cubes_min.append(np.min(positions, axis=0))
cubes_max.append(np.max(positions, axis=0))
tmp = np.array([0, 0, 1, 1])
pairs_to_consider = []
keys = list(mols_dict)
for i in range(len(mols_dict)):
for j in range(i + 1, len(mols_dict)):
do_consider = True
for d in range(3):
x = (cubes_min[i][d], cubes_max[i][d], cubes_min[j][d], cubes_max[j][d])
idx = np.argsort(x)
has_overlap = tmp[idx][0] != tmp[idx][1]
close_enough = abs(x[idx[1]] - x[idx[2]]) < max_possible_covalent_radius
do_consider &= close_enough or has_overlap
if do_consider:
pairs_to_consider.append((i, j))
bonds = {} # key is pair mol indices, valuei is list of pairs of atom indices
for i, j in pairs_to_consider:
p1 = mols_dict[keys[i]][0].GetConformer().GetPositions()
p2 = mols_dict[keys[j]][0].GetConformer().GetPositions()
for a1 in mols_dict[keys[i]][0].GetAtoms():
for a2 in mols_dict[keys[j]][0].GetAtoms():
vec = p1[a1.GetIdx()] - p2[a2.GetIdx()]
distsqr = np.dot(vec, vec)
# check if atom has implemented covalent radius
for atom in [a1, a2]:
if atom.GetAtomicNum() not in covalent_radius:
raise RuntimeError(f"Element {periodic_table.GetElementSymbol(atom.GetAtomicNum())} doesn't have an implemented covalent radius, which was required for the perception of intermolecular bonds. ")
cov_dist = (
covalent_radius[a1.GetAtomicNum()]
+ covalent_radius[a2.GetAtomicNum()]
)
if distsqr < (allowance * cov_dist) ** 2:
key = (keys[i], keys[j])
value = (a1.GetIdx(), a2.GetIdx())
bonds.setdefault(key, [])
bonds[key].append(value)
return bonds
[docs]
def mapping_by_mcs(mol, ref):
"""
Find an atom-atom mapping between two molecules based on the maximum common substructure (MCS).
Parameters
----------
mol : Chem.Mol
The first molecule to compare.
ref : Chem.Mol
The second molecule to compare.
Returns
-------
dict[int, int]
A dictionary mapping atom indices from the first molecule to the second molecule.
The keys are atom indices from the first molecule, and the values are atom indices from the second molecule.
"""
mcs_result = rdFMCS.FindMCS([mol, ref], bondCompare=rdFMCS.BondCompare.CompareAny)
mcs_mol = Chem.MolFromSmarts(mcs_result.smartsString)
mol_idxs = mol.GetSubstructMatch(mcs_mol)
ref_idxs = ref.GetSubstructMatch(mcs_mol)
atom_map = {i: j for (i, j) in zip(mol_idxs, ref_idxs)}
return atom_map
def _snap_to_int(value, tolerance=0.12):
"""
Rounds a floating-point number to the nearest integer within a specified tolerance.
Parameters
----------
value : float
The floating-point number to be rounded.
tolerance : float, optional
The tolerance within which the number is considered close to an integer. Default is 0.12.
Returns
-------
int or None
The rounded integer if the value is within the tolerance of an integer, otherwise None.
"""
for inc in [-1, 0, 1]:
if abs(value - int(value) - inc) <= tolerance:
return int(value) + inc
return None
[docs]
def divide_int_gracefully(integer, weights, allow_equal_weights_to_differ=False):
"""
Divides an integer into parts based on the provided weights, ensuring that the sum of the parts equals the integer.
Parameters
----------
integer : int
The integer to be divided.
weights : list[float]
A list of weights that determine the proportions of the integer.
allow_equal_weights_to_differ : bool, optional
If True, allows equal weights to be treated as different groups. Default is False (equal inputs get equal outputs).
Returns
-------
list[float]
A list of floats representing the divided integer based on the weights.
"""
for weight in weights:
if type(weight) not in [int, float] or weight < 0:
raise ValueError("weights must be numeric and non-negative")
if type(integer) is not int:
raise ValueError("integer must be integer")
inv_total_weight = 1.0 / sum(weights)
shares = [w * inv_total_weight for w in weights] # normalize
result = [_snap_to_int(integer * s, tolerance=0.5) for s in shares]
surplus = integer - sum(result)
if surplus == 0:
return result
data = [(i, w) for (i, w) in enumerate(weights)]
data = sorted(data, key=lambda x: x[1], reverse=True)
idxs = [i for (i, _) in data]
if allow_equal_weights_to_differ:
groups = [1 for _ in weights]
else:
groups = []
last_weight = None
for i in idxs:
if weights[i] == last_weight:
groups[-1] += 1
else:
groups.append(1)
last_weight = weights[i]
# iterate over all possible combinations of groups
# this is potentially very slow
nr_groups = len(groups)
for j in range(1, 2**nr_groups):
n_changes = 0
combo = []
for grpidx in range(nr_groups):
is_changed = bool(j & 2**grpidx)
combo.append(is_changed)
n_changes += is_changed * groups[grpidx]
if n_changes == abs(surplus):
break
# add or subtract 1 to distribute surplus
increment = surplus / abs(surplus)
index = 0
for i, is_changed in enumerate(combo):
if is_changed:
for j in range(groups[i]):
result[idxs[index]] += increment
index += 1
return result
[docs]
def rectify_charges(q_list, net_charge=None, decimals=3) -> list[float]:
"""
Makes charges N (default is 3) decimals in length and ensures they sum to an integer.
Parameters
----------
q_list : list[float]
List of charges to be rectified.
net_charge : int or None
The desired net charge. If None, it will be calculated from the sum of the charges in q_list.
decimals : int
The number of decimal places to round the charges to. Default is 3.
Returns
-------
charges_dec: list[float]
List of rectified charges, rounded to the specified number of decimal places.
"""
fstr = "%%.%df" % decimals
charges_dec = [float(fstr % q) for q in q_list]
if net_charge is None:
net_charge = _snap_to_int(sum(charges_dec), tolerance=0.15)
if net_charge is None:
msg = "net charge could not be predicted from input q_list. (residual is beyond tolerance) "
msg = "Please set the net_charge argument directly"
raise RuntimeError(msg)
elif type(net_charge) != int:
raise TypeError("net charge must be an integer")
surplus = net_charge - sum(charges_dec)
surplus_int = _snap_to_int(10**decimals * surplus)
if surplus_int == 0:
return charges_dec
weights = [abs(q) for q in q_list]
surplus_int_splits = divide_int_gracefully(surplus_int, weights)
for i, increment in enumerate(surplus_int_splits):
charges_dec[i] += 10**-decimals * increment
return charges_dec
[docs]
def get_updated_positions(monomer, new_positions: dict):
"""
Returns full set of positions for the rdkit_mol in monomer given a partial
set of new_positions. Hydrogens not specified in new_positions will
have their position reset by RDKit if they are one or two bonds away
from an atom in new_positions.
Parameters
----------
monomer : Monomer
Must have rdkit_mol to associate with new positions.
new_positions : dict[int, (float, float, float)]
Dictionary mapping the atom indices to their new positions.
Returns
-------
positions : np.ndarray
The updated positions of the atoms in the RDKit molecule.
"""
h_to_update = set()
mol = Chem.Mol(monomer.rdkit_mol) # avoids side effects
conformer = mol.GetConformer()
for n1 in (mol.GetAtomWithIdx(idx) for idx in new_positions):
for n2 in n1.GetNeighbors():
if n2.GetAtomicNum() == 1: # 1 bond away
h_to_update.add(n2.GetIdx())
else:
if n2.GetIdx() not in new_positions: # 2 bonds away
h_to_update.update(set(n2h.GetIdx() for n2h in n2.GetNeighbors() if n2h.GetAtomicNum() == 1))
# hydrogens in new_positions shall not be updated by RDKit
h_to_update -= set(new_positions)
for index in new_positions:
x, y, z = new_positions[index]
p = Point3D(float(x), float(y), float(z))
conformer.SetAtomPosition(index, p)
if h_to_update:
update_H_positions(mol, list(h_to_update))
return mol.GetConformer().GetPositions()
[docs]
def update_H_positions(mol: Chem.Mol, indices_to_update: list[int]) -> None:
"""
Re-calculates the position of some hydrogens already existing in the mol. Does not guarantee that chirality can be
preserved.
Parameters
----------
mol : Chem.Mol
RDKit Mol object with hydrogens.
indices_to_update : list[int]
Hydrogen indices to update.
Returns
-------
None
Raises
------
RuntimeError:
If a provided index in indices_to_update is not a Hydrogen, if a Hydrogen only has Hydrogen neighbors, or if the
number of visited Hydrogens does not match the number of Hydrogens marked to be deleted.
"""
# Gets the conformer and a readable and writable version of the Mol using RDKit
conf = mol.GetConformer()
tmpmol = Chem.RWMol(mol)
# Sets up data structures to manage Hydrogens to delete and add
to_del = {}
to_add_h = []
# Loops through input indices_to_update, checks index validity, adds data to the addition and deletion data structs
for h_index in indices_to_update:
# Checks that the atom at this index is a Hydrogen
atom = tmpmol.GetAtomWithIdx(h_index)
if atom.GetAtomicNum() != 1:
raise RuntimeError("only H positions can be updated")
# Ensures that all Hydrogens have at least 1 non-Hydrogen neighbor
heavy_neighbors = []
for neigh_atom in atom.GetNeighbors():
if neigh_atom.GetAtomicNum() != 1:
heavy_neighbors.append(neigh_atom)
if len(heavy_neighbors) != 1:
raise RuntimeError(
f"hydrogens must have 1 non-H neighbor, got {len(heavy_neighbors)}"
)
# Adds the first neighbor to the addition and deletion data structures.
to_add_h.append(heavy_neighbors[0])
to_del[h_index] = heavy_neighbors[0]
# Loops through the delete list and deletes the
for i in sorted(to_del, reverse=True):
tmpmol.RemoveAtom(i)
to_del[i].SetNumExplicitHs(to_del[i].GetNumExplicitHs() + 1)
to_add_h = list(set([atom.GetIdx() for atom in to_add_h]))
tmpmol = tmpmol.GetMol()
tmpmol.UpdatePropertyCache()
Chem.SanitizeMol(tmpmol)
tmpmol = Chem.AddHs(tmpmol, onlyOnAtoms=to_add_h, addCoords=True)
tmpconf = tmpmol.GetConformer()
used_h = (
set()
) # heavy atom may have multiple H that were missing, keep track of Hs that were visited
for h_index, parent in to_del.items():
for atom in tmpmol.GetAtomWithIdx(parent.GetIdx()).GetNeighbors():
has_new_position = atom.GetIdx() >= mol.GetNumAtoms() - len(to_del)
if atom.GetAtomicNum() == 1 and has_new_position:
if atom.GetIdx() not in used_h:
# print(h_index, tuple(tmpconf.GetAtomPosition(atom.GetIdx())))
conf.SetAtomPosition(
h_index, tmpconf.GetAtomPosition(atom.GetIdx())
)
used_h.add(atom.GetIdx())
break # h_index coords copied, don't look into further H bound to parent
# no guarantees about preserving chirality, which we don't need
if len(used_h) != len(to_del):
raise RuntimeError(
f"Updated {len(used_h)} H positions but deleted {len(to_del)}"
)
return
def _delete_residues(res_to_delete, raw_input_mols):
"""
Deletes residues from raw_input_mols that are not needed for the Polymer object.
Parameters
----------
res_to_delete : list[str] or None
residue IDs to delete in format <chain>:<resnum><icode>
raw_input_mols : dict[str, Chem.Mol]
keys are residue IDs
Returns
-------
None
Raises
------
ValueError:
If any of the residues to delete are not found in raw_input_mols.
"""
if res_to_delete is None:
return
missing = set()
for res in res_to_delete:
if res not in raw_input_mols:
missing.add(res)
raw_input_mols.pop(res, None)
if len(missing) > 0:
msg = "can't find the following residues to delete: " + " ".join(missing)
raise ValueError(msg)
return
[docs]
class PolymerCreationError(RuntimeError):
"""
Exception raised when the creation of a Polymer object fails.
Attributes
----------
error : str
The main error message.
recommendations : str
Recommendations for resolving the error.
traceback : str
The traceback of the error.
"""
def __init__(self, error: str, recommendations: str = None):
"""
Initialize the PolymerCreationError.
Parameters
----------
error : str
The main error message.
recommendations : str
Recommendations for resolving the error.
"""
super().__init__(error) # main error message to pass to RuntimeError
self.error = error
self.recommendations = recommendations
exc_type, exc_value, exc_traceback = exc_info()
if exc_value is not None:
self.traceback = ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback))
else:
self.traceback = None
def __str__(self):
"""
Return a string representation of the error message.
Returns
-------
str
The error message.
"""
msg = "" + eol
msg += "Error: Creation of data structure for receptor failed." + eol
msg += "" + eol
msg += "Details:" + eol
msg += self.error + eol
if self.traceback:
msg += self.traceback + eol
if self.recommendations:
msg += "Recommendations:" + eol
msg += self.recommendations + eol
msg += "" + eol
return msg
[docs]
def handle_parsing_situations(
unmatched_res,
unparsed_res,
allow_bad_res,
res_missed_altloc,
res_needed_altloc,
):
"""
Handle situations where residues are not parsed correctly or do not match templates.
Parameters
----------
unmatched_res : set
Residues that did not match any template.
unparsed_res : set
Residues that could not be parsed.
allow_bad_res : bool
If True, allows residues that do not match templates to be ignored.
res_missed_altloc : set
Residues that were requested with an alternate location but were not found.
res_needed_altloc : set
Residues that were found with an alternate location but were not requested.
Raises
------
PolymerCreationError
If there are residues that could not be parsed or matched to templates, and allow_bad_res is False.
"""
err = ""
if unparsed_res:
msg = f"- Parsing failed for: {unparsed_res}."
if not allow_bad_res:
err += msg + eol
else:
msg += " Ignored due to allow_bad_res."
logger.warning(msg)
if unmatched_res:
msg = f"- Template matching failed for: {list(unmatched_res)}"
if not allow_bad_res:
err += msg + eol
else:
msg += " Ignored due to allow_bad_res."
logger.warning(msg)
if err:
err += "These residues can be ignored with option allow_bad_res." + eol
if res_needed_altloc:
msg = f"- Residues with alternate location: {res_needed_altloc}" + eol
msg += "Either specify an altloc for each with option wanted_altloc" + eol
msg += "or a general default altloc with option default_altloc."
err += msg
if res_missed_altloc:
msg = f"- Requested altlocs not found for: {res_missed_altloc}." + eol
err += msg
if err:
recs = "1. (for batch processing) Use -a/--allow_bad_res to automatically remove residues" + eol
recs += "that do not match templates, and --default_altloc to set" + eol
recs += "a default altloc variant. Use these at your own risk." + eol
recs += "" + eol
recs += "2. (processing individual structure) Inspecting and fixing the input structure is recommended." + eol
recs += "Use --wanted_altloc to set variants for specific residues."
raise PolymerCreationError(err, recs)
return
[docs]
class ResidueChemTemplates(BaseJSONParsable):
"""
Holds template data required to initialize Polymer
Attributes
----------
residue_templates: dict[str, ResidueTemplate]
keys are the ID of an instance of ResidueTemplate
padders : dict
instances of ResiduePadder keyed by a link_label (a string)
link_labels establish the relationship between ResidueTemplates
and ResiduePadders, determining which padder is to be used to
pad each atom of an instance of Monomer that needs padding.
ambiguous : dict[str, list[str]]
mapping between input residue names (e.g. the three-letter residue
name from PDB files) and IDs (strings) of ResidueTemplates
"""
def __init__(self, residue_templates, padders, ambiguous):
"""
Initialize ResidueChemTemplates.
Parameters
----------
residue_templates : dict[str, ResidueTemplate]
A dictionary of residue templates.
Keys are the IDs of the templates, and values are instances of ResidueTemplate.
padders : dict[str, ResiduePadder]
A dictionary of residue padders.
Keys are the IDs of the padders, and values are instances of ResiduePadder.
ambiguous : dict[str, list[str]]
Mapping between input residue names (e.g. the three-letter residue
name from PDB files) and IDs (strings) of ResidueTemplates.
"""
self._check_missing_padders(residue_templates, padders)
self._check_ambiguous_reskeys(residue_templates, ambiguous)
self.residue_templates = residue_templates
self.padders = padders
self.ambiguous = ambiguous
# region JSON-interchange functions
[docs]
@classmethod
def json_encoder(cls, obj: "ResidueChemTemplates") -> Optional[dict[str, Any]]:
output_dict = {
"residue_templates": {
k: ResidueTemplate.json_encoder(v)
for k, v in obj.residue_templates.items()
},
"ambiguous": obj.ambiguous,
"padders": {
k: ResiduePadder.json_encoder(v)
for k, v in obj.padders.items()
},
}
return output_dict
# Keys to check for deserialized JSON
expected_json_keys = {
"residue_templates",
"ambiguous",
"padders",
}
@classmethod
def _decode_object(cls, obj: dict[str, Any]):
# Extracting the constructor args from the json representation and creating a ResidueChemTemplates instance
templates = {
k: ResidueTemplate.from_dict(v) for k, v in obj["residue_templates"].items()
}
padders = {k: ResiduePadder.from_dict(v) for k, v in obj["padders"].items()}
residue_chem_templates = cls(templates, padders, obj["ambiguous"])
return residue_chem_templates
# endregion
[docs]
def add_dict(self, data, overwrite=False):
"""
Add or update data from a dictionary to the current instance.
Parameters
----------
data : dict
A dictionary containing new data to be added.
The dictionary may contain keys within "residue_templates", "padders", and "ambiguous".
overwrite : bool, optional
If True, existing data will be overwritten with new data.
If False, new data will be added without overwriting existing data.
Default is False.
"""
bad_keys = set(data) - {"ambiguous", "residue_templates", "padders"}
if bad_keys:
logging.warning(f"Ignore unexpected keys: {bad_keys}")
new_ambiguous = data.get("ambiguous", {})
if overwrite:
self.ambiguous.update(new_ambiguous)
else:
new_ambiguous = {k: v.copy() for k, v in new_ambiguous.items()}
new_ambiguous.update(self.ambiguous)
self.ambiguous = new_ambiguous
for key, value in data.get("residue_templates", {}).items():
if overwrite or key not in self.residue_templates:
res_template = ResidueTemplate.from_dict(value)
self.residue_templates[key] = res_template
for link_label, value in data.get("padders", {}).items():
if overwrite or key not in self.padders:
padder = ResiduePadder.from_dict(data)
self.padders[link_label] = padder
return
[docs]
@staticmethod
def lookup_filename(filename, data_path):
"""
Look for a file in the current directory or in the data_path directory.
If the file is not found in either location, raise a ValueError.
If the file is found in the data_path directory, return the full path.
If the file is found in the current directory, return the filename.
Parameters
----------
filename : str
The name of the file to look for.
data_path : pathlib.Path
The path to the directory where the file may be located.
Returns
-------
str
The full path to the file if found, otherwise raises ValueError.
Raises
------
ValueError
If the file is not found in either the current directory or the data_path directory.
"""
p = pathlib.Path(filename)
if not p.exists():
if (data_path / p).exists():
filename = str(data_path / p)
elif (data_path / (p.name + ".json")).exists():
filename = str(data_path / (p.name + ".json"))
else:
raise ValueError(f"can't find {filename} in current dir or {data_path}")
return filename
[docs]
@classmethod
def from_json_file(cls, filename):
"""
Create an instance of ResidueChemTemplates from a JSON file.
Parameters
----------
filename : str
The name of the JSON file to read.
The file may contain residue templates, padders, and ambiguous residue names.
Returns
-------
ResidueChemTemplates
An instance of ResidueChemTemplates created from the JSON file.
"""
filename = cls.lookup_filename(filename, data_path)
with open(filename) as f:
jsonstr = f.read()
alldata = json.loads(jsonstr)
ambiguous = {k: v.copy() for k,v in alldata.get("ambiguous", {}).items()}
residue_templates = {}
padders = {}
for key, data in alldata.get("residue_templates", {}).items():
res_template = ResidueTemplate.from_dict(data)
residue_templates[key] = res_template
for link_label, data in alldata.get("padders", {}).items():
padders[link_label] = ResiduePadder.from_dict(data)
return cls(residue_templates, padders, ambiguous)
[docs]
@classmethod
def create_from_defaults(cls):
"""
Create an instance of ResidueChemTemplates using default data (residue_chem_templates.json).
Returns
-------
ResidueChemTemplates
An instance of ResidueChemTemplates created from the default JSON file.
"""
return cls.from_json_file("residue_chem_templates")
[docs]
def add_json_file(self, filename):
"""
Add data from a JSON file to the current instance.
Parameters
----------
filename : str
The name of the JSON file to read.
The file may contain residue templates, padders, and ambiguous residue names.
Returns
-------
None
"""
filename = self.lookup_filename(filename, data_path)
with open(filename) as f:
jsonstr = f.read()
data = json.loads(jsonstr)
self.add_dict(data)
return
@staticmethod
def _check_missing_padders(residue_templates, padders):
# can't guarantee full coverage because the topology that is passed
# to the Polymer may contain bonds between residues that are not
# anticipated to be bonded, for example, protein N-term bonded to
# nucleic acid 5 prime.
# collect labels from residues
link_labels_in_residues = set()
for reskey, res_template in residue_templates.items():
for _, link_label in res_template.link_labels.items():
link_labels_in_residues.add(link_label)
# and check we have padders for all of them
link_labels_in_padders = set([label for label in padders])
# for link_label in padders:
# for (link_labels) in padder.link_labels:
# print(link_key, link_labels)
# for (label, _) in link_labels: # link_labels is a list of pairs
# link_labels_in_padders.add(label)
missing = link_labels_in_residues.difference(link_labels_in_padders)
if missing:
raise RuntimeError(f"missing padders for {missing}")
return
@staticmethod
def _check_ambiguous_reskeys(residue_templates, ambiguous):
missing = {}
for input_resname, reskeys in ambiguous.items():
for reskey in reskeys:
if reskey not in residue_templates:
missing.setdefault(input_resname, set())
missing[input_resname].add(reskey)
if len(missing):
raise ValueError(f"missing residue templates for ambiguous: {missing}")
return
[docs]
class Polymer(BaseJSONParsable):
"""
Represents polymer with its subunits as individual RDKit molecules.
Used for proteins and nucleic acids. The key class is Monomer,
which contains, a padded RDKit molecule containing part of the adjacent
residues to enable chemically meaningful parameterizaion.
Instances of ResidueTemplate make sure that the input, which may originate
from a PDB string, matches the RDKit molecule of the template, even if
hydrogens are missing.
Attributes
----------
monomers : dict[str, Monomer]
Dictionary to store monomers where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and values are instances of Monomer.
log : dict[str, list[str]]
Dictionary to store log messages during monomer processing (_get_monomer).
residue_chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class used to initialize this Polymer.
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary of raw input mols used to initialize this Polymer, where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and values are tuples of an RDKit Mols and input resname.
"""
def __init__(
self,
raw_input_mols: dict[str, tuple[Chem.Mol, str]],
bonds: dict[tuple[str, str], tuple[int, int]],
residue_chem_templates: ResidueChemTemplates,
mk_prep=None,
set_template: dict[str, str] = None,
blunt_ends: list[tuple[str, int]] = None,
get_atomprop_from_raw: dict = None,
):
"""
Initialize Polymer.
Parameters
----------
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary of raw input mols where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and
values are tuples of an RDKit Mols and input resname.
RDKit Mols will be matched to instances of ResidueTemplate, and may contain none, all, or some of the
Hydrogens.
bonds : dict[tuple[str, str], tuple[int, int]]
A dictionary of inter-residue bonds where keys are tuples of residue IDs in the format <chain>:<resnum> such as "A:42" and
values are tuples of atom indices.
residue_chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class.
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to parameterize the padded molecules.
set_template : dict[str, str]
A dict mapping residue IDs in the format <chain>:<resnum> such as "A:42" to ResidueTemplate instances.
blunt_ends : list[tuple[str, int]]
A list of tuples where each tuple is residue IDs and 0-based atom index, e.g.; ("A:42", 0)
get_atomprop_from_raw : dict[str, any]
A dictionary mapping atom properties to pass from raw_input_mols. The keys are the property names and the values are the default values.
Returns
-------
None
Raises
------
ValueError
If raw_input_mols is not a dictionary or if any of the residues in set_template are not found in raw_input_mols.
PolymerCreationError
If there are residues that could not be parsed or matched to templates, and allow_bad_res is False.
"""
# TODO simplify SMARTS for adjacent res in padders
if type(raw_input_mols) != dict:
msg = f"expected raw_input_mols to be dict, got {type(raw_input_mols)}"
if type(raw_input_mols) == str:
msg += eol
msg += (
"consider Polymer.from_pdb_string(pdbstr)" + eol
)
raise ValueError(msg)
self.residue_chem_templates = residue_chem_templates
residue_templates = residue_chem_templates.residue_templates
padders = residue_chem_templates.padders
ambiguous = residue_chem_templates.ambiguous
if set_template is None:
set_template = {}
else: # make sure all resiude_id in set_template exist
missing = set(
[
residue_id
for residue_id in set_template
if residue_id not in raw_input_mols
]
)
if len(missing):
raise ValueError(
f"Residue IDs in set_template not found: {missing} {raw_input_mols.keys()}"
)
# check if input assigned residue name in residue_templates
err = ""
supported_resnames = residue_templates.keys() | ambiguous.keys()
unknown_res_from_input = {res_id: raw_input_mols[res_id][1]
for res_id in raw_input_mols
if res_id not in set_template and raw_input_mols[res_id][1] not in supported_resnames
}
if unknown_res_from_input:
unknown_valid_res_from_input = {k: v for k, v in unknown_res_from_input.items() if v != "UNL"}
if unknown_valid_res_from_input:
err += f"Input residues {unknown_valid_res_from_input} not in residue_templates" + eol
UNL_from_input = {k: v for k, v in unknown_res_from_input.items() if v == "UNL"}
if UNL_from_input:
err += f"Input residues {UNL_from_input} do not have a concrete definition" + eol
unknown_res_from_assign = {}
if set_template:
unknown_res_from_assign = {res_id: resn for res_id, resn in set_template.items() if resn not in supported_resnames}
unknown_valid_res_from_assign = {k: v for k, v in unknown_res_from_assign.items() if v != "UNL"}
if unknown_valid_res_from_assign:
err += f"Input residues {unknown_valid_res_from_assign} not in residue_templates" + eol
UNL_from_assign = {k: v for k, v in unknown_res_from_assign.items() if v == "UNL"}
if UNL_from_assign:
err += f"Input residues {UNL_from_assign} do not have a concrete definition" + eol
if err:
if "UNL" in err:
err += "Resdiues that are named UNL can't be parameterized. " + eol
rec = "1. (to parameterize the residues) Use --set_template to specify valid residue names, " + eol
rec += "2. (to skip the residues) Use --delete_residues to ignore them. Residues will be deleted from the prepared receptor. "
raise PolymerCreationError(err, rec)
warnings.warn(err, RuntimeWarning)
warnings.warn("Trying to resolve unknown residues by building chemical templates... ", RuntimeWarning)
all_unknown_res = unknown_res_from_input.copy()
all_unknown_res.update(unknown_res_from_assign)
bonded_unknown_res = {res_id: all_unknown_res[res_id] for res_id in all_unknown_res
if any(res_id in respair for respair in bonds)}
unbound_unknown_res = all_unknown_res.copy()
for key in bonded_unknown_res:
unbound_unknown_res.pop(key, None)
if unbound_unknown_res:
for resname in set(unbound_unknown_res.values()):
try:
cc = build_noncovalent_CC(resname)
fetch_template_dict = json.loads(export_chem_templates_to_json([cc]))['residue_templates'][cc.resname]
residue_templates.update({resname: ResidueTemplate(
smiles = fetch_template_dict['smiles'],
atom_names = fetch_template_dict['atom_name'],
link_labels = fetch_template_dict['link_labels'])})
ambiguous[resname] = [cc.resname]
except Exception as e:
logger.warning(f"Failed building template from CCD for {resname=}")
raise PolymerCreationError(str(e))
if bonded_unknown_res:
failed_build = set()
try:
for resname in set(bonded_unknown_res.values()):
cc_list = build_linked_CCs(resname)
if not cc_list:
failed_build.add(resname)
else:
for cc in cc_list:
fetch_template_dict = json.loads(export_chem_templates_to_json([cc]))['residue_templates'][cc.resname]
residue_templates.update({cc.resname: ResidueTemplate(
smiles = fetch_template_dict['smiles'],
atom_names = fetch_template_dict['atom_name'],
link_labels = convert_to_int_keyed_dict(fetch_template_dict['link_labels']))})
if resname in ambiguous:
ambiguous[resname].append(cc.resname)
else:
ambiguous[resname] = [cc.resname]
except Exception as e:
raise PolymerCreationError(str(e))
if failed_build:
raise PolymerCreationError(f"Template generation failed for unknown residues: {failed_build}, which appear to be linking fragments. " + eol
+ "Generation of chemical templates with modified backbones, which involves guessing of linker positions and types, are not currently supported. ",
"1. (to parameterize the residues) Use --add_templates to pass the additional templates with valid linker_labels, " + eol
+ "2. (to skip the residues) Use --delete_residues to ignore them. Residues will be deleted from the prepared receptor. ")
self.monomers, self.log = self._get_monomers(
raw_input_mols,
ambiguous,
residue_chem_templates,
set_template,
bonds,
blunt_ends,
)
_bonds = {}
for key, bond_list in bonds.items():
monomer1 = self.monomers[key[0]]
monomer2 = self.monomers[key[1]]
if monomer1.rdkit_mol is None or monomer2.rdkit_mol is None:
continue
invmap1 = {j: i for i, j in monomer1.mapidx_to_raw.items()}
invmap2 = {j: i for i, j in monomer2.mapidx_to_raw.items()}
_bonds[key] = [(invmap1[b[0]], invmap2[b[1]]) for b in bond_list]
bonds = _bonds
# padding may seem overkill but we had to run a reaction anyway for h_coord_from_dipep
padded_mols = self._build_padded_mols(self.monomers, bonds, padders)
for residue_id, (padded_mol, mapidx_from_pad) in padded_mols.items():
monomer = self.monomers[residue_id]
monomer.padded_mol = padded_mol
monomer.molsetup_mapidx = mapidx_from_pad
if mk_prep is not None:
self.parameterize(mk_prep, get_atomprop_from_raw = get_atomprop_from_raw)
return
# region JSON-interchange functions
[docs]
@classmethod
def json_encoder(cls, obj: "Polymer") -> Optional[dict[str, Any]]:
output_dict = {
"residue_chem_templates": ResidueChemTemplates.json_encoder(
obj.residue_chem_templates
),
"monomers": {
k: Monomer.json_encoder(v)
for k, v in obj.monomers.items()
},
"log": obj.log,
}
return output_dict
# Keys to check for deserialized JSON
expected_json_keys = {
"residue_chem_templates",
"monomers",
"log",
}
@classmethod
def _decode_object(cls, obj: dict[str, Any]):
# Deserializes ResidueChemTemplates from the dict to use as an input, then constructs a Polymer object
# and sets its values using deserialized JSON values.
residue_chem_templates = ResidueChemTemplates.from_dict(
obj["residue_chem_templates"]
)
polymer = cls({}, {}, residue_chem_templates)
polymer.monomers = {
k: Monomer.from_dict(v) for k, v in obj["monomers"].items()
}
polymer.log = obj["log"]
return polymer
# endregion
[docs]
def stitch(self, residues_to_add: Optional[set[str]] = None,
bonds_to_use: Optional[dict[tuple[str], list[tuple[int]]]] = None):
"""
Function to stitch together monomers into a single molecule.
Parameters
----------
residues_to_add : set[str], optional
A set of residue IDs to add to the stitched molecule.
If None, all valid monomers will be added. Default is None (all valid monomers).
bonds_to_use : dict[tuple[str], list[tuple[int]]], optional
A dictionary of bonds to use for stitching.
The keys are tuples of residue IDs, and the values are lists of tuples of atom indices (in rdkit_mol).
If None, all available bonds in the polymer will be used. Default is None (all available bonds).
Returns
-------
Chem.Mol
An RDKit molecule that results from adding bonds between the specified residues.
It may contain multiple fragments if there are multiple chains or gaps.
"""
# stitching all valid monomers by default
valid_monomers = set(self.get_valid_monomers().keys())
residues_to_add = residues_to_add or valid_monomers
residues_to_add = set(residues_to_add)
# verify if requested monomers are valid (have rdkit_mol)
invalid_monomers = residues_to_add - valid_monomers
if invalid_monomers:
raise ValueError(f"Residue IDs not in valid monomers: {invalid_monomers}")
if bonds_to_use is None:
bonds_to_use = {}
resid_to_rawmols = {res_id: (self.monomers[res_id].raw_rdkit_mol, self.monomers[res_id].input_resname) for res_id in residues_to_add}
bonds_indexed_in_raw = find_inter_mols_bonds(resid_to_rawmols)
invmaps = {
res_id: {j: i for i, j in self.monomers[res_id].mapidx_to_raw.items()}
for res_id in residues_to_add
}
for (res1, res2), bond_list in bonds_indexed_in_raw.items():
invmap1, invmap2 = invmaps[res1], invmaps[res2]
bonds_to_use[(res1, res2)] = [(invmap1[b[0]], invmap2[b[1]]) for b in bond_list]
# initialize mol and residue/bond tracking
mol = Chem.Mol()
residues_added = {}
bonds_spent = set()
# add residues and get offset in order
offset = 0
for r_id in residues_to_add:
res = self.monomers[r_id]
residues_added[r_id] = offset
offset += res.rdkit_mol.GetNumAtoms()
mol = Chem.CombineMols(mol, res.rdkit_mol)
# add bonds
edit_mol = Chem.EditableMol(mol)
for bond_key, bond_list in bonds_to_use.items():
if bond_key in bonds_spent:
continue
r1, r2 = bond_key
if r1 in residues_added and r2 in residues_added:
bonds_spent.add(bond_key)
for bond in bond_list:
i, j = bond
edit_mol.AddBond(
i + residues_added[r1],
j + residues_added[r2],
order=Chem.rdchem.BondType.SINGLE
)
mol = edit_mol.GetMol()
# review added bonds and residues
if len(bonds_spent) != len(bonds_to_use):
raise RuntimeError("nr of bonds added differs from bonds to use")
if len(residues_added) != len(residues_to_add):
raise RuntimeError("nr of residues added differs from residues to add")
return mol
[docs]
@classmethod
def from_pdb_string(
cls,
pdb_string,
chem_templates,
mk_prep,
set_template=None,
residues_to_delete=None,
allow_bad_res=False,
bonds_to_delete=None,
blunt_ends=None,
wanted_altloc=None,
default_altloc=None
):
"""
Construct a Polymer object from a PDB string.
Parameters
----------
pdb_string : str
The PDB string containing the polymer structure.
chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class to construct the polymer.
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to construct the polymer.
set_template : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names.
If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used).
residues_to_delete : set[str], optional
A set of residue IDs to delete from the polymer.
If None, no residues will be deleted. Default is None (no residues will be deleted).
allow_bad_res : bool, optional
If True, allows residues that do not match templates to be ignored (rdkit_mol will be None).
If False, raises an error if any residues do not match templates. Default is False.
bonds_to_delete : list[tuple[str, str]], optional
A list of tuples of residue IDs to delete bonds between.
If None, no bonds will be deleted. Default is None (no bonds will be deleted).
blunt_ends : list[tuple[str, int]], optional
A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol).
If None, no blunt ends will be added. Default is None (no blunt ends will be added).
wanted_altloc : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue.
default_altloc : str, optional
A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified.
Returns
-------
Polymer
An instance of the Polymer class constructed from the PDB string.
Raises
------
NotImplementedError
If bonds_to_delete includes residue pairs with more than bond2 between them.
PolymerCreationError
If there are residues that could not be parsed or matched to templates, and allow_bad_res is False.
"""
tmp_raw_input_mols = cls._pdb_to_residue_mols(
pdb_string,
wanted_altloc,
default_altloc,
)
# from here on it duplicates self.from_prody(), but extracting
# this out into a function felt like it sacrificed readibility
# so I decided to keep the duplication.
_delete_residues(residues_to_delete, tmp_raw_input_mols)
raw_input_mols = {}
res_needed_altloc = []
res_missed_altloc = []
unparsed_res = []
for res_id, stuff in tmp_raw_input_mols.items():
mol, resname, missed_altloc, needed_altloc = stuff
if mol is None and missed_altloc:
res_missed_altloc.append(res_id)
elif mol is None and needed_altloc:
res_needed_altloc.append(res_id)
elif mol is None:
unparsed_res.append(res_id)
else:
raw_input_mols[res_id] = (mol, resname)
bonds = find_inter_mols_bonds(raw_input_mols)
if bonds_to_delete is not None:
for res1, res2 in bonds_to_delete:
popped = ()
if (res1, res2) in bonds:
popped = bonds.pop((res1, res2))
elif (res2, res1) in bonds:
popped = bonds.pop((res2, res1))
if len(popped) >= 2:
msg = (
"can't delete bonds for residue pairs that have more"
" than one bond between them"
)
raise NotImplementedError(msg)
polymer = cls(
raw_input_mols,
bonds,
chem_templates,
mk_prep,
set_template,
blunt_ends,
)
unmatched_res = polymer.get_ignored_monomers()
handle_parsing_situations(
unmatched_res,
unparsed_res,
allow_bad_res,
res_missed_altloc,
res_needed_altloc,
)
return polymer
# region adapted from from_pdb_string
[docs]
@classmethod
def from_pqr_string(
cls,
pqr_string,
chem_templates,
mk_prep,
set_template=None,
residues_to_delete=None,
allow_bad_res=False,
bonds_to_delete=None,
blunt_ends=None,
):
"""
Construct a Polymer object from a PQR string. Adapted from PDB2PQR.
Parameters
----------
pqr_string : str
The PQR string containing the polymer structure.
chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class to construct the polymer.
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to construct the polymer.
set_template : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names.
If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used).
residues_to_delete : set[str], optional
A set of residue IDs to delete from the polymer.
If None, no residues will be deleted. Default is None (no residues will be deleted).
allow_bad_res : bool, optional
If True, allows residues that do not match templates to be ignored (rdkit_mol will be None).
If False, raises an error if any residues do not match templates. Default is False.
bonds_to_delete : list[tuple[str, str]], optional
A list of tuples of residue IDs to delete bonds between.
If None, no bonds will be deleted. Default is None (no bonds will be deleted).
blunt_ends : list[tuple[str, int]], optional
A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol).
If None, no blunt ends will be added. Default is None (no blunt ends will be added).
Returns
-------
Polymer
An instance of the Polymer class constructed from the PDB string.
Raises
------
NotImplementedError
If bonds_to_delete includes residue pairs with more than bond2 between them.
PolymerCreationError
If there are residues that could not be parsed or matched to templates, and allow_bad_res is False.
"""
tmp_raw_input_mols = cls._pqr_to_residue_mols(
pqr_string,
)
# from here on it duplicates self.from_prody(), but extracting
# this out into a function felt like it sacrificed readibility
# so I decided to keep the duplication.
_delete_residues(residues_to_delete, tmp_raw_input_mols)
raw_input_mols = {}
res_needed_altloc = []
res_missed_altloc = []
unparsed_res = []
for res_id, stuff in tmp_raw_input_mols.items():
mol, resname, missed_altloc, needed_altloc = stuff
if mol is None and missed_altloc:
res_missed_altloc.append(res_id)
elif mol is None and needed_altloc:
res_needed_altloc.append(res_id)
elif mol is None:
unparsed_res.append(res_id)
else:
raw_input_mols[res_id] = (mol, resname)
bonds = find_inter_mols_bonds(raw_input_mols)
if bonds_to_delete is not None:
for res1, res2 in bonds_to_delete:
popped = ()
if (res1, res2) in bonds:
popped = bonds.pop((res1, res2))
elif (res2, res1) in bonds:
popped = bonds.pop((res2, res1))
if len(popped) >= 2:
msg = (
"can't delete bonds for residue pairs that have more"
" than one bond between them"
)
raise NotImplementedError(msg)
polymer = cls(
raw_input_mols,
bonds,
chem_templates,
mk_prep,
set_template,
blunt_ends,
get_atomprop_from_raw = {"PQRCharge": 0.},
)
if polymer.log["matched_with_H_anomaly"]:
msg = ""
for res_id, (template_name, h_info) in polymer.log["matched_with_H_anomaly"].items():
h_miss = h_info.get('H_miss', 0)
h_excess = h_info.get('H_excess', 0)
msg += f"Residue {res_id} matched with template '{template_name}' has H discrepancy: {h_miss} missing, {h_excess} excess. \n"
raise PolymerCreationError(msg + "These discrepancies may compromise the validity of the charge assignment from PQR, making the charges inapplicable to the processed receptor. \n")
unmatched_res = polymer.get_ignored_monomers()
handle_parsing_situations(
unmatched_res,
unparsed_res,
allow_bad_res,
res_missed_altloc,
res_needed_altloc,
)
return polymer
# endregion
[docs]
@classmethod
def from_prody(
cls,
prody_obj: Union[Selection, AtomGroup],
chem_templates,
mk_prep,
set_template=None,
residues_to_delete=None,
allow_bad_res=False,
bonds_to_delete=None,
blunt_ends=None,
wanted_altloc: Optional[dict]=None,
default_altloc: Optional[str]=None,
):
"""
Construct a Polymer object from a ProDy Selection or AtomGroup object.
Parameters
----------
prody_obj : ProDy.Selection or ProDy.AtomGroup
The ProDy object to construct the polymer from.
chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class to construct the polymer.
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to construct the polymer.
set_template : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names.
If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used).
residues_to_delete : set[str], optional
A set of residue IDs to delete from the polymer.
If None, no residues will be deleted. Default is None (no residues will be deleted).
allow_bad_res : bool, optional
If True, allows residues that do not match templates to be ignored (rdkit_mol will be None).
If False, raises an error if any residues do not match templates. Default is False.
bonds_to_delete : list[tuple[str, str]], optional
A list of tuples of residue IDs to delete bonds between.
If None, no bonds will be deleted. Default is None (no bonds will be deleted).
blunt_ends : list[tuple[str, int]], optional
A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol).
If None, no blunt ends will be added. Default is None (no blunt ends will be added).
wanted_altloc : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue.
default_altloc : str, optional
A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified.
Returns
-------
Polymer
An instance of the Polymer class constructed from the PDB string.
Raises
------
NotImplementedError
If bonds_to_delete includes residue pairs with more than bond2 between them.
PolymerCreationError
If there are residues that could not be parsed or matched to templates, and allow_bad_res is False.
"""
tmp_raw_input_mols = cls._prody_to_residue_mols(
prody_obj,
wanted_altloc,
default_altloc,
)
# from here on it duplicates self.from_pdb_string(), but extracting
# this out into a function felt like it sacrificed readibility
# so I decided to keep the duplication.
_delete_residues(residues_to_delete, tmp_raw_input_mols)
raw_input_mols = {}
res_needed_altloc = []
res_missed_altloc = []
unparsed_res = []
for res_id, stuff in tmp_raw_input_mols.items():
mol, resname, missed_altloc, needed_altloc = stuff
if mol is None and missed_altloc:
res_missed_altloc.append(res_id)
elif mol is None and needed_altloc:
res_needed_altloc.append(res_id)
elif mol is None:
unparsed_res.append(res_id)
else:
raw_input_mols[res_id] = (mol, resname)
bonds = find_inter_mols_bonds(raw_input_mols)
if bonds_to_delete is not None:
for res1, res2 in bonds_to_delete:
popped = ()
if (res1, res2) in bonds:
popped = bonds.pop((res1, res2))
elif (res2, res1) in bonds:
popped = bonds.pop((res2, res1))
if len(popped) >= 2:
msg = (
"can't delete bonds for residue pairs that have more"
" than one bond between them"
)
raise NotImplementedError(msg)
polymer = cls(
raw_input_mols,
bonds,
chem_templates,
mk_prep,
set_template,
blunt_ends,
)
unmatched_res = polymer.get_ignored_monomers()
handle_parsing_situations(
unmatched_res,
unparsed_res,
allow_bad_res,
res_missed_altloc,
res_needed_altloc,
)
return polymer
[docs]
def parameterize(self, mk_prep, get_atomprop_from_raw = None):
"""
Parameterize the monomers in the polymer using the provided MoleculePreparation instance.
Parameters
----------
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to parameterize the monomers.
get_atomprop_from_raw : dict, optional
A dictionary mapping atom properties to pass from raw_input_mols. The keys are the property names and the values are the default values.
Returns
-------
None
"""
for residue_id in self.get_valid_monomers():
self.monomers[residue_id].parameterize(mk_prep, residue_id, get_atomprop_from_raw = get_atomprop_from_raw)
@staticmethod
def _build_rdkit_mol(raw_mol, template, mapping, nr_missing_H):
"""
Build an RDKit molecule from a raw molecule according to a ResidueTemplate.
Parameters
----------
raw_mol : Chem.Mol
The raw molecule matched to the template.
template : ResidueTemplate
The template to match the raw molecule to.
mapping : dict[int, int]
A dictionary mapping atom indices from the raw molecule to the template.
nr_missing_H : int
The number of missing hydrogen atoms in the raw molecule compared to the template.
Returns
-------
Chem.Mol
The RDKit molecule built from the raw molecule and template.
"""
rdkit_mol = Chem.Mol(template.mol) # making a copy
conf = Chem.Conformer(rdkit_mol.GetNumAtoms())
input_conf = raw_mol.GetConformer()
for i, j in mapping.items():
conf.SetAtomPosition(i, input_conf.GetAtomPosition(j))
rdkit_mol.AddConformer(conf, assignId=True)
if nr_missing_H: # add positions to Hs missing in raw_mol
if rdkit_mol.GetNumAtoms() != len(mapping) + nr_missing_H:
raise RuntimeError(
f"nr of atoms ({rdkit_mol.GetNumAtoms()}) != "
f"{len(mapping)=} + {nr_missing_H=}"
)
idxs = [i for i in range(rdkit_mol.GetNumAtoms()) if i not in mapping]
update_H_positions(rdkit_mol, idxs)
return rdkit_mol
@staticmethod
def _get_best_missing_Hs(results):
"""
Get the best results based on the number of missing H atoms.
Parameters
----------
results : list[dict]
A list of dictionaries containing the results of matching raw molecules to templates.
Returns
-------
best_idxs : list[int]
A list of indices of the best results based on the number of missing H atoms.
fail_log : list[list[str]]
A list of lists containing failure messages for each result.
"""
min_missing_H = 999999
best_idxs = []
fail_log = []
for i, result in enumerate(results):
fail_log.append([])
if result["heavy"]["missing"] > 0:
fail_log[-1].append("heavy missing")
if result["heavy"]["excess"] > 0:
fail_log[-1].append("heavy excess")
if len(result["H"]["excess"]) > 0:
fail_log[-1].append("H excess")
if len(result["bonds"]["excess"]) > 0:
fail_log[-1].append("bonds excess")
if len(result["bonds"]["missing"]) > 0:
fail_log[-1].append(f"bonds missing at {result['bonds']['missing']}")
if len(fail_log[-1]):
continue
if result["H"]["missing"] < min_missing_H:
best_idxs = []
min_missing_H = result["H"]["missing"]
if result["H"]["missing"] == min_missing_H:
best_idxs.append(i)
return best_idxs, fail_log
@classmethod
def _get_monomers(
cls,
raw_input_mols,
ambiguous,
residue_chem_templates,
set_template,
bonds,
blunt_ends,
):
"""
Process, parameterize and populate monomers of a Polymer from raw_input_mols and residue_chem_templates.
Parameters
----------
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names.
ambiguous : dict[str, list[str]]
Mapping between input residue names (e.g. the three-letter residue
name from PDB files) and IDs (strings) of ResidueTemplates.
residue_chem_templates : ResidueChemTemplates
An instance of the ResidueChemTemplates class to construct the polymer.
set_template : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names.
bonds : dict[tuple[str], list[tuple[int]]]
A dictionary of bonds between residues.
blunt_ends : list[tuple[str, int]], optional
A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol).
If None, no blunt ends will be added. Default is None (no blunt ends will be added).
Returns
-------
monomers : dict[str, Monomer]
A dictionary mapping residue IDs to Monomer objects.
log : dict
A dictionary containing logs of the matching process, including information about chosen templates, missing atoms, and unmatched residues.
"""
residue_templates = residue_chem_templates.residue_templates
monomers = {}
log = {
"chosen_by_fewest_missing_H": {},
"chosen_by_default": {},
"matched_with_H_anomaly": {},
"matched_with_excess_bond": [],
"no_match": [],
"no_mol": [],
"msg": "",
}
for residue_key, (raw_mol, input_resname) in raw_input_mols.items():
if raw_mol is None:
monomers[residue_key] = Monomer(
None, None, None, input_resname, None
)
log["no_mol"].append(residue_key)
logger.warning(f"molecule for {residue_key=} is None")
continue
raw_mol_has_H = sum([a.GetAtomicNum() == 1 for a in raw_mol.GetAtoms()]) > 0
excess_H_ok = False
if set_template is not None and residue_key in set_template:
excess_H_ok = True # e.g. allow set LYN (NH2) from LYS (NH3+)
template_key = set_template[residue_key] # e.g. HID, NALA
if template_key not in residue_templates:
if template_key in ambiguous:
raise RuntimeError(f"Can't assign an ambiguous tamplate_key ({template_key}) to residue ({residue_key}). ")
raise RuntimeError(f"Assigned tamplate_key ({template_key}) for residue ({residue_key}) is not in residue_templates. ")
template = residue_templates[template_key]
candidate_template_keys = [set_template[residue_key]]
candidate_templates = [template]
elif input_resname not in ambiguous:
template_key = input_resname
template = residue_templates[template_key]
candidate_template_keys = [template_key]
candidate_templates = [template]
elif len(ambiguous[input_resname]) == 1:
template_key = ambiguous[input_resname][0]
template = residue_templates[template_key]
candidate_template_keys = [template_key]
candidate_templates = [template]
else:
candidate_template_keys = []
candidate_templates = []
for key in ambiguous[input_resname]:
template = residue_templates[key]
candidate_templates.append(template)
candidate_template_keys.append(key)
# gather raw_mol atoms that have bonds or blunt ends
if blunt_ends is None:
blunt_ends = []
raw_atoms_with_bonds = []
for (r1, r2), bond_list in bonds.items():
for i, j in bond_list:
if r1 == residue_key:
raw_atoms_with_bonds.append(i)
if r2 == residue_key:
raw_atoms_with_bonds.append(j)
all_stats = {
"heavy_missing": [],
"heavy_excess": [],
"H_excess": [],
"H_missing": [],
"bonded_atoms_missing": [],
"bonded_atoms_excess": [],
}
mappings = []
for index, template in enumerate(candidate_templates):
# match intra-residue graph
match_stats, mapping = template.match(raw_mol)
mappings.append(mapping)
# match inter-residue bonds
atoms_with_bonds = set()
from_raw = {value: key for (key, value) in mapping.items()}
for raw_index in raw_atoms_with_bonds:
if raw_index in from_raw: # bonds can occur on atoms the template does not have
atom_index = from_raw[raw_index]
atoms_with_bonds.add(atom_index)
# we treat blunt ends like bonds
for res_id, atom_idx in blunt_ends:
if res_id == residue_key:
atoms_with_bonds.add(from_raw[atom_idx])
expected = set(template.link_labels)
bonded_atoms_found = atoms_with_bonds.intersection(expected)
bonded_atoms_missing = expected.difference(atoms_with_bonds)
bonded_atoms_excess = atoms_with_bonds.difference(expected)
all_stats["heavy_missing"].append(match_stats["heavy"]["missing"])
all_stats["heavy_excess"].append(match_stats["heavy"]["excess"])
all_stats["H_excess"].append(match_stats["H"]["excess"])
all_stats["H_missing"].append(match_stats["H"]["missing"])
all_stats["bonded_atoms_missing"].append(bonded_atoms_missing)
all_stats["bonded_atoms_excess"].append(bonded_atoms_excess)
passed = []
embedded_indices = [index for index, template in enumerate(candidate_templates) if len(template.link_labels) >= 2]
# 1st round
for i in embedded_indices:
if (
all_stats["heavy_missing"][i]
or all_stats["heavy_excess"][i]
or all_stats["H_excess"][i]
or all_stats["bonded_atoms_missing"][i]
or len(all_stats["bonded_atoms_excess"][i])
):
continue
passed.append(i)
# 2nd round
if len(passed) == 0:
for i in embedded_indices:
auto_blunt = set()
for j, padder_label in candidate_templates[i].link_labels.items():
if residue_chem_templates.padders[padder_label].auto_blunt:
auto_blunt.add(j)
if (
all_stats["heavy_missing"][i]
or all_stats["heavy_excess"][i]
or (not set(all_stats["H_excess"][i]) <= set(candidate_templates[i].link_labels) and not excess_H_ok)
or not all_stats["bonded_atoms_missing"][i] <= auto_blunt
):
continue
passed.append(i)
# 3rd round
if len(passed) == 0 or any(all_stats["H_excess"][i] for i in passed):
for i in range(len(candidate_templates)):
if (
all_stats["heavy_missing"][i]
or all_stats["heavy_excess"][i]
or (all_stats["H_excess"][i] and not excess_H_ok)
or len(all_stats["bonded_atoms_missing"][i])
):
continue
if i not in passed:
passed.append(i)
if len(passed) == 0:
template_key = None
template = None
mapping = None
m = f"No template matched for {residue_key=}" + eol
m += f"tried {len(candidate_templates)} templates for {residue_key=}"
m += f"{excess_H_ok=}"
m += eol
for i in range(len(all_stats["H_excess"])):
heavy_miss = all_stats["heavy_missing"][i]
heavy_excess = all_stats["heavy_excess"][i]
H_excess = all_stats["H_excess"][i]
bond_miss = all_stats["bonded_atoms_missing"][i]
bond_excess = all_stats["bonded_atoms_excess"][i]
tkey = candidate_template_keys[i]
m += (
f"{tkey:10} {heavy_miss=} {heavy_excess=} {H_excess=} {bond_miss=} {bond_excess=}"
+ eol
)
logger.warning(m)
elif len(passed) == 1 or not raw_mol_has_H:
index = passed[0]
template_key = candidate_template_keys[index]
template = candidate_templates[index]
mapping = mappings[index]
H_miss = all_stats["H_missing"][index]
else:
min_missing_H = 999999
for i, index in enumerate(passed):
H_missed = all_stats["H_missing"][index]
if H_missed < min_missing_H:
best_idxs = []
min_missing_H = H_missed
if H_missed == min_missing_H:
best_idxs.append(index)
if len(best_idxs) > 1:
number_excess_H = [len(all_stats["H_excess"][index]) for index in passed]
min_excess_H = min(number_excess_H)
best_idxs = [index for index in passed if len(all_stats["H_excess"][index]) == min_excess_H]
if len(best_idxs) > 1:
tied = " ".join(candidate_template_keys[i] for i in best_idxs)
m = f"for {residue_key=}, {len(passed)} have passed: "
tkeys = [candidate_template_keys[i] for i in passed]
m += f"{tkeys} and tied for fewest missing and excess H: {tied} "
raise RuntimeError(m)
index = best_idxs[0]
template_key = candidate_template_keys[index]
template = residue_templates[template_key]
mapping = mappings[index]
H_miss = all_stats["H_missing"][index]
log["chosen_by_fewest_missing_H"][residue_key] = template_key
H_miss = all_stats["H_missing"][index]
H_excess = all_stats["H_excess"][index]
if H_miss or H_excess:
log["matched_with_H_anomaly"][residue_key] = (
template_key,
{"H_miss": H_miss, "H_excess": len(H_excess)}
)
bond_excess = all_stats["bonded_atoms_excess"][index]
if bond_excess:
log["matched_with_excess_bond"].append(residue_key)
logger.warning(f"matched with excess inter-residue bond(s): {residue_key}")
if template is None:
rdkit_mol = None
atom_names = None
mapping = None
else:
rdkit_mol = cls._build_rdkit_mol(
raw_mol,
template,
mapping,
H_miss,
)
atom_names = template.atom_names
monomers[residue_key] = Monomer(
raw_mol,
rdkit_mol,
mapping,
input_resname,
template_key,
atom_names,
)
monomers[residue_key].template = template
if template is not None and template.link_labels is not None:
mapping_inv = monomers[
residue_key
].mapidx_from_raw # {j: i for (i, j) in mapping.items()}
# TODO check here mapping_inv unnused
link_labels = {i: label for i, label in template.link_labels.items()}
monomers[residue_key].link_labels = link_labels
return monomers, log
@staticmethod
def _build_padded_mols(monomers, bonds, padders):
"""
Build padded molecules from monomers and bonds.
Parameters
----------
monomers : dict[str, Monomer]
A dictionary mapping residue IDs to Monomer objects.
bonds : dict[tuple[str], list[tuple[int]]]
A dictionary of bonds between residues.
padders : dict[str, function]
A dictionary mapping link labels to functions that pad the molecules.
Returns
-------
padded_mols : dict[str, tuple[Chem.Mol, dict[int, int]]]
A dictionary mapping residue IDs to tuples of padded molecules and their corresponding mapping dictionaries.
"""
padded_mols = {}
bond_use_count = {key: 0 for key in bonds}
for (
residue_id,
monomer,
) in monomers.items():
if monomer.rdkit_mol is None:
continue
padded_mol = monomer.rdkit_mol
mapidx_pad = {
atom.GetIdx(): atom.GetIdx() for atom in padded_mol.GetAtoms()
}
for atom_index, link_label in monomer.link_labels.items():
adjacent_rid = None
adjacent_mol = None
adjacent_atom_index = None
for (r1_id, r2_id), bond_list in bonds.items():
# TODO the second and subsequent bonds between a pair of
# residues will not update the padding atoms with the
# positions of the adjacent residues. This is OK, the same
# happens for blunt residues, because the adjacent residue
# is missing.
i1, i2 = bond_list[0]
if r1_id == residue_id and i1 == atom_index:
adjacent_rid = r2_id
adjacent_atom_index = i2
break
elif r2_id == residue_id and i2 == atom_index:
adjacent_rid = r1_id
adjacent_atom_index = i1
break
if adjacent_rid is not None:
adjacent_mol = monomers[adjacent_rid].rdkit_mol
bond_use_count[(r1_id, r2_id)] += 1
padded_mol, mapidx = padders[link_label](
padded_mol, adjacent_mol, atom_index, adjacent_atom_index
)
tmp = {}
for i, j in enumerate(mapidx):
if j is None:
continue # new padding atom
if j not in mapidx_pad:
continue # padding atom from previous iteration for another link_label
tmp[i] = mapidx_pad[j]
mapidx_pad = tmp
# update position of hydrogens bonded to link atoms
inv = {j: i for (i, j) in mapidx_pad.items()}
padded_idxs_to_update = []
no_pad_idxs_to_update = []
for atom_index in monomer.link_labels:
heavy_atom = monomer.rdkit_mol.GetAtomWithIdx(atom_index)
for neighbor in heavy_atom.GetNeighbors():
if neighbor.GetAtomicNum() != 1:
continue
if neighbor.GetIdx() in monomer.mapidx_to_raw:
# index of H exists in mapidx_to_raw, which means that
# the raw_input_mol had the hydrogen. Thus, we do not
# want to update its coordiantes.
continue
no_pad_idxs_to_update.append(neighbor.GetIdx())
padded_idxs_to_update.append(inv[neighbor.GetIdx()])
update_H_positions(padded_mol, padded_idxs_to_update)
source = padded_mol.GetConformer()
destination = monomer.rdkit_mol.GetConformer()
for i, j in zip(no_pad_idxs_to_update, padded_idxs_to_update):
destination.SetAtomPosition(i, source.GetAtomPosition(j))
# can invert chirality in 3D positions
padded_mols[residue_id] = (padded_mol, mapidx_pad)
# verify that all bonds resulted in padding
err_msg = ""
for key, count in bond_use_count.items():
if count != 2:
err_msg += (
f"expected two paddings for {key} {bonds[key]}, padded {count}"
+ eol
)
if len(err_msg):
raise RuntimeError(err_msg)
return padded_mols
[docs]
def flexibilize_sidechain(self, residue_id, mk_prep):
"""
Set the sidechain of a residue as flexible. The Monomer must have been processed and have valid attributes before calling this method.
Parameters
----------
residue_id : str
The ID of the residue to be made flexible.
mk_prep : MoleculePreparation
An instance of the MoleculePreparation class to construct the polymer.
Returns
-------
None
"""
monomer = self.monomers[residue_id]
inv = {j: i for i, j in monomer.molsetup_mapidx.items()}
link_atoms = [inv[i] for i in monomer.template.link_labels]
if len(link_atoms) == 0:
raise RuntimeError(
"can't define a sidechain without bonds to other residues"
)
# TODO: rewrite this to work better with new MoleculeSetups
graph = {atom.index: atom.graph for atom in monomer.molsetup.atoms}
for i in range(len(link_atoms) - 1):
start_node = link_atoms[i]
end_nodes = [k for (j, k) in enumerate(link_atoms) if j != i]
backbone_paths = find_graph_paths(graph, start_node, end_nodes)
for path in backbone_paths:
for x in range(len(path) - 1):
idx1 = min(path[x], path[x + 1])
idx2 = max(path[x], path[x + 1])
monomer.molsetup.bond_info[(idx1, idx2)].rotatable = False
monomer.is_movable = True
mk_prep.calc_flex(
monomer.molsetup,
root_atom_index=link_atoms[0],
)
molsetup = monomer.molsetup
is_rigid_atom = [False for _ in molsetup.atoms]
graph = molsetup.flexibility_model["rigid_body_graph"]
root_body_idx = molsetup.flexibility_model["root"]
conn = molsetup.flexibility_model["rigid_body_connectivity"]
rigid_index_by_atom = molsetup.flexibility_model["rigid_index_by_atom"]
# from the root, use only the atom that is bonded to the only rotatable bond
for other_body_idx in graph[root_body_idx]:
root_link_atom_idx = conn[(root_body_idx, other_body_idx)][0]
for atom_idx, body_idx in rigid_index_by_atom.items():
if body_idx != root_body_idx or atom_idx == root_link_atom_idx:
monomer.is_flexres_atom[atom_idx] = True
return
@staticmethod
def _add_if_new(to_dict, key, value, repeat_log):
if key in to_dict:
repeat_log.add(key)
else:
to_dict[key] = value
return
@staticmethod
def _pdb_to_residue_mols(
pdb_string,
wanted_altloc: Optional[dict[str, str]]=None,
default_altloc: Optional[str]=None,
):
"""
Convert a PDB string to residue molecules.
Parameters
----------
pdb_string : str
The PDB string to convert.
wanted_altloc : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue.
default_altloc : str, optional
A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified.
Returns
-------
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names.
Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs.
Raises
------
ValueError
If there are interrupted residues or if each residue key does not have exactly one resname.
"""
blocks_by_residue = {}
reskey_to_resname = {}
reskey = None
buffered_reskey = None
buffered_resname = None
# residues in non-consecutive lines due to TER or another res
interrupted_residues = set()
pdb_block = []
for line in pdb_string.splitlines(True):
if line.startswith("TER") and reskey is not None:
Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues)
blocks_by_residue[reskey] = pdb_block
pdb_block = []
reskey = None
buffered_reskey = None
if line.startswith("ATOM") or line.startswith("HETATM"):
atomname = line[12:16].strip()
altloc = line[16:17].strip()
resname = line[17:20].strip()
chainid = line[21:22].strip()
resnum = int(line[22:26].strip())
icode = line[26:27].strip()
x = float(line[30:38])
y = float(line[38:46])
z = float(line[46:54])
element = line[76:78].strip()
reskey = f"{chainid}:{resnum}{icode}" # e.g. ":42", "A:42B"
reskey_to_resname.setdefault(reskey, set())
reskey_to_resname[reskey].add(resname)
atom = AtomField(
atomname, altloc, resname, chainid,
resnum, icode, x, y, z, element,
)
if reskey == buffered_reskey: # this line continues existing residue
pdb_block.append(atom)
else:
if buffered_reskey is not None:
Polymer._add_if_new(
blocks_by_residue,
buffered_reskey,
pdb_block,
interrupted_residues,
)
buffered_reskey = reskey
pdb_block = [atom]
if pdb_block: # there was not a TER line
Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues)
if interrupted_residues:
msg = f"interrupted residues in PDB: {interrupted_residues}"
raise ValueError(msg)
# verify that each identifier (e.g. "A:17" has a single resname
violations = {k: v for k, v in reskey_to_resname.items() if len(v) != 1}
if len(violations):
msg = "each residue key must have exactly 1 resname" + eol
msg += f"but got {violations=}"
raise ValueError(msg)
if wanted_altloc is None:
wanted_altloc = {}
raw_input_mols = {}
for reskey, atom_field_list in blocks_by_residue.items():
requested_altloc = wanted_altloc.get(reskey, None)
pdbmol, _, missed_altloc, needed_altloc = _aux_altloc_mol_build(
atom_field_list,
requested_altloc,
default_altloc,
)
resname = list(reskey_to_resname[reskey])[0] # verified length 1
raw_input_mols[reskey] = (pdbmol, resname, missed_altloc, needed_altloc)
return raw_input_mols
@staticmethod
def _pqr_to_residue_mols(
pqr_string
):
"""
Convert a PQR string to residue molecules.
Parameters
----------
pqr_string : str
The PQR string to convert.
Returns
-------
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names.
Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs.
Raises
------
ValueError
If there are interrupted residues or if each residue key does not have exactly one resname.
"""
blocks_by_residue = {}
blocks_qr = {}
reskey_to_resname = {}
reskey = None
buffered_reskey = None
# residues in non-consecutive lines due to TER or another res
interrupted_residues = set()
pdb_block = []
def get_pqr_atom_items(pqr_line):
"""based on pdb2pqr.structures.Atom.from_pqr_line"""
items = [w.strip() for w in pqr_line.split()]
token = items.pop(0)
if token in [
"REMARK",
"TER",
"END",
"HEADER",
"TITLE",
"COMPND",
"SOURCE",
"KEYWDS",
"EXPDTA",
"AUTHOR",
"REVDAT",
"JRNL",
]:
return None
elif token in ["ATOM", "HETATM"]:
return items
elif token[:4] == "ATOM":
return token[4:] + items
elif token[:6] == "HETATM":
return token[6:] + items
else:
err = f"Unable to parse PQR line: {pqr_line}"
raise ValueError(err)
def atom_from_pqr_items(atom_pqr_items: list[str]) -> tuple[AtomField, float]:
"""get AtomField from items read from a PQR line"""
if not atom_pqr_items:
return None
atom_serial = int(atom_pqr_items.pop(0)) # Meeko doesn't need atom_serial (ID)
atomname = atom_pqr_items.pop(0)
element = next((char for char in atomname if char.isalpha()), None)
if element is None:
err = f"Unable to parse element from PQR atomname: {atomname}"
raise ValueError(err)
element = element.upper()
altloc = "" # PQR doesn't have altloc
resname = atom_pqr_items.pop(0)
token = atom_pqr_items.pop(0)
chainid = "" # Optional in PQR
try:
resnum = int(token) # Must be int in PQR
except ValueError:
chainid = token
resnum = int(atom_pqr_items.pop(0))
token = atom_pqr_items.pop(0)
icode = "" # Optional in PQR
try:
x = float(token)
except ValueError:
icode = token
x = float(atom_pqr_items.pop(0))
y = float(atom_pqr_items.pop(0))
z = float(atom_pqr_items.pop(0))
charge = float(atom_pqr_items.pop(0))
radius = float(atom_pqr_items.pop(0))
return (
AtomField(
atomname, altloc, resname, chainid,
resnum, icode, x, y, z, element,
),
charge, radius,
)
# region adapted from _pdb_to_residue_mols
for line in pqr_string.splitlines(True):
pqr_items = get_pqr_atom_items(line)
if pqr_items is None and reskey is not None:
Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues)
blocks_by_residue[reskey] = pdb_block
blocks_qr[reskey] = block_qr
pdb_block = []
block_qr = []
reskey = None
buffered_reskey = None
if pqr_items:
atom, pqr_charge, pqr_radius = atom_from_pqr_items(pqr_items)
reskey = f"{atom.chain}:{atom.resnum}{atom.icode}"
resname = atom.resname
reskey_to_resname.setdefault(reskey, set())
reskey_to_resname[reskey].add(resname)
if reskey == buffered_reskey: # this line continues existing residue
pdb_block.append(atom)
block_qr.append((pqr_charge, pqr_radius))
else:
if buffered_reskey is not None:
Polymer._add_if_new(
blocks_by_residue,
buffered_reskey,
pdb_block,
interrupted_residues,
)
blocks_qr[buffered_reskey] = block_qr
buffered_reskey = reskey
pdb_block = [atom]
block_qr = [(pqr_charge, pqr_radius)]
if pdb_block: # there was not a TER line
Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues)
blocks_qr[reskey] = block_qr
if interrupted_residues:
msg = f"interrupted residues in PDB: {interrupted_residues}"
raise ValueError(msg)
# verify that each identifier (e.g. "A:17" has a single resname
violations = {k: v for k, v in reskey_to_resname.items() if len(v) != 1}
if len(violations):
msg = "each residue key must have exactly 1 resname" + eol
msg += f"but got {violations=}"
raise ValueError(msg)
# endregion
raw_input_mols = {}
# PQR shouldn't have altlocs
requested_altloc = None
default_altloc = ""
for reskey, atom_field_list in blocks_by_residue.items():
requested_altloc = None
pdbmol, _, missed_altloc, needed_altloc = _aux_altloc_mol_build(
atom_field_list,
requested_altloc,
default_altloc,
)
for atom, pqr_prop in zip(pdbmol.GetAtoms(), blocks_qr[reskey]):
atom.SetDoubleProp("PQRCharge", pqr_prop[0])
atom.SetDoubleProp("PQRRadius", pqr_prop[1])
resname = list(reskey_to_resname[reskey])[0] # verified length 1
raw_input_mols[reskey] = (pdbmol, resname, missed_altloc, needed_altloc)
return raw_input_mols
@staticmethod
def _prody_to_residue_mols(
prody_obj: ALLOWED_PRODY_TYPES,
wanted_altloc_dict: Optional[dict] = None,
default_altloc: Optional[str] = None,
) -> dict:
"""
Convert a ProDy object to residue molecules.
Parameters
----------
prody_obj : ALLOWED_PRODY_TYPES
The ProDy object to convert (can be one of the allowed types in .utils.prodyutils.ALLOWED_PRODY_TYPES).
wanted_altloc_dict : dict[str, str], optional
A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue.
default_altloc : str, optional
A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified.
Returns
-------
raw_input_mols : dict[str, tuple[Chem.Mol, str]]
A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names.
Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs.
"""
if wanted_altloc_dict is None:
wanted_altloc_dict = {}
raw_input_mols = {}
reskey_to_resname = {}
# generate macromolecule hierarchy iterator
hierarchy = prody_obj.getHierView()
# iterate chains
for chain in hierarchy.iterChains():
# iterate residues
for res in chain.iterResidues():
# gather residue info
chain_id = str(res.getChid()).strip()
res_name = str(res.getResname()).strip()
res_num = int(res.getResnum())
icode = str(res.getIcode()).strip()
reskey = f"{chain_id}:{res_num}{icode}"
reskey_to_resname.setdefault(reskey, set())
reskey_to_resname[reskey].add(res_name)
requested_altloc = wanted_altloc_dict.get(reskey, None)
# we are not sanitizing because protonated LYS don't have the
# formal charge set on the N and Chem.SanitizeMol raises error
# Chem.SanitizeMol(prody_mol)
prody_mol, missed_altloc, needed_altloc = prody_to_rdkit(
res,
sanitize=False,
requested_altloc=requested_altloc,
default_altloc=default_altloc,
)
raw_input_mols[reskey] = (prody_mol, res_name,
missed_altloc, needed_altloc)
return raw_input_mols
[docs]
def to_pdb(self, new_positions: Optional[dict]=None):
"""
Convert the polymer to a PDB string while (optionally) updating the coordinates of specified monomers.
Parameters
----------
new_positions: dict[str, dict[int, tuple[float, float, float]]], optional
A dictionary mapping residue IDs to dictionaries of atom indices and their new coordinates.
Returns
-------
pdb_string: str
The PDB string representation of the polymer.
Raises
------
ValueError
If any residue IDs in new_positions are not valid monomers.
"""
if new_positions is None:
new_positions = {}
valid_monomers = self.get_valid_monomers()
# check that residue IDs passed in new_positions are valid
unknown_res_ids = set()
for res_id in new_positions:
if res_id not in valid_monomers:
unknown_res_ids.add(res_id)
if unknown_res_ids:
msg = f"Residue IDs not in valid monomers: {unknown_res_ids}"
raise ValueError(msg)
pdbout = ""
atom_count = 0
pdb_line = "{:6s}{:5d} {:^4s} {:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f} {:>2s} "
pdb_line += eol
for res_id in self.get_valid_monomers():
rdkit_mol = self.monomers[res_id].rdkit_mol
if res_id in new_positions:
positions = get_updated_positions(
self.monomers[res_id],
new_positions[res_id],
)
else:
positions = rdkit_mol.GetConformer().GetPositions()
chain, resnum = res_id.split(":")
if resnum[-1].isalpha():
icode = resnum[-1]
resnum = resnum[:-1]
else:
icode = ""
resnum = int(resnum)
for i, atom in enumerate(rdkit_mol.GetAtoms()):
atom_count += 1
props = atom.GetPropsAsDict()
atom_name = self.monomers[res_id].atom_names[i]
x, y, z = positions[i]
element = mini_periodic_table[atom.GetAtomicNum()]
pdbout += pdb_line.format(
"ATOM",
atom_count,
atom_name,
self.monomers[res_id].input_resname,
chain,
resnum,
icode,
x,
y,
z,
element,
)
return pdbout
[docs]
def export_static_atom_params(self):
"""
Export static atom parameters from the polymer.
Returns
-------
atom_params: dict
A dictionary containing atom parameters.
coords: list
A list of coordinates for the atoms.
"""
atom_params = {}
counter_atoms = 0
coords = []
dedicated_attribute = (
"charge",
"atom_type",
) # molsetup has a dedicated attribute
for res_id in self.get_valid_monomers():
molsetup = self.monomers[res_id].molsetup
wanted_atom_indices = []
for atom in molsetup.atoms:
if not atom.is_ignore and not self.monomers[res_id].is_flexres_atom[atom.index]:
wanted_atom_indices.append(atom.index)
coords.append(molsetup.get_coord(atom.index))
for key, values in molsetup.atom_params.items():
atom_params.setdefault(key, [None] * counter_atoms) # add new "column"
for i in wanted_atom_indices:
atom_params[key].append(values[i])
# This was reworked to specifically address the new MoleculeSetup structure. Needs re-thinking
charge_dict = {atom.index: atom.charge for atom in molsetup.atoms}
atom_type_dict = {atom.index: atom.atom_type for atom in molsetup.atoms}
for key in dedicated_attribute:
atom_params.setdefault(key, [None] * counter_atoms) # add new "column"
if key == "charge":
values_dict = charge_dict
else:
values_dict = atom_type_dict
for i in wanted_atom_indices:
atom_params[key].append(values_dict[i])
counter_atoms += len(wanted_atom_indices)
added_keys = set(molsetup.atom_params).union(dedicated_attribute)
for key in set(atom_params).difference(
added_keys
): # <key> missing in current molsetup
atom_params[key].extend(
[None] * len(wanted_atom_indices)
) # fill in incomplete "row"
if hasattr(self, "param_rename"): # e.g. "gasteiger" -> "q"
for key, new_key in self.param_rename.items():
atom_params[new_key] = atom_params.pop(key)
return atom_params, coords
# region Filtering Residues
[docs]
def get_ignored_monomers(self):
"""
Get monomers that are ignored.
Returns
-------
dict[str, Monomer]
A dictionary of ignored monomers. The keys are the residue IDs and the values are the corresponding Monomer objects.
"""
return {k: v for k, v in self.monomers.items() if v.rdkit_mol is None}
[docs]
def get_valid_monomers(self):
"""
Get monomers that are valid (not ignored).
Returns
-------
dict[str, Monomer]
A dictionary of valid monomers. The keys are the residue IDs and the values are the corresponding Monomer objects.
"""
return {k: v for k, v in self.monomers.items() if v.rdkit_mol is not None}
# endregion
[docs]
def add_rotamers_to_polymer_molsetups(rotamer_states_list, polymer):
"""
Add rotamer states to the monomers' molecule setups in a polymer and get the state indices.
Parameters
----------
rotamer_states_list : list[dict[str, list[float]]]
A list of dictionaries, where each dictionary contains residue IDs as keys and lists of angles as values.
polymer : Polymer
The polymer object to which the rotamer states will be added.
Returns
-------
state_indices_list : list[dict[str, int]]
A list of dictionaries, where each dictionary contains residue IDs as keys and the corresponding rotamer state indices as values.
"""
rotamer_res_disambiguate = {}
for (
primary_res,
specific_res_list,
) in polymer.residue_chem_templates.ambiguous.items():
for specific_res in specific_res_list:
rotamer_res_disambiguate[specific_res] = primary_res
no_resname_to_resname = {}
for res_with_resname in polymer.monomers:
chain, resname, resnum = res_with_resname.split(":")
no_resname_key = f"{chain}:{resnum}"
if no_resname_key in no_resname_to_resname:
errmsg = "both %s and %s would be keyed by %s" % (
res_with_resname,
no_resname_to_resname[no_resname_key],
no_resname_key,
)
raise RuntimeError(errmsg)
no_resname_to_resname[no_resname_key] = res_with_resname
state_indices_list = []
for state_index, state_dict in enumerate(rotamer_states_list):
logger.info(f"adding rotamer state {state_index + 1}")
state_indices = {}
for res_no_resname, angles in state_dict.items():
res_with_resname = no_resname_to_resname[res_no_resname]
if polymer.monomers[res_with_resname].molsetup is None:
raise RuntimeError(
"no molsetup for %s, can't add rotamers" % (res_with_resname)
)
# next block is inefficient for large rotamer_states_list
# refactored polymers could help by having the following
# data readily available
molsetup = polymer.monomers[res_with_resname].molsetup
name_to_molsetup_idx = {}
for atom in molsetup.atoms:
atom_name = atom.pdbinfo.name
name_to_molsetup_idx[atom_name] = atom.index
resname = res_with_resname.split(":")[1]
resname = rotamer_res_disambiguate.get(resname, resname)
atom_names = residues_rotamers[resname]
if len(atom_names) != len(angles):
raise RuntimeError(
f"expected {len(atom_names)} angles for {resname}, got {len(angles)}"
)
atom_idxs = []
for names in atom_names:
tmp = [name_to_molsetup_idx[name] for name in names]
atom_idxs.append(tmp)
state_indices[res_with_resname] = len(molsetup.rotamers)
molsetup.add_rotamer(atom_idxs, np.radians(angles))
state_indices_list.append(state_indices)
return state_indices_list
[docs]
class Monomer(BaseJSONParsable):
"""
Individual subunit in a Polymer. Often called residue.
Attributes
----------
raw_rdkit_mol : Chem.Mol
An RDKit Mol that defines element and connectivity within a residue. Bond orders and
formal charges may be incorrect, and hydrogens may be missing.
This molecule may originate from a PDB string and it defines also
the positions of the atoms.
rdkit_mol : Chem.Mol
Copy of the molecule from a ResidueTemplate, with positions from
raw_rdkit_mol. All hydrogens are real atoms except for those
at connections with adjacent residues.
mapidx_to_raw : dict[int, int]
Mapping of atom indices in rdkit_mol to raw_rdkit_mol.
residue_template_key : str
The matched residue template key of this Monomer.
input_resname : str
The input residue name of this Monomer.
atom_names : list[str]
List of atom names in the same order as rdkit_mol.
padded_mol : Chem.Mol
Padded molecule with additional atoms around link atoms from adjacent residues.
molsetup : RDKitMoleculeSetup
An RDKitMoleculeSetup associated with this residue.
molsetup_mapidx : dict[int, int]
Mapping of atom indices in padded_mol to rdkit_mol.
is_flexres_atom : list[bool]
List indicating whether each atom is a flexible residue atom.
is_movable : bool
Indicates whether the residue is movable.
mapidx_from_raw : dict[int, int]
Mapping of atom indices in raw_rdkit_mol to rdkit_mol.
template : ResidueTemplate
provides access to link_labels in the template
"""
def __init__(
self,
raw_input_mol,
rdkit_mol,
mapidx_to_raw,
input_resname=None,
template_key=None,
atom_names=None,
):
"""
Initialize a Monomer instance.
Parameters
----------
raw_input_mol : Chem.Mol
An RDKit Mol that defines element and connectivity within a residue. Bond orders and
formal charges may be incorrect, and hydrogens may be missing.
rdkit_mol : Chem.Mol
Copy of the molecule from a ResidueTemplate, with positions from
raw_rdkit_mol. All hydrogens are real atoms except for those
at connections with adjacent residues.
mapidx_to_raw : dict[int, int]
Mapping of atom indices in rdkit_mol to raw_rdkit_mol.
input_resname : str, optional
The input residue name of this Monomer.
template_key : str, optional
The matched residue template key of this Monomer.
atom_names : list[str], optional
List of atom names in the same order as rdkit_mol.
"""
# Initializer attributes
self.raw_rdkit_mol = raw_input_mol
self.rdkit_mol = rdkit_mol
self.mapidx_to_raw = mapidx_to_raw
self.residue_template_key = template_key
self.input_resname = input_resname # exists even in openmm topology
self.atom_names = (
atom_names # same order as atoms in rdkit_mol, used in rotamers
)
# (JSON-bound) computed attributes
self.padded_mol = None
self.molsetup = None
self.molsetup_mapidx = None
self.is_flexres_atom = None # Check about these data types/Do we want the default to be None or empty
self.is_movable = False
self.mapidx_from_raw = self._invert_mapping(self.mapidx_to_raw)
# (JSON-unbound) computed attributes
# TODO convert link indices/labels in template to rdkit_mol indices herein
# self.link_labels = {}
self.template = None
@staticmethod
def _invert_mapping(mapping):
if mapping is None:
return None
inverted = {}
for key, value in mapping.items():
if value in inverted:
raise RuntimeError(f"Mapping is not invertible: {mapping}")
inverted[value] = key
return inverted
# region JSON-interchange functions
[docs]
@classmethod
def json_encoder(cls, obj: "Monomer") -> Optional[dict[str, Any]]:
try:
molsetup = serialize_optional(RDKitMoleculeSetup.json_encoder, obj.molsetup)
except KeyError:
molsetup = serialize_optional(MoleculeSetup.json_encoder, obj.molsetup)
return {
"raw_rdkit_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.raw_rdkit_mol),
"rdkit_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.rdkit_mol),
"mapidx_to_raw": obj.mapidx_to_raw,
"residue_template_key": obj.residue_template_key,
"input_resname": obj.input_resname,
"atom_name": obj.atom_names,
"mapidx_from_raw": obj.mapidx_from_raw,
"padded_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.padded_mol),
"molsetup": molsetup,
"is_flexres_atom": obj.is_flexres_atom,
"is_movable": obj.is_movable,
"molsetup_mapidx": obj.molsetup_mapidx,
}
# Keys to check for deserialized JSON
expected_json_keys = frozenset({
"raw_rdkit_mol",
"rdkit_mol",
"mapidx_to_raw",
"residue_template_key",
"input_resname",
"atom_name",
"padded_mol",
"molsetup",
"molsetup_mapidx",
"is_flexres_atom",
"is_movable",
"mapidx_from_raw",
})
@classmethod
def _decode_object(cls, obj: dict[str, Any]):
raw_rdkit_mol = rdkit_mol_from_json(obj["raw_rdkit_mol"])
rdkit_mol = rdkit_mol_from_json(obj["rdkit_mol"])
padded_mol = rdkit_mol_from_json(obj["padded_mol"])
molsetup = RDKitMoleculeSetup.from_dict(obj["molsetup"])
if not isinstance(molsetup, RDKitMoleculeSetup):
molsetup = MoleculeSetup.from_dict(obj["molsetup"])
mapidx_to_raw = convert_to_int_keyed_dict(obj["mapidx_to_raw"])
molsetup_mapidx = convert_to_int_keyed_dict(obj["molsetup_mapidx"])
mapidx_from_raw = convert_to_int_keyed_dict(obj["mapidx_from_raw"])
atom_name = cls.access_with_deprecated_key(
obj, old_key="atom_names", new_key="atom_name"
)
monomer = cls(
raw_input_mol=raw_rdkit_mol,
rdkit_mol=rdkit_mol,
mapidx_to_raw=mapidx_to_raw,
input_resname=obj["input_resname"],
template_key=obj["residue_template_key"],
atom_names=atom_name,
)
monomer.padded_mol=padded_mol
monomer.molsetup=molsetup
monomer.molsetup_mapidx=molsetup_mapidx
monomer.is_flexres_atom=obj["is_flexres_atom"]
monomer.is_movable=obj["is_movable"]
monomer.mapidx_from_raw = mapidx_from_raw
return monomer
# endregion
[docs]
def set_atom_names(self, atom_names_list):
"""
Set the atom names for the monomer.
Parameters
----------
atom_names_list : list[str]
A list of atom names to set for the monomer.
The length of this list must match the number of atoms in the RDKit molecule.
Returns
-------
None
"""
if self.rdkit_mol is None:
raise RuntimeError("can't set atom_names if rdkit_mol is not set yet")
if len(atom_names_list) != self.rdkit_mol.GetNumAtoms():
raise ValueError(
f"{len(atom_names_list)=} differs from {self.rdkit_mol.GetNumAtoms()=}"
)
name_types = set([type(name) for name in atom_names_list])
if name_types != {str}:
raise ValueError(f"atom names must be str but {name_types=}")
self.atom_names = atom_names_list
return
[docs]
def parameterize(self, mk_prep, residue_id, get_atomprop_from_raw: dict = None):
"""
Parameterize the monomer using the provided mk_prep object.
Parameters
----------
mk_prep : MoleculePreparation
A MoleculePreparation object that provides the parameterization method.
residue_id : str
The residue ID to be used for parameterization.
get_atomprop_from_raw : dict[str, any], optional
A dictionary mapping atom property names to default values.
If provided, these properties will be set on the atoms in the raw RDKit molecule.
The default is None (not passing any properties).
Raises
------
ValueError
If the atom property names in get_atomprop_from_raw are not strings.
NotImplementedError
If the number of molsetups is not equal to 1.
"""
if get_atomprop_from_raw:
if any(not isinstance(prop_name, str) for prop_name in get_atomprop_from_raw.keys()):
raise ValueError(f"Atom property name must be str. Got {prop_name} ({type(prop_name)}) instead! ")
raw_mol = self.raw_rdkit_mol
atoms_in_raw_mol = [atom for atom in raw_mol.GetAtoms()]
mapidx_to_raw = self.mapidx_to_raw
molsetup_mapidx = self.molsetup_mapidx
for atom in self.padded_mol.GetAtoms():
atom_idx_in_raw = mapidx_to_raw.get(molsetup_mapidx.get(atom.GetIdx(), None), None)
for prop_name, default_value in get_atomprop_from_raw.items():
if atom_idx_in_raw is not None:
prop_value = atoms_in_raw_mol[atom_idx_in_raw].GetProp(prop_name)
else:
prop_value = str(default_value)
atom.SetProp(prop_name, prop_value)
molsetups = mk_prep(self.padded_mol)
if len(molsetups) != 1:
raise NotImplementedError(f"need 1 molsetup but got {len(molsetups)}")
molsetup = molsetups[0]
self.molsetup = molsetup
self.is_flexres_atom = [False for _ in molsetup.atoms]
# set ignore to True for atoms that are padding
for atom in molsetup.atoms:
if atom.index not in self.molsetup_mapidx:
atom.is_ignore = True
# recalculate flexibility tree after setting ignored atoms
mk_prep.calc_flex(molsetup)
# rectify charges to sum to integer (because of padding)
if mk_prep.charge_model == "zero":
net_charge = 0
else:
rdkit_mol = self.rdkit_mol
net_charge = sum(
[atom.GetFormalCharge() for atom in rdkit_mol.GetAtoms()]
)
not_ignored_idxs = []
charges = []
for atom in molsetup.atoms:
if atom.index in self.molsetup_mapidx: # TODO offsite not in mapidx
charges.append(atom.charge)
not_ignored_idxs.append(atom.index)
charges = rectify_charges(charges, net_charge, decimals=3)
for i, j in enumerate(not_ignored_idxs):
molsetup.atoms[j].charge = charges[i]
self._set_pdbinfo(residue_id)
return
def _set_pdbinfo(self, residue_id):
not_ignored_idxs = []
for atom in self.molsetup.atoms:
if atom.index in self.molsetup_mapidx: # TODO offsite not in mapidx
not_ignored_idxs.append(atom.index)
chain, resnum = residue_id.split(":")
if resnum[-1].isalpha():
icode = resnum[-1]
resnum = resnum[:-1]
else:
icode = ""
if self.atom_names is None:
atom_names = ["" for _ in not_ignored_idxs]
else:
atom_names = self.atom_names
for i, j in enumerate(not_ignored_idxs):
atom_name = atom_names[self.molsetup_mapidx[j]]
self.molsetup.atoms[j].pdbinfo = PDBAtomInfo(
atom_name, self.input_resname, int(resnum), icode, chain
)
return
[docs]
class NoAtomMapWarning(logging.Filter):
"""
A logging filter to exclude warnings about missing atom mapping numbers in RDKit.
"""
[docs]
def filter(self, record):
"""
Filter out warnings about missing atom mapping numbers in RDKit.
Parameters
----------
record : logging.LogRecord
The log record to be filtered.
Returns
-------
bool
True if the record should be logged, False otherwise.
"""
fields = record.getMessage().split()
a = " ".join(fields[1:4]) == "product atom-mapping number"
b = " ".join(fields[5:]) == "not found in reactants."
is_atom_map_warning = a and b
return not is_atom_map_warning
[docs]
class ResiduePadder(BaseJSONParsable):
"""
A class for padding RDKit molecules of residues with parts from adjacent residues.
Attributes
----------
rxn : rdChemReactions.ChemicalReaction
Reaction SMARTS of a single-reactant, single-product reaction for padding.
adjacent_smartsmol : Chem.Mol
SMARTS molecule with mapping numbers to copy atom positions from part of adjacent residue.
adjacent_smartsmol_mapidx : list
Mapping for atoms in adjacent_smartsmol, from mapping numbers to atom indicies.
"""
# Replacing ResidueConnection by ResiduePadding
# Why have two ResiduePadding instances per connection between two-residues?
# - three-way merge: if three carbons joined in cyclopropare, we can still pad
# - defines padding in the reaction for blunt residues
# - all bonds will be defined in the input topology after a future refactor
# reaction should not delete atoms, not even Hs
# reaction should create bonds at non-real Hs (implicit or explicit rdktt H)
def __init__(self, rxn_smarts: str, adjacent_res_smarts: str = None, auto_blunt:bool=False):
"""
Initialize the ResiduePadder with reaction SMARTS and optional adjacent residue SMARTS.
Parameters
----------
rxn_smarts: str
Reaction SMARTS to pad a link atom of a Monomer molecule.
Product atoms that are not mapped in the reactants will have
their coordinates set from an adjacent residue molecule, given
that adjacent_res_smarts is provided and the atom labels match
the unmapped product atoms of rxn_smarts.
adjacent_res_smarts: str
SMARTS pattern to identify atoms in molecule of adjacent residue
and copy their positions to padding atoms. The SMARTS atom labels
must match those of the product atoms of rxn_smarts that are
unmapped in the reagents.
auto_blunt: bool
missing bonds of Monomers will automatically be blunt if
this parameter is true, and raise an error otherwise
"""
# Ensure rxn_smarts has single reactant and single product
self.rxn = self._validate_rxn_smarts(rxn_smarts)
self.auto_blunt = auto_blunt
# Fill in adjacent_smartsmol_mapidx
if adjacent_res_smarts is None:
self.adjacent_smartsmol = None
self.adjacent_smartsmol_mapidx = None
return
# Ensure adjacent_res_smarts is None or a valid SMARTS
self.adjacent_smartsmol = self._initialize_adj_smartsmol(adjacent_res_smarts)
# Ensure the mapping numbers are the same in adjacent_smartsmol and rxn_smarts's product
self._check_adj_smarts(self.rxn, self.adjacent_smartsmol)
self.adjacent_smartsmol_mapidx = {
atom.GetIntProp("molAtomMapNumber"): atom.GetIdx()
for atom in self.adjacent_smartsmol.GetAtoms()
if atom.HasProp("molAtomMapNumber")
}
return
@staticmethod
def _validate_rxn_smarts(rxn_smarts: str) -> rdChemReactions.ChemicalReaction:
"""Validate rxn_smarts and return rxn"""
rxn = rdChemReactions.ReactionFromSmarts(rxn_smarts)
if rxn.GetNumReactantTemplates() != 1:
raise ValueError(f"Expected 1 reactant, got {rxn.GetNumReactantTemplates()}.")
if rxn.GetNumProductTemplates() != 1:
raise ValueError(f"Expected 1 product, got {rxn.GetNumProductTemplates()}.")
return rxn
@staticmethod
def _initialize_adj_smartsmol(adjacent_res_smarts: str) -> Chem.Mol:
"""Validate adjacent_res_smarts and return adjacent_smartsmol"""
adjacent_smartsmol = Chem.MolFromSmarts(adjacent_res_smarts)
if adjacent_smartsmol is None:
raise RuntimeError("Invalid SMARTS pattern in adjacent_res_smarts")
return adjacent_smartsmol
@staticmethod
def _check_adj_smarts(rxn: rdChemReactions.ChemicalReaction, adjacent_smartsmol: Chem.Mol):
"""Ensure the atom mapping numbers are the same in adjacent_smartsmol and rxn_smarts's product"""
# Assumes single reactant, single product
reactant_ids = get_molAtomMapNumbers(rxn.GetReactantTemplate(0))
product_ids = get_molAtomMapNumbers(rxn.GetProductTemplate(0))
adjacent_ids = get_molAtomMapNumbers(adjacent_smartsmol)
padding_ids = product_ids.difference(reactant_ids)
is_ok = padding_ids == adjacent_ids
if not is_ok:
raise ValueError(f"SMARTS labels in adjacent_smartsmol ({adjacent_ids}) differ from \
unmapped product labels in reaction ({padding_ids})")
def __call__(self, target_mol: Chem.Mol, adjacent_mol = None,
target_required_atom_index = None, adjacent_required_atom_index = None):
"""
Pad the target_mol with atoms from the adjacent_mol using the defined reaction.
The target_mol must contain the reactant of the reaction, and the adjacent_mol
must contain the expected adjacent_smartsmol. The function returns the padded
molecule and the mapping index of the adjacent_smartsmol.
Parameters
----------
target_mol : Chem.Mol
The target molecule to be padded. It must contain the reactant of the reaction.
adjacent_mol : Chem.Mol, optional
The adjacent molecule from which atoms will be copied. It must contain the expected adjacent_smartsmol.
target_required_atom_index : int, optional
The atom index in the target_mol that must be present in the product.
adjacent_required_atom_index : int, optional
The atom index in the adjacent_mol that must be present in the product.
Returns
-------
padded_mol : Chem.Mol
The padded molecule with additional atoms from the adjacent_mol.
adjacent_smartsmol_mapidx : dict[int, int]
Mapping of atom indices in the adjacent_smartsmol to the target_mol.
Raises
RuntimeError
If the target_mol does not contain the reactant of the reaction or if the adjacent_mol
does not contain the expected adjacent_smartsmol.
ValueError
If the target_mol does not contain the expected atom index or if the adjacent_mol
does not contain the expected atom index.
NotImplementedError
If the reaction does not produce a single product or if the adjacent_mol does not
contain the expected adjacent_smartsmol.
"""
# add Hs only to padding atoms
# copy coordinates if adjacent res has Hs bound to heavy atoms
# labels have been checked upstream
# Ensure target_mol contains self.rxn's reactant
rxn = self.rxn
if not self._check_target_mol(target_mol):
logger.info(f"target_mol ({Chem.MolToSmiles(target_mol)}) is not fully compliant with the template rxn ({rdChemReactions.ReactionToSmarts(self.rxn)})...")
# Assumes single reactant and single product
reactant_smartsmol = rxn.GetReactantTemplate(0)
reactant_ids = get_molAtomMapNumbers(reactant_smartsmol)
# Generate fallback options for reactants
fallback_reactant_smartsmol = Chem.MolFromSmarts(rdFMCS.FindMCS([reactant_smartsmol, target_mol]).smartsString)
if fallback_reactant_smartsmol is None:
raise RuntimeError(f"There is no common substructure between target_mol and the expected reactant. ")
# Add mapping number to fallback reactants and filter the fallback options
# To be accepted, the fallback reactant needs to at least have a match with target_mol
# containing target_mol's atom with target_required_atom_index
fallback_reactants = [
reactant_mol for reactant_mol in apply_atom_mappings(fallback_reactant_smartsmol, reactant_smartsmol)
if any(target_required_atom_index in match for match in target_mol.GetSubstructMatches(reactant_mol))
]
if len(fallback_reactants) == 0:
raise RuntimeError(f"The maximum common substructure between target_mol and the expected reactant does not contain the expected linker atom with target_required_atom_index.")
# Take any fallback reactant; actually, they're the same reactant mols having different mapping numbers
fallback_reactant = fallback_reactants[0]
# Modify rxn smarts and update rxn
fallback_reactant_ids = get_molAtomMapNumbers(fallback_reactant)
skipping_ids = reactant_ids.difference(fallback_reactant_ids)
fallback_product = remove_atoms_with_mapping(rxn.GetProductTemplate(0), skipping_ids)
fallback_rxnsmarts = f"{Chem.MolToSmarts(fallback_reactant)}>>{Chem.MolToSmarts(fallback_product)}"
rxn = rdChemReactions.ReactionFromSmarts(fallback_rxnsmarts)
logger.info(f"Switched from Template rxn ({rdChemReactions.ReactionToSmarts(self.rxn)}) to Fallback rxn ({fallback_rxnsmarts})")
# Get adjacent_mol's reacting part that contains adjacent_required_atom_index
if adjacent_mol is not None:
# Ensure adjacent_mol contains expected_adjacent_smartsmol, and
# there's exactly one match that includes atom with adjacent_required_atom_index
if self._check_adjacent_mol(self.adjacent_smartsmol, adjacent_mol, adjacent_required_atom_index):
adjacent_smartsmol = self.adjacent_smartsmol
# Remove unmapped atoms from Template adjacent mol SMARTS as the fallback option;
# The unmapped atoms aren't needed for positions anyways
else:
logger.info(f"adjacent_mol ({Chem.MolToSmiles(adjacent_mol)}) is not fully compliant with the template adjacent_smarts ({Chem.MolToSmarts(self.adjacent_smartsmol)})...")
adjacent_smartsmol = remove_unmapped_atoms_from_mol(self.adjacent_smartsmol)
# Evaluate adjacent mol against the fallback adjacent mol SMARTS
if self._check_adjacent_mol(adjacent_smartsmol, adjacent_mol, adjacent_required_atom_index):
logger.info(f"Switched from Template adjacent mol ({Chem.MolToSmarts(self.adjacent_smartsmol)}) to Fallback adjacent mol ({Chem.MolToSmarts(adjacent_smartsmol)})")
else:
raise RuntimeError(f"adjacent_mol doesn't contain the mapped atoms in adjacent_smartsmol.")
# Update hit and adjacent_smartsmol_mapidx
hit = adjacent_mol.GetSubstructMatches(adjacent_smartsmol)[0]
adjacent_smartsmol_mapidx = {
atom.GetIntProp("molAtomMapNumber"): atom.GetIdx()
for atom in adjacent_smartsmol.GetAtoms() if atom.HasProp("molAtomMapNumber")
}
# suppress rdkit warning about product atom map not found in reactants
# e.g. in "[C:1]>>[C:1][O:2]" label :2 is missing in reactants
filtr = NoAtomMapWarning()
rdkit_logger.addFilter(filtr)
# Get padded mol and index map from the rxn
outcomes = react_and_map((target_mol,), rxn)
rdkit_logger.removeFilter(filtr)
# Filter outcomes by target_required_atom_index
if target_required_atom_index is not None:
outcomes = [
(product, index_map)
for (product, index_map) in outcomes
if target_required_atom_index in index_map["atom_idx"]
]
# Ensure single outcome
if len(outcomes) == 0:
raise RuntimeError(f"The padding reaction of target_mol has no outcome that contains the atom with target_required_atom_index")
elif len(outcomes) > 1:
raise RuntimeError(f"The padding reaction of target_mol has multiple outcomes that contain the atom with target_required_atom_index")
padded_mol, idxmap = outcomes[0]
padding_heavy_atoms = [
i for i, j in enumerate(idxmap["atom_idx"])
if j is None and padded_mol.GetAtomWithIdx(i).GetAtomicNum() != 1
]
mapidx = idxmap["atom_idx"]
# Add Hs to padded_mol and update mapidx
if adjacent_mol is None:
padded_mol.UpdatePropertyCache() # avoids getNumImplicitHs() called without preceding call to calcImplicitValence()
Chem.SanitizeMol(padded_mol) # just in case
padded_h = Chem.AddHs(padded_mol, onlyOnAtoms=padding_heavy_atoms)
mapidx += [None] * (padded_h.GetNumAtoms() - padded_mol.GetNumAtoms())
else:
# Get coordinates of existing atoms
adjacent_coords = adjacent_mol.GetConformer().GetPositions()
for atom in adjacent_smartsmol.GetAtoms():
if not atom.HasProp("molAtomMapNumber"):
continue
j = atom.GetIntProp("molAtomMapNumber")
k = idxmap["new_atom_label"].index(j)
l = adjacent_smartsmol_mapidx[j]
padded_mol.GetConformer().SetAtomPosition(k, adjacent_coords[hit[l]])
padded_mol.UpdatePropertyCache() # avoids getNumImplicitHs() called without preceding call to calcImplicitValence()
Chem.SanitizeMol(padded_mol) # got crooked Hs without this
padded_h = Chem.AddHs(
padded_mol, onlyOnAtoms=padding_heavy_atoms, addCoords=True
)
return padded_h, mapidx
@staticmethod
def _check_adjacent_mol(expected_adjacent_smartsmol: Chem.Mol, adjacent_mol: Chem.Mol, adjacent_required_atom_index: str):
"""
Ensure adjacent_mol contains expected_adjacent_smartsmol, and
there's exactly one match that includes atom with adjacent_required_atom_index.
"""
if expected_adjacent_smartsmol is None:
raise RuntimeError("adjacent_res_smarts must be initialized to support adjacent_mol.")
hits = adjacent_mol.GetSubstructMatches(expected_adjacent_smartsmol)
if adjacent_required_atom_index is not None:
hits = [hit for hit in hits if adjacent_required_atom_index in hit]
if len(hits) > 1:
raise RuntimeError(f"adjacent_mol has multiple matches for adjacent_smartsmol.")
elif len(hits) == 0:
return False
return True
def _check_target_mol(self, target_mol: Chem.Mol):
"""Ensure target_mol contains self.rxn's reactant."""
# Assumes single reactant
if target_mol.GetSubstructMatches(self.rxn.GetReactantTemplate(0)):
return True
else:
return False
# region JSON-interchange functions
[docs]
@classmethod
def json_encoder(cls, obj: "ResiduePadder") -> Optional[dict[str, Any]]:
output_dict = {
"rxn_smarts": rdChemReactions.ReactionToSmarts(obj.rxn),
"adjacent_res_smarts": serialize_optional(Chem.MolToSmarts, obj.adjacent_smartsmol),
"auto_blunt": obj.auto_blunt,
}
# we are not serializing the adjacent_smartsmol_mapidx as that will
# be rebuilt by the ResiduePadder init
return output_dict
# Keys to check for deserialized JSON
expected_json_keys = {
"rxn_smarts",
"adjacent_res_smarts",
"auto_blunt",
}
@classmethod
def _decode_object(cls, obj: dict[str, Any]):
adjacent_res_smarts = cls.access_with_deprecated_key(
obj, old_key="adjacent_smarts", new_key="adjacent_res_smarts"
)
residue_padder = cls(obj["rxn_smarts"], adjacent_res_smarts, obj.get("auto_blunt", False))
return residue_padder
# endregion
# Utility Functions
[docs]
def get_molAtomMapNumbers(mol: Chem.Mol) -> set[int]:
"""Return the set of mapping numbers in a molecule."""
return {atom.GetIntProp("molAtomMapNumber") for atom in mol.GetAtoms() if atom.HasProp("molAtomMapNumber")}
[docs]
def remove_unmapped_atoms_from_mol(mol: Chem.Mol) -> Chem.Mol:
"""Remove atoms without mapping numbers from a molecule."""
atoms_to_remove = [
atom.GetIdx() for atom in mol.GetAtoms()
if not atom.HasProp("molAtomMapNumber")
]
if len(atoms_to_remove) > 0:
mol = Chem.RWMol(mol)
for idx in sorted(atoms_to_remove, reverse=True):
mol.RemoveAtom(idx)
mol = mol.GetMol()
return mol
[docs]
def apply_atom_mappings(mcs_mol: Chem.Mol, original_mol: Chem.Mol) -> list[Chem.Mol]:
"""
Apply atom mappings from the original molecule to the MCS molecule by substructure match.
Be prepared for multiple matches, return a list for further evaluation.
"""
# Assumes original_mol contains mcs_mol
matches = original_mol.GetSubstructMatches(mcs_mol)
mapped_mcs_molecules = []
for match in matches:
rw_mcs_mol = Chem.RWMol(mcs_mol)
for i, mcs_atom in enumerate(rw_mcs_mol.GetAtoms()):
original_atom_idx = match[i]
original_atom = original_mol.GetAtomWithIdx(original_atom_idx)
if original_atom.HasProp("molAtomMapNumber"):
mcs_atom.SetProp("molAtomMapNumber", original_atom.GetProp("molAtomMapNumber"))
mapped_mcs_molecules.append(rw_mcs_mol.GetMol())
return mapped_mcs_molecules
[docs]
def remove_atoms_with_mapping(product: Chem.Mol, mapping_numbers: set) -> Chem.Mol:
"""Remove atoms with specific atom mapping numbers from a molecule."""
editable_product = Chem.RWMol(product)
atoms_to_remove = [
atom.GetIdx()
for atom in editable_product.GetAtoms()
if atom.HasProp("molAtomMapNumber") and int(atom.GetProp("molAtomMapNumber")) in mapping_numbers
]
for idx in sorted(atoms_to_remove, reverse=True):
editable_product.RemoveAtom(idx)
return editable_product.GetMol()
[docs]
class ResidueTemplate(BaseJSONParsable):
"""
Data and methods to pad rdkit molecules of polymer residues with parts of adjacent residues.
Attributes
----------
mol : Chem.Mol
molecule with the exact atoms that constitute the system.
All Hs are explicit, but atoms bonded to adjacent residues miss an H.
link_labels: dict[int, str]
Keys are indices of atoms that need padding
Values are strings to identify instances of ResiduePadder
atom_names: list[str]
list of atom names, matching order of atoms in rdkit mol
"""
def __init__(self, smiles, link_labels=None, atom_names=None):
"""
Initialize a ResidueTemplate object.
Parameters
----------
smiles : str
SMILES representation of the molecule.
link_labels : dict[int,str], optional
Keys are indices of atoms that need padding
Values are strings to identify instances of ResiduePadder
If None, no link atoms. The default is None.
atom_names : list[str], optional
List of atom names in the same order as the atoms in the given smiles.
"""
# Initializer attributes
self.link_labels = link_labels
self.atom_names = atom_names
# (JSON-bound) computed attributes
ps = Chem.SmilesParserParams()
ps.removeHs = False
mol = Chem.MolFromSmiles(smiles, ps)
self.check(mol, link_labels, atom_names)
self.mol = mol
# region JSON-interchange functions
[docs]
@classmethod
def json_encoder(cls, obj: "ResidueTemplate") -> Optional[dict[str, Any]]:
output_dict = {
"mol": rdMolInterchange.MolToJSON(obj.mol),
"link_labels": obj.link_labels,
"atom_name": obj.atom_names,
}
return output_dict
# Keys to check for deserialized JSON
expected_json_keys = {"mol", "link_labels", "atom_name"}
@classmethod
def _decode_object(cls, obj: dict[str, Any]):
# Converting ResidueTemplate init values that need conversion
deserialized_mol = rdkit_mol_from_json(obj.get("mol"))
# do not write canonical smiles to preserve original atom order
if deserialized_mol:
mol_smiles = rdkit.Chem.MolToSmiles(deserialized_mol, canonical=False)
# if dry json (data) is supplied
else:
mol_smiles = obj.get("smiles")
link_labels = convert_to_int_keyed_dict(obj.get("link_labels"))
atom_name = cls.access_with_deprecated_key(obj, old_key="atom_names", new_key="atom_name")
# Construct a ResidueTemplate object
residue_template = cls(mol_smiles, None, atom_name)
# Separately ensure that link_labels is restored to the value we expect it to be so there are not errors in
# the constructor
residue_template.link_labels = link_labels
return residue_template
# endregion
[docs]
def check(self, mol, link_labels, atom_names):
"""
Check the validity of a ResidueTemplate using the rdkit mol.
Parameters
----------
mol : Chem.Mol
The molecule to check.
link_labels : dict[int, str], optional
Keys are indices of atoms that need padding
Values are strings to identify instances of ResiduePadder
atom_names : list[str], optional
List of atom names in the same order as the atoms in the given smiles.
Raises
------
ValueError
If the number of atoms in the molecule does not match the length of atom_names.
RuntimeError
If the molecule is not a valid SMILES representation.
"""
have_implicit_hs = set()
for atom in mol.GetAtoms():
if atom.GetTotalNumHs() > 0:
have_implicit_hs.add(atom.GetIdx())
if link_labels is not None and set(link_labels) != have_implicit_hs:
raise ValueError(
f"expected any atom with non-real Hs ({have_implicit_hs}) to be in {link_labels=}"
)
if atom_names is None:
return
# data_lengths = set([len(values) for (_, values) in data.items()])
# if len(data_lengths) != 1:
# raise ValueError(f"each array in data must have the same length, but got {data_lengths=}")
# data_length = data_lengths.pop()
if len(atom_names) != mol.GetNumAtoms():
raise ValueError(f"{len(atom_names)=} differs from {mol.GetNumAtoms()=}")
return
[docs]
def match(self, input_mol):
"""
Match the input molecule with the template molecule and return the mapping.
Parameters
----------
input_mol : Chem.Mol
The input molecule to be matched with the template.
Returns
-------
tuple[dict, dict]
A tuple containing two dictionaries:
- The first dictionary contains the results of the matching, including the number of found and missing atoms.
- The second dictionary contains the mapping between the template and input molecule atom indices.
Raises
------
RuntimeError
If there are repeated values with different keys in the mapping.
"""
mapping = mapping_by_mcs(self.mol, input_mol)
mapping_inv = {value: key for (key, value) in mapping.items()}
if len(mapping_inv) != len(mapping):
raise RuntimeError(
f"bug in atom indices, repeated value different keys? {mapping=}"
)
# atoms "missing" exist in self.mol but not in input_mol
# "excess" atoms exist in input_mol but not in self.mol
result = {
"H": {"found": 0, "missing": 0, "excess": []},
"heavy": {"found": 0, "missing": 0, "excess": 0},
}
for atom in self.mol.GetAtoms():
element = "H" if atom.GetAtomicNum() == 1 else "heavy"
key = "found" if atom.GetIdx() in mapping else "missing"
result[element][key] += 1
for atom in input_mol.GetAtoms():
element = "H" if atom.GetAtomicNum() == 1 else "heavy"
if atom.GetIdx() not in mapping_inv:
if element == "H":
if atom.GetNeighbors():
nei_idx = atom.GetNeighbors()[0].GetIdx()
if nei_idx in mapping_inv:
result[element]["excess"].append(mapping_inv[nei_idx])
else:
result[element]["excess"].append(-1)
else: # lone hydrogen found in monomer
monomer_info = getPdbInfoNoNull(atom)
if monomer_info:
logger.warning(f"WARNING: Lone hydrogen is ignored: \n"
f" {monomer_info} \n")
else:
logger.warning(f"WARNING: A lone hydrogen is ignored during monomer-template matching. \n")
else:
result[element]["excess"] += 1
return result, mapping