Source code for meeko.polymer

import pathlib
import json
import logging
import traceback
from importlib.resources import files
import warnings
eol="\n"
from sys import exc_info
from typing import Union
from typing import Optional
from typing import Any

import rdkit.Chem
from rdkit import Chem
from rdkit.Chem import rdFMCS
from rdkit.Chem import rdChemReactions
from rdkit.Chem import rdMolInterchange
from rdkit.Geometry import Point3D

from .molsetup import RDKitMoleculeSetup
from .molsetup import MoleculeSetup
from .utils.jsonutils import BaseJSONParsable
from .utils.jsonutils import serialize_optional
from .utils.jsonutils import rdkit_mol_from_json
from .utils.jsonutils import convert_to_int_keyed_dict
from .utils.rdkitutils import mini_periodic_table
from .utils.rdkitutils import react_and_map
from .utils.rdkitutils import AtomField
from .utils.rdkitutils import build_one_rdkit_mol_per_altloc
from .utils.rdkitutils import _aux_altloc_mol_build
from .utils.rdkitutils import covalent_radius
from .utils.pdbutils import PDBAtomInfo
from .utils.rdkitutils import getPdbInfoNoNull
from .chemtempgen import export_chem_templates_to_json
from .chemtempgen import build_noncovalent_CC
from .chemtempgen import build_linked_CCs

import numpy as np

data_path = files("meeko") / "data"
periodic_table = Chem.GetPeriodicTable()

try:
    import prody
except ImportError as _prody_import_error:
    ALLOWED_PRODY_TYPES = None
    AtomGroup = None
    Selection = None
    def prody_to_rdkit(*args):
        raise ImportError(_prody_import_error)
else:
    from .utils.prodyutils import prody_to_rdkit, ALLOWED_PRODY_TYPES
    from prody.atomic.atomgroup import AtomGroup
    from prody.atomic.selection import Selection


logger = logging.getLogger(__name__)
rdkit_logger = logging.getLogger("rdkit")

residues_rotamers = {
    "SER": [("C", "CA", "CB", "OG")],
    "THR": [("C", "CA", "CB", "CG2")],
    "CYS": [("C", "CA", "CB", "SG")],
    "VAL": [("C", "CA", "CB", "CG1")],
    "HIS": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
    "ASN": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "ND2")],
    "ASP": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "OD1")],
    "ILE": [("C", "CA", "CB", "CG2"), ("CA", "CB", "CG2", "CD1")],
    "LEU": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD1")],
    "PHE": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
    "TYR": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
    "TRP": [("C", "CA", "CB", "CG"), ("CA", "CB", "CG", "CD2")],
    "GLU": [
        ("C", "CA", "CB", "CG"),
        ("CA", "CB", "CG", "CD"),
        ("CB", "CG", "CD", "OE1"),
    ],
    "GLN": [
        ("C", "CA", "CB", "CG"),
        ("CA", "CB", "CG", "CD"),
        ("CB", "CG", "CD", "OE1"),
    ],
    "MET": [
        ("C", "CA", "CB", "CG"),
        ("CA", "CB", "CG", "SD"),
        ("CB", "CG", "SD", "CE"),
    ],
    "ARG": [
        ("C", "CA", "CB", "CG"),
        ("CA", "CB", "CG", "CD"),
        ("CB", "CG", "CD", "NE"),
        ("CG", "CD", "NE", "CZ"),
    ],
    "LYS": [
        ("C", "CA", "CB", "CG"),
        ("CA", "CB", "CG", "CD"),
        ("CB", "CG", "CD", "CE"),
        ("CG", "CD", "CE", "NZ"),
    ],
}


[docs] def find_graph_paths(graph, start_node, end_nodes, current_path=(), paths_found=()): """ (DFS Path Search) Recursively find all paths from a start node to any of the specified end nodes in a directed graph. Parameters ---------- graph : dict[int, list[int]] A dictionary representing a graph. Keys are node IDs (integers), and values are lists of neighboring node IDs. start_node : int The node from which to begin path traversal. end_nodes : iterable of int A set or list of node IDs that are valid end points for a path. current_path : tuple[int], optional The current traversal path (used during recursion). Default is empty tuple. paths_found : list[list[int]], optional The list of complete paths found so far. Default is empty tuple, which will be converted internally. Returns ------- list[list[int]] A list of paths. Each path is represented as a list of node IDs from start_node to an end_node. """ current_path = current_path + (start_node,) paths_found = list(paths_found) for node in graph[start_node]: if node in current_path: continue if node in end_nodes: paths_found.append(list(current_path) + [node]) more_paths = find_graph_paths(graph, node, end_nodes, current_path) paths_found.extend(more_paths) return paths_found
[docs] def find_inter_mols_bonds(mols_dict): """ Finds inter-residue bonds between the RDKit Mols in mols_dict based on distances compared to built-in covalent radii (.utils.rdkitutils.covalent_radius). Parameters ---------- mols_dict : dict[str, (Chem.Mol, str)] A dictionary to store raw input mols and input residue names by residue IDs. The keys are residue IDs in the format <chain>:<resnum> such as "A:42", and the values are tuples of an RDKit Mols and input resname. Returns ------- dict[(str, str), list[tuple[int, int]]] A dictionary representing inter-residue bonds. The keys are tuples of residue IDs in the format <chain>:<resnum>, and the values are lists of tuples of atom indices in the two residues that are bonded. """ allowance = 1.2 # vina uses 1.1 but covalent radii are shorter here max_possible_covalent_radius = ( 2 * allowance * max([r for k, r in covalent_radius.items()]) ) cubes_min = [] cubes_max = [] for key, (mol, _) in mols_dict.items(): positions = mol.GetConformer().GetPositions() cubes_min.append(np.min(positions, axis=0)) cubes_max.append(np.max(positions, axis=0)) tmp = np.array([0, 0, 1, 1]) pairs_to_consider = [] keys = list(mols_dict) for i in range(len(mols_dict)): for j in range(i + 1, len(mols_dict)): do_consider = True for d in range(3): x = (cubes_min[i][d], cubes_max[i][d], cubes_min[j][d], cubes_max[j][d]) idx = np.argsort(x) has_overlap = tmp[idx][0] != tmp[idx][1] close_enough = abs(x[idx[1]] - x[idx[2]]) < max_possible_covalent_radius do_consider &= close_enough or has_overlap if do_consider: pairs_to_consider.append((i, j)) bonds = {} # key is pair mol indices, valuei is list of pairs of atom indices for i, j in pairs_to_consider: p1 = mols_dict[keys[i]][0].GetConformer().GetPositions() p2 = mols_dict[keys[j]][0].GetConformer().GetPositions() for a1 in mols_dict[keys[i]][0].GetAtoms(): for a2 in mols_dict[keys[j]][0].GetAtoms(): vec = p1[a1.GetIdx()] - p2[a2.GetIdx()] distsqr = np.dot(vec, vec) # check if atom has implemented covalent radius for atom in [a1, a2]: if atom.GetAtomicNum() not in covalent_radius: raise RuntimeError(f"Element {periodic_table.GetElementSymbol(atom.GetAtomicNum())} doesn't have an implemented covalent radius, which was required for the perception of intermolecular bonds. ") cov_dist = ( covalent_radius[a1.GetAtomicNum()] + covalent_radius[a2.GetAtomicNum()] ) if distsqr < (allowance * cov_dist) ** 2: key = (keys[i], keys[j]) value = (a1.GetIdx(), a2.GetIdx()) bonds.setdefault(key, []) bonds[key].append(value) return bonds
[docs] def mapping_by_mcs(mol, ref): """ Find an atom-atom mapping between two molecules based on the maximum common substructure (MCS). Parameters ---------- mol : Chem.Mol The first molecule to compare. ref : Chem.Mol The second molecule to compare. Returns ------- dict[int, int] A dictionary mapping atom indices from the first molecule to the second molecule. The keys are atom indices from the first molecule, and the values are atom indices from the second molecule. """ mcs_result = rdFMCS.FindMCS([mol, ref], bondCompare=rdFMCS.BondCompare.CompareAny) mcs_mol = Chem.MolFromSmarts(mcs_result.smartsString) mol_idxs = mol.GetSubstructMatch(mcs_mol) ref_idxs = ref.GetSubstructMatch(mcs_mol) atom_map = {i: j for (i, j) in zip(mol_idxs, ref_idxs)} return atom_map
def _snap_to_int(value, tolerance=0.12): """ Rounds a floating-point number to the nearest integer within a specified tolerance. Parameters ---------- value : float The floating-point number to be rounded. tolerance : float, optional The tolerance within which the number is considered close to an integer. Default is 0.12. Returns ------- int or None The rounded integer if the value is within the tolerance of an integer, otherwise None. """ for inc in [-1, 0, 1]: if abs(value - int(value) - inc) <= tolerance: return int(value) + inc return None
[docs] def divide_int_gracefully(integer, weights, allow_equal_weights_to_differ=False): """ Divides an integer into parts based on the provided weights, ensuring that the sum of the parts equals the integer. Parameters ---------- integer : int The integer to be divided. weights : list[float] A list of weights that determine the proportions of the integer. allow_equal_weights_to_differ : bool, optional If True, allows equal weights to be treated as different groups. Default is False (equal inputs get equal outputs). Returns ------- list[float] A list of floats representing the divided integer based on the weights. """ for weight in weights: if type(weight) not in [int, float] or weight < 0: raise ValueError("weights must be numeric and non-negative") if type(integer) is not int: raise ValueError("integer must be integer") inv_total_weight = 1.0 / sum(weights) shares = [w * inv_total_weight for w in weights] # normalize result = [_snap_to_int(integer * s, tolerance=0.5) for s in shares] surplus = integer - sum(result) if surplus == 0: return result data = [(i, w) for (i, w) in enumerate(weights)] data = sorted(data, key=lambda x: x[1], reverse=True) idxs = [i for (i, _) in data] if allow_equal_weights_to_differ: groups = [1 for _ in weights] else: groups = [] last_weight = None for i in idxs: if weights[i] == last_weight: groups[-1] += 1 else: groups.append(1) last_weight = weights[i] # iterate over all possible combinations of groups # this is potentially very slow nr_groups = len(groups) for j in range(1, 2**nr_groups): n_changes = 0 combo = [] for grpidx in range(nr_groups): is_changed = bool(j & 2**grpidx) combo.append(is_changed) n_changes += is_changed * groups[grpidx] if n_changes == abs(surplus): break # add or subtract 1 to distribute surplus increment = surplus / abs(surplus) index = 0 for i, is_changed in enumerate(combo): if is_changed: for j in range(groups[i]): result[idxs[index]] += increment index += 1 return result
[docs] def rectify_charges(q_list, net_charge=None, decimals=3) -> list[float]: """ Makes charges N (default is 3) decimals in length and ensures they sum to an integer. Parameters ---------- q_list : list[float] List of charges to be rectified. net_charge : int or None The desired net charge. If None, it will be calculated from the sum of the charges in q_list. decimals : int The number of decimal places to round the charges to. Default is 3. Returns ------- charges_dec: list[float] List of rectified charges, rounded to the specified number of decimal places. """ fstr = "%%.%df" % decimals charges_dec = [float(fstr % q) for q in q_list] if net_charge is None: net_charge = _snap_to_int(sum(charges_dec), tolerance=0.15) if net_charge is None: msg = "net charge could not be predicted from input q_list. (residual is beyond tolerance) " msg = "Please set the net_charge argument directly" raise RuntimeError(msg) elif type(net_charge) != int: raise TypeError("net charge must be an integer") surplus = net_charge - sum(charges_dec) surplus_int = _snap_to_int(10**decimals * surplus) if surplus_int == 0: return charges_dec weights = [abs(q) for q in q_list] surplus_int_splits = divide_int_gracefully(surplus_int, weights) for i, increment in enumerate(surplus_int_splits): charges_dec[i] += 10**-decimals * increment return charges_dec
[docs] def get_updated_positions(monomer, new_positions: dict): """ Returns full set of positions for the rdkit_mol in monomer given a partial set of new_positions. Hydrogens not specified in new_positions will have their position reset by RDKit if they are one or two bonds away from an atom in new_positions. Parameters ---------- monomer : Monomer Must have rdkit_mol to associate with new positions. new_positions : dict[int, (float, float, float)] Dictionary mapping the atom indices to their new positions. Returns ------- positions : np.ndarray The updated positions of the atoms in the RDKit molecule. """ h_to_update = set() mol = Chem.Mol(monomer.rdkit_mol) # avoids side effects conformer = mol.GetConformer() for n1 in (mol.GetAtomWithIdx(idx) for idx in new_positions): for n2 in n1.GetNeighbors(): if n2.GetAtomicNum() == 1: # 1 bond away h_to_update.add(n2.GetIdx()) else: if n2.GetIdx() not in new_positions: # 2 bonds away h_to_update.update(set(n2h.GetIdx() for n2h in n2.GetNeighbors() if n2h.GetAtomicNum() == 1)) # hydrogens in new_positions shall not be updated by RDKit h_to_update -= set(new_positions) for index in new_positions: x, y, z = new_positions[index] p = Point3D(float(x), float(y), float(z)) conformer.SetAtomPosition(index, p) if h_to_update: update_H_positions(mol, list(h_to_update)) return mol.GetConformer().GetPositions()
[docs] def update_H_positions(mol: Chem.Mol, indices_to_update: list[int]) -> None: """ Re-calculates the position of some hydrogens already existing in the mol. Does not guarantee that chirality can be preserved. Parameters ---------- mol : Chem.Mol RDKit Mol object with hydrogens. indices_to_update : list[int] Hydrogen indices to update. Returns ------- None Raises ------ RuntimeError: If a provided index in indices_to_update is not a Hydrogen, if a Hydrogen only has Hydrogen neighbors, or if the number of visited Hydrogens does not match the number of Hydrogens marked to be deleted. """ # Gets the conformer and a readable and writable version of the Mol using RDKit conf = mol.GetConformer() tmpmol = Chem.RWMol(mol) # Sets up data structures to manage Hydrogens to delete and add to_del = {} to_add_h = [] # Loops through input indices_to_update, checks index validity, adds data to the addition and deletion data structs for h_index in indices_to_update: # Checks that the atom at this index is a Hydrogen atom = tmpmol.GetAtomWithIdx(h_index) if atom.GetAtomicNum() != 1: raise RuntimeError("only H positions can be updated") # Ensures that all Hydrogens have at least 1 non-Hydrogen neighbor heavy_neighbors = [] for neigh_atom in atom.GetNeighbors(): if neigh_atom.GetAtomicNum() != 1: heavy_neighbors.append(neigh_atom) if len(heavy_neighbors) != 1: raise RuntimeError( f"hydrogens must have 1 non-H neighbor, got {len(heavy_neighbors)}" ) # Adds the first neighbor to the addition and deletion data structures. to_add_h.append(heavy_neighbors[0]) to_del[h_index] = heavy_neighbors[0] # Loops through the delete list and deletes the for i in sorted(to_del, reverse=True): tmpmol.RemoveAtom(i) to_del[i].SetNumExplicitHs(to_del[i].GetNumExplicitHs() + 1) to_add_h = list(set([atom.GetIdx() for atom in to_add_h])) tmpmol = tmpmol.GetMol() tmpmol.UpdatePropertyCache() Chem.SanitizeMol(tmpmol) tmpmol = Chem.AddHs(tmpmol, onlyOnAtoms=to_add_h, addCoords=True) tmpconf = tmpmol.GetConformer() used_h = ( set() ) # heavy atom may have multiple H that were missing, keep track of Hs that were visited for h_index, parent in to_del.items(): for atom in tmpmol.GetAtomWithIdx(parent.GetIdx()).GetNeighbors(): has_new_position = atom.GetIdx() >= mol.GetNumAtoms() - len(to_del) if atom.GetAtomicNum() == 1 and has_new_position: if atom.GetIdx() not in used_h: # print(h_index, tuple(tmpconf.GetAtomPosition(atom.GetIdx()))) conf.SetAtomPosition( h_index, tmpconf.GetAtomPosition(atom.GetIdx()) ) used_h.add(atom.GetIdx()) break # h_index coords copied, don't look into further H bound to parent # no guarantees about preserving chirality, which we don't need if len(used_h) != len(to_del): raise RuntimeError( f"Updated {len(used_h)} H positions but deleted {len(to_del)}" ) return
def _delete_residues(res_to_delete, raw_input_mols): """ Deletes residues from raw_input_mols that are not needed for the Polymer object. Parameters ---------- res_to_delete : list[str] or None residue IDs to delete in format <chain>:<resnum><icode> raw_input_mols : dict[str, Chem.Mol] keys are residue IDs Returns ------- None Raises ------ ValueError: If any of the residues to delete are not found in raw_input_mols. """ if res_to_delete is None: return missing = set() for res in res_to_delete: if res not in raw_input_mols: missing.add(res) raw_input_mols.pop(res, None) if len(missing) > 0: msg = "can't find the following residues to delete: " + " ".join(missing) raise ValueError(msg) return
[docs] class PolymerCreationError(RuntimeError): """ Exception raised when the creation of a Polymer object fails. Attributes ---------- error : str The main error message. recommendations : str Recommendations for resolving the error. traceback : str The traceback of the error. """ def __init__(self, error: str, recommendations: str = None): """ Initialize the PolymerCreationError. Parameters ---------- error : str The main error message. recommendations : str Recommendations for resolving the error. """ super().__init__(error) # main error message to pass to RuntimeError self.error = error self.recommendations = recommendations exc_type, exc_value, exc_traceback = exc_info() if exc_value is not None: self.traceback = ''.join(traceback.format_exception(exc_type, exc_value, exc_traceback)) else: self.traceback = None def __str__(self): """ Return a string representation of the error message. Returns ------- str The error message. """ msg = "" + eol msg += "Error: Creation of data structure for receptor failed." + eol msg += "" + eol msg += "Details:" + eol msg += self.error + eol if self.traceback: msg += self.traceback + eol if self.recommendations: msg += "Recommendations:" + eol msg += self.recommendations + eol msg += "" + eol return msg
[docs] def handle_parsing_situations( unmatched_res, unparsed_res, allow_bad_res, res_missed_altloc, res_needed_altloc, ): """ Handle situations where residues are not parsed correctly or do not match templates. Parameters ---------- unmatched_res : set Residues that did not match any template. unparsed_res : set Residues that could not be parsed. allow_bad_res : bool If True, allows residues that do not match templates to be ignored. res_missed_altloc : set Residues that were requested with an alternate location but were not found. res_needed_altloc : set Residues that were found with an alternate location but were not requested. Raises ------ PolymerCreationError If there are residues that could not be parsed or matched to templates, and allow_bad_res is False. """ err = "" if unparsed_res: msg = f"- Parsing failed for: {unparsed_res}." if not allow_bad_res: err += msg + eol else: msg += " Ignored due to allow_bad_res." logger.warning(msg) if unmatched_res: msg = f"- Template matching failed for: {list(unmatched_res)}" if not allow_bad_res: err += msg + eol else: msg += " Ignored due to allow_bad_res." logger.warning(msg) if err: err += "These residues can be ignored with option allow_bad_res." + eol if res_needed_altloc: msg = f"- Residues with alternate location: {res_needed_altloc}" + eol msg += "Either specify an altloc for each with option wanted_altloc" + eol msg += "or a general default altloc with option default_altloc." err += msg if res_missed_altloc: msg = f"- Requested altlocs not found for: {res_missed_altloc}." + eol err += msg if err: recs = "1. (for batch processing) Use -a/--allow_bad_res to automatically remove residues" + eol recs += "that do not match templates, and --default_altloc to set" + eol recs += "a default altloc variant. Use these at your own risk." + eol recs += "" + eol recs += "2. (processing individual structure) Inspecting and fixing the input structure is recommended." + eol recs += "Use --wanted_altloc to set variants for specific residues." raise PolymerCreationError(err, recs) return
[docs] class ResidueChemTemplates(BaseJSONParsable): """ Holds template data required to initialize Polymer Attributes ---------- residue_templates: dict[str, ResidueTemplate] keys are the ID of an instance of ResidueTemplate padders : dict instances of ResiduePadder keyed by a link_label (a string) link_labels establish the relationship between ResidueTemplates and ResiduePadders, determining which padder is to be used to pad each atom of an instance of Monomer that needs padding. ambiguous : dict[str, list[str]] mapping between input residue names (e.g. the three-letter residue name from PDB files) and IDs (strings) of ResidueTemplates """ def __init__(self, residue_templates, padders, ambiguous): """ Initialize ResidueChemTemplates. Parameters ---------- residue_templates : dict[str, ResidueTemplate] A dictionary of residue templates. Keys are the IDs of the templates, and values are instances of ResidueTemplate. padders : dict[str, ResiduePadder] A dictionary of residue padders. Keys are the IDs of the padders, and values are instances of ResiduePadder. ambiguous : dict[str, list[str]] Mapping between input residue names (e.g. the three-letter residue name from PDB files) and IDs (strings) of ResidueTemplates. """ self._check_missing_padders(residue_templates, padders) self._check_ambiguous_reskeys(residue_templates, ambiguous) self.residue_templates = residue_templates self.padders = padders self.ambiguous = ambiguous # region JSON-interchange functions
[docs] @classmethod def json_encoder(cls, obj: "ResidueChemTemplates") -> Optional[dict[str, Any]]: output_dict = { "residue_templates": { k: ResidueTemplate.json_encoder(v) for k, v in obj.residue_templates.items() }, "ambiguous": obj.ambiguous, "padders": { k: ResiduePadder.json_encoder(v) for k, v in obj.padders.items() }, } return output_dict
# Keys to check for deserialized JSON expected_json_keys = { "residue_templates", "ambiguous", "padders", } @classmethod def _decode_object(cls, obj: dict[str, Any]): # Extracting the constructor args from the json representation and creating a ResidueChemTemplates instance templates = { k: ResidueTemplate.from_dict(v) for k, v in obj["residue_templates"].items() } padders = {k: ResiduePadder.from_dict(v) for k, v in obj["padders"].items()} residue_chem_templates = cls(templates, padders, obj["ambiguous"]) return residue_chem_templates # endregion
[docs] def add_dict(self, data, overwrite=False): """ Add or update data from a dictionary to the current instance. Parameters ---------- data : dict A dictionary containing new data to be added. The dictionary may contain keys within "residue_templates", "padders", and "ambiguous". overwrite : bool, optional If True, existing data will be overwritten with new data. If False, new data will be added without overwriting existing data. Default is False. """ bad_keys = set(data) - {"ambiguous", "residue_templates", "padders"} if bad_keys: logging.warning(f"Ignore unexpected keys: {bad_keys}") new_ambiguous = data.get("ambiguous", {}) if overwrite: self.ambiguous.update(new_ambiguous) else: new_ambiguous = {k: v.copy() for k, v in new_ambiguous.items()} new_ambiguous.update(self.ambiguous) self.ambiguous = new_ambiguous for key, value in data.get("residue_templates", {}).items(): if overwrite or key not in self.residue_templates: res_template = ResidueTemplate.from_dict(value) self.residue_templates[key] = res_template for link_label, value in data.get("padders", {}).items(): if overwrite or key not in self.padders: padder = ResiduePadder.from_dict(data) self.padders[link_label] = padder return
[docs] @staticmethod def lookup_filename(filename, data_path): """ Look for a file in the current directory or in the data_path directory. If the file is not found in either location, raise a ValueError. If the file is found in the data_path directory, return the full path. If the file is found in the current directory, return the filename. Parameters ---------- filename : str The name of the file to look for. data_path : pathlib.Path The path to the directory where the file may be located. Returns ------- str The full path to the file if found, otherwise raises ValueError. Raises ------ ValueError If the file is not found in either the current directory or the data_path directory. """ p = pathlib.Path(filename) if not p.exists(): if (data_path / p).exists(): filename = str(data_path / p) elif (data_path / (p.name + ".json")).exists(): filename = str(data_path / (p.name + ".json")) else: raise ValueError(f"can't find {filename} in current dir or {data_path}") return filename
[docs] @classmethod def from_json_file(cls, filename): """ Create an instance of ResidueChemTemplates from a JSON file. Parameters ---------- filename : str The name of the JSON file to read. The file may contain residue templates, padders, and ambiguous residue names. Returns ------- ResidueChemTemplates An instance of ResidueChemTemplates created from the JSON file. """ filename = cls.lookup_filename(filename, data_path) with open(filename) as f: jsonstr = f.read() alldata = json.loads(jsonstr) ambiguous = {k: v.copy() for k,v in alldata.get("ambiguous", {}).items()} residue_templates = {} padders = {} for key, data in alldata.get("residue_templates", {}).items(): res_template = ResidueTemplate.from_dict(data) residue_templates[key] = res_template for link_label, data in alldata.get("padders", {}).items(): padders[link_label] = ResiduePadder.from_dict(data) return cls(residue_templates, padders, ambiguous)
[docs] @classmethod def create_from_defaults(cls): """ Create an instance of ResidueChemTemplates using default data (residue_chem_templates.json). Returns ------- ResidueChemTemplates An instance of ResidueChemTemplates created from the default JSON file. """ return cls.from_json_file("residue_chem_templates")
[docs] def add_json_file(self, filename): """ Add data from a JSON file to the current instance. Parameters ---------- filename : str The name of the JSON file to read. The file may contain residue templates, padders, and ambiguous residue names. Returns ------- None """ filename = self.lookup_filename(filename, data_path) with open(filename) as f: jsonstr = f.read() data = json.loads(jsonstr) self.add_dict(data) return
@staticmethod def _check_missing_padders(residue_templates, padders): # can't guarantee full coverage because the topology that is passed # to the Polymer may contain bonds between residues that are not # anticipated to be bonded, for example, protein N-term bonded to # nucleic acid 5 prime. # collect labels from residues link_labels_in_residues = set() for reskey, res_template in residue_templates.items(): for _, link_label in res_template.link_labels.items(): link_labels_in_residues.add(link_label) # and check we have padders for all of them link_labels_in_padders = set([label for label in padders]) # for link_label in padders: # for (link_labels) in padder.link_labels: # print(link_key, link_labels) # for (label, _) in link_labels: # link_labels is a list of pairs # link_labels_in_padders.add(label) missing = link_labels_in_residues.difference(link_labels_in_padders) if missing: raise RuntimeError(f"missing padders for {missing}") return @staticmethod def _check_ambiguous_reskeys(residue_templates, ambiguous): missing = {} for input_resname, reskeys in ambiguous.items(): for reskey in reskeys: if reskey not in residue_templates: missing.setdefault(input_resname, set()) missing[input_resname].add(reskey) if len(missing): raise ValueError(f"missing residue templates for ambiguous: {missing}") return
[docs] class Polymer(BaseJSONParsable): """ Represents polymer with its subunits as individual RDKit molecules. Used for proteins and nucleic acids. The key class is Monomer, which contains, a padded RDKit molecule containing part of the adjacent residues to enable chemically meaningful parameterizaion. Instances of ResidueTemplate make sure that the input, which may originate from a PDB string, matches the RDKit molecule of the template, even if hydrogens are missing. Attributes ---------- monomers : dict[str, Monomer] Dictionary to store monomers where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and values are instances of Monomer. log : dict[str, list[str]] Dictionary to store log messages during monomer processing (_get_monomer). residue_chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class used to initialize this Polymer. raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary of raw input mols used to initialize this Polymer, where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and values are tuples of an RDKit Mols and input resname. """ def __init__( self, raw_input_mols: dict[str, tuple[Chem.Mol, str]], bonds: dict[tuple[str, str], tuple[int, int]], residue_chem_templates: ResidueChemTemplates, mk_prep=None, set_template: dict[str, str] = None, blunt_ends: list[tuple[str, int]] = None, get_atomprop_from_raw: dict = None, ): """ Initialize Polymer. Parameters ---------- raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary of raw input mols where keys are residue IDs in the format <chain>:<resnum> such as "A:42" and values are tuples of an RDKit Mols and input resname. RDKit Mols will be matched to instances of ResidueTemplate, and may contain none, all, or some of the Hydrogens. bonds : dict[tuple[str, str], tuple[int, int]] A dictionary of inter-residue bonds where keys are tuples of residue IDs in the format <chain>:<resnum> such as "A:42" and values are tuples of atom indices. residue_chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class. mk_prep : MoleculePreparation An instance of the MoleculePreparation class to parameterize the padded molecules. set_template : dict[str, str] A dict mapping residue IDs in the format <chain>:<resnum> such as "A:42" to ResidueTemplate instances. blunt_ends : list[tuple[str, int]] A list of tuples where each tuple is residue IDs and 0-based atom index, e.g.; ("A:42", 0) get_atomprop_from_raw : dict[str, any] A dictionary mapping atom properties to pass from raw_input_mols. The keys are the property names and the values are the default values. Returns ------- None Raises ------ ValueError If raw_input_mols is not a dictionary or if any of the residues in set_template are not found in raw_input_mols. PolymerCreationError If there are residues that could not be parsed or matched to templates, and allow_bad_res is False. """ # TODO simplify SMARTS for adjacent res in padders if type(raw_input_mols) != dict: msg = f"expected raw_input_mols to be dict, got {type(raw_input_mols)}" if type(raw_input_mols) == str: msg += eol msg += ( "consider Polymer.from_pdb_string(pdbstr)" + eol ) raise ValueError(msg) self.residue_chem_templates = residue_chem_templates residue_templates = residue_chem_templates.residue_templates padders = residue_chem_templates.padders ambiguous = residue_chem_templates.ambiguous if set_template is None: set_template = {} else: # make sure all resiude_id in set_template exist missing = set( [ residue_id for residue_id in set_template if residue_id not in raw_input_mols ] ) if len(missing): raise ValueError( f"Residue IDs in set_template not found: {missing} {raw_input_mols.keys()}" ) # check if input assigned residue name in residue_templates err = "" supported_resnames = residue_templates.keys() | ambiguous.keys() unknown_res_from_input = {res_id: raw_input_mols[res_id][1] for res_id in raw_input_mols if res_id not in set_template and raw_input_mols[res_id][1] not in supported_resnames } if unknown_res_from_input: unknown_valid_res_from_input = {k: v for k, v in unknown_res_from_input.items() if v != "UNL"} if unknown_valid_res_from_input: err += f"Input residues {unknown_valid_res_from_input} not in residue_templates" + eol UNL_from_input = {k: v for k, v in unknown_res_from_input.items() if v == "UNL"} if UNL_from_input: err += f"Input residues {UNL_from_input} do not have a concrete definition" + eol unknown_res_from_assign = {} if set_template: unknown_res_from_assign = {res_id: resn for res_id, resn in set_template.items() if resn not in supported_resnames} unknown_valid_res_from_assign = {k: v for k, v in unknown_res_from_assign.items() if v != "UNL"} if unknown_valid_res_from_assign: err += f"Input residues {unknown_valid_res_from_assign} not in residue_templates" + eol UNL_from_assign = {k: v for k, v in unknown_res_from_assign.items() if v == "UNL"} if UNL_from_assign: err += f"Input residues {UNL_from_assign} do not have a concrete definition" + eol if err: if "UNL" in err: err += "Resdiues that are named UNL can't be parameterized. " + eol rec = "1. (to parameterize the residues) Use --set_template to specify valid residue names, " + eol rec += "2. (to skip the residues) Use --delete_residues to ignore them. Residues will be deleted from the prepared receptor. " raise PolymerCreationError(err, rec) warnings.warn(err, RuntimeWarning) warnings.warn("Trying to resolve unknown residues by building chemical templates... ", RuntimeWarning) all_unknown_res = unknown_res_from_input.copy() all_unknown_res.update(unknown_res_from_assign) bonded_unknown_res = {res_id: all_unknown_res[res_id] for res_id in all_unknown_res if any(res_id in respair for respair in bonds)} unbound_unknown_res = all_unknown_res.copy() for key in bonded_unknown_res: unbound_unknown_res.pop(key, None) if unbound_unknown_res: for resname in set(unbound_unknown_res.values()): try: cc = build_noncovalent_CC(resname) fetch_template_dict = json.loads(export_chem_templates_to_json([cc]))['residue_templates'][cc.resname] residue_templates.update({resname: ResidueTemplate( smiles = fetch_template_dict['smiles'], atom_names = fetch_template_dict['atom_name'], link_labels = fetch_template_dict['link_labels'])}) ambiguous[resname] = [cc.resname] except Exception as e: logger.warning(f"Failed building template from CCD for {resname=}") raise PolymerCreationError(str(e)) if bonded_unknown_res: failed_build = set() try: for resname in set(bonded_unknown_res.values()): cc_list = build_linked_CCs(resname) if not cc_list: failed_build.add(resname) else: for cc in cc_list: fetch_template_dict = json.loads(export_chem_templates_to_json([cc]))['residue_templates'][cc.resname] residue_templates.update({cc.resname: ResidueTemplate( smiles = fetch_template_dict['smiles'], atom_names = fetch_template_dict['atom_name'], link_labels = convert_to_int_keyed_dict(fetch_template_dict['link_labels']))}) if resname in ambiguous: ambiguous[resname].append(cc.resname) else: ambiguous[resname] = [cc.resname] except Exception as e: raise PolymerCreationError(str(e)) if failed_build: raise PolymerCreationError(f"Template generation failed for unknown residues: {failed_build}, which appear to be linking fragments. " + eol + "Generation of chemical templates with modified backbones, which involves guessing of linker positions and types, are not currently supported. ", "1. (to parameterize the residues) Use --add_templates to pass the additional templates with valid linker_labels, " + eol + "2. (to skip the residues) Use --delete_residues to ignore them. Residues will be deleted from the prepared receptor. ") self.monomers, self.log = self._get_monomers( raw_input_mols, ambiguous, residue_chem_templates, set_template, bonds, blunt_ends, ) _bonds = {} for key, bond_list in bonds.items(): monomer1 = self.monomers[key[0]] monomer2 = self.monomers[key[1]] if monomer1.rdkit_mol is None or monomer2.rdkit_mol is None: continue invmap1 = {j: i for i, j in monomer1.mapidx_to_raw.items()} invmap2 = {j: i for i, j in monomer2.mapidx_to_raw.items()} _bonds[key] = [(invmap1[b[0]], invmap2[b[1]]) for b in bond_list] bonds = _bonds # padding may seem overkill but we had to run a reaction anyway for h_coord_from_dipep padded_mols = self._build_padded_mols(self.monomers, bonds, padders) for residue_id, (padded_mol, mapidx_from_pad) in padded_mols.items(): monomer = self.monomers[residue_id] monomer.padded_mol = padded_mol monomer.molsetup_mapidx = mapidx_from_pad if mk_prep is not None: self.parameterize(mk_prep, get_atomprop_from_raw = get_atomprop_from_raw) return # region JSON-interchange functions
[docs] @classmethod def json_encoder(cls, obj: "Polymer") -> Optional[dict[str, Any]]: output_dict = { "residue_chem_templates": ResidueChemTemplates.json_encoder( obj.residue_chem_templates ), "monomers": { k: Monomer.json_encoder(v) for k, v in obj.monomers.items() }, "log": obj.log, } return output_dict
# Keys to check for deserialized JSON expected_json_keys = { "residue_chem_templates", "monomers", "log", } @classmethod def _decode_object(cls, obj: dict[str, Any]): # Deserializes ResidueChemTemplates from the dict to use as an input, then constructs a Polymer object # and sets its values using deserialized JSON values. residue_chem_templates = ResidueChemTemplates.from_dict( obj["residue_chem_templates"] ) polymer = cls({}, {}, residue_chem_templates) polymer.monomers = { k: Monomer.from_dict(v) for k, v in obj["monomers"].items() } polymer.log = obj["log"] return polymer # endregion
[docs] def stitch(self, residues_to_add: Optional[set[str]] = None, bonds_to_use: Optional[dict[tuple[str], list[tuple[int]]]] = None): """ Function to stitch together monomers into a single molecule. Parameters ---------- residues_to_add : set[str], optional A set of residue IDs to add to the stitched molecule. If None, all valid monomers will be added. Default is None (all valid monomers). bonds_to_use : dict[tuple[str], list[tuple[int]]], optional A dictionary of bonds to use for stitching. The keys are tuples of residue IDs, and the values are lists of tuples of atom indices (in rdkit_mol). If None, all available bonds in the polymer will be used. Default is None (all available bonds). Returns ------- Chem.Mol An RDKit molecule that results from adding bonds between the specified residues. It may contain multiple fragments if there are multiple chains or gaps. """ # stitching all valid monomers by default valid_monomers = set(self.get_valid_monomers().keys()) residues_to_add = residues_to_add or valid_monomers residues_to_add = set(residues_to_add) # verify if requested monomers are valid (have rdkit_mol) invalid_monomers = residues_to_add - valid_monomers if invalid_monomers: raise ValueError(f"Residue IDs not in valid monomers: {invalid_monomers}") if bonds_to_use is None: bonds_to_use = {} resid_to_rawmols = {res_id: (self.monomers[res_id].raw_rdkit_mol, self.monomers[res_id].input_resname) for res_id in residues_to_add} bonds_indexed_in_raw = find_inter_mols_bonds(resid_to_rawmols) invmaps = { res_id: {j: i for i, j in self.monomers[res_id].mapidx_to_raw.items()} for res_id in residues_to_add } for (res1, res2), bond_list in bonds_indexed_in_raw.items(): invmap1, invmap2 = invmaps[res1], invmaps[res2] bonds_to_use[(res1, res2)] = [(invmap1[b[0]], invmap2[b[1]]) for b in bond_list] # initialize mol and residue/bond tracking mol = Chem.Mol() residues_added = {} bonds_spent = set() # add residues and get offset in order offset = 0 for r_id in residues_to_add: res = self.monomers[r_id] residues_added[r_id] = offset offset += res.rdkit_mol.GetNumAtoms() mol = Chem.CombineMols(mol, res.rdkit_mol) # add bonds edit_mol = Chem.EditableMol(mol) for bond_key, bond_list in bonds_to_use.items(): if bond_key in bonds_spent: continue r1, r2 = bond_key if r1 in residues_added and r2 in residues_added: bonds_spent.add(bond_key) for bond in bond_list: i, j = bond edit_mol.AddBond( i + residues_added[r1], j + residues_added[r2], order=Chem.rdchem.BondType.SINGLE ) mol = edit_mol.GetMol() # review added bonds and residues if len(bonds_spent) != len(bonds_to_use): raise RuntimeError("nr of bonds added differs from bonds to use") if len(residues_added) != len(residues_to_add): raise RuntimeError("nr of residues added differs from residues to add") return mol
[docs] @classmethod def from_pdb_string( cls, pdb_string, chem_templates, mk_prep, set_template=None, residues_to_delete=None, allow_bad_res=False, bonds_to_delete=None, blunt_ends=None, wanted_altloc=None, default_altloc=None ): """ Construct a Polymer object from a PDB string. Parameters ---------- pdb_string : str The PDB string containing the polymer structure. chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class to construct the polymer. mk_prep : MoleculePreparation An instance of the MoleculePreparation class to construct the polymer. set_template : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names. If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used). residues_to_delete : set[str], optional A set of residue IDs to delete from the polymer. If None, no residues will be deleted. Default is None (no residues will be deleted). allow_bad_res : bool, optional If True, allows residues that do not match templates to be ignored (rdkit_mol will be None). If False, raises an error if any residues do not match templates. Default is False. bonds_to_delete : list[tuple[str, str]], optional A list of tuples of residue IDs to delete bonds between. If None, no bonds will be deleted. Default is None (no bonds will be deleted). blunt_ends : list[tuple[str, int]], optional A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol). If None, no blunt ends will be added. Default is None (no blunt ends will be added). wanted_altloc : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue. default_altloc : str, optional A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified. Returns ------- Polymer An instance of the Polymer class constructed from the PDB string. Raises ------ NotImplementedError If bonds_to_delete includes residue pairs with more than bond2 between them. PolymerCreationError If there are residues that could not be parsed or matched to templates, and allow_bad_res is False. """ tmp_raw_input_mols = cls._pdb_to_residue_mols( pdb_string, wanted_altloc, default_altloc, ) # from here on it duplicates self.from_prody(), but extracting # this out into a function felt like it sacrificed readibility # so I decided to keep the duplication. _delete_residues(residues_to_delete, tmp_raw_input_mols) raw_input_mols = {} res_needed_altloc = [] res_missed_altloc = [] unparsed_res = [] for res_id, stuff in tmp_raw_input_mols.items(): mol, resname, missed_altloc, needed_altloc = stuff if mol is None and missed_altloc: res_missed_altloc.append(res_id) elif mol is None and needed_altloc: res_needed_altloc.append(res_id) elif mol is None: unparsed_res.append(res_id) else: raw_input_mols[res_id] = (mol, resname) bonds = find_inter_mols_bonds(raw_input_mols) if bonds_to_delete is not None: for res1, res2 in bonds_to_delete: popped = () if (res1, res2) in bonds: popped = bonds.pop((res1, res2)) elif (res2, res1) in bonds: popped = bonds.pop((res2, res1)) if len(popped) >= 2: msg = ( "can't delete bonds for residue pairs that have more" " than one bond between them" ) raise NotImplementedError(msg) polymer = cls( raw_input_mols, bonds, chem_templates, mk_prep, set_template, blunt_ends, ) unmatched_res = polymer.get_ignored_monomers() handle_parsing_situations( unmatched_res, unparsed_res, allow_bad_res, res_missed_altloc, res_needed_altloc, ) return polymer
# region adapted from from_pdb_string
[docs] @classmethod def from_pqr_string( cls, pqr_string, chem_templates, mk_prep, set_template=None, residues_to_delete=None, allow_bad_res=False, bonds_to_delete=None, blunt_ends=None, ): """ Construct a Polymer object from a PQR string. Adapted from PDB2PQR. Parameters ---------- pqr_string : str The PQR string containing the polymer structure. chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class to construct the polymer. mk_prep : MoleculePreparation An instance of the MoleculePreparation class to construct the polymer. set_template : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names. If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used). residues_to_delete : set[str], optional A set of residue IDs to delete from the polymer. If None, no residues will be deleted. Default is None (no residues will be deleted). allow_bad_res : bool, optional If True, allows residues that do not match templates to be ignored (rdkit_mol will be None). If False, raises an error if any residues do not match templates. Default is False. bonds_to_delete : list[tuple[str, str]], optional A list of tuples of residue IDs to delete bonds between. If None, no bonds will be deleted. Default is None (no bonds will be deleted). blunt_ends : list[tuple[str, int]], optional A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol). If None, no blunt ends will be added. Default is None (no blunt ends will be added). Returns ------- Polymer An instance of the Polymer class constructed from the PDB string. Raises ------ NotImplementedError If bonds_to_delete includes residue pairs with more than bond2 between them. PolymerCreationError If there are residues that could not be parsed or matched to templates, and allow_bad_res is False. """ tmp_raw_input_mols = cls._pqr_to_residue_mols( pqr_string, ) # from here on it duplicates self.from_prody(), but extracting # this out into a function felt like it sacrificed readibility # so I decided to keep the duplication. _delete_residues(residues_to_delete, tmp_raw_input_mols) raw_input_mols = {} res_needed_altloc = [] res_missed_altloc = [] unparsed_res = [] for res_id, stuff in tmp_raw_input_mols.items(): mol, resname, missed_altloc, needed_altloc = stuff if mol is None and missed_altloc: res_missed_altloc.append(res_id) elif mol is None and needed_altloc: res_needed_altloc.append(res_id) elif mol is None: unparsed_res.append(res_id) else: raw_input_mols[res_id] = (mol, resname) bonds = find_inter_mols_bonds(raw_input_mols) if bonds_to_delete is not None: for res1, res2 in bonds_to_delete: popped = () if (res1, res2) in bonds: popped = bonds.pop((res1, res2)) elif (res2, res1) in bonds: popped = bonds.pop((res2, res1)) if len(popped) >= 2: msg = ( "can't delete bonds for residue pairs that have more" " than one bond between them" ) raise NotImplementedError(msg) polymer = cls( raw_input_mols, bonds, chem_templates, mk_prep, set_template, blunt_ends, get_atomprop_from_raw = {"PQRCharge": 0.}, ) if polymer.log["matched_with_H_anomaly"]: msg = "" for res_id, (template_name, h_info) in polymer.log["matched_with_H_anomaly"].items(): h_miss = h_info.get('H_miss', 0) h_excess = h_info.get('H_excess', 0) msg += f"Residue {res_id} matched with template '{template_name}' has H discrepancy: {h_miss} missing, {h_excess} excess. \n" raise PolymerCreationError(msg + "These discrepancies may compromise the validity of the charge assignment from PQR, making the charges inapplicable to the processed receptor. \n") unmatched_res = polymer.get_ignored_monomers() handle_parsing_situations( unmatched_res, unparsed_res, allow_bad_res, res_missed_altloc, res_needed_altloc, ) return polymer
# endregion
[docs] @classmethod def from_prody( cls, prody_obj: Union[Selection, AtomGroup], chem_templates, mk_prep, set_template=None, residues_to_delete=None, allow_bad_res=False, bonds_to_delete=None, blunt_ends=None, wanted_altloc: Optional[dict]=None, default_altloc: Optional[str]=None, ): """ Construct a Polymer object from a ProDy Selection or AtomGroup object. Parameters ---------- prody_obj : ProDy.Selection or ProDy.AtomGroup The ProDy object to construct the polymer from. chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class to construct the polymer. mk_prep : MoleculePreparation An instance of the MoleculePreparation class to construct the polymer. set_template : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names. If None, no specific templates will be set. Default is None (the built-in ResidueTemplate ambiguious name mapping will be used). residues_to_delete : set[str], optional A set of residue IDs to delete from the polymer. If None, no residues will be deleted. Default is None (no residues will be deleted). allow_bad_res : bool, optional If True, allows residues that do not match templates to be ignored (rdkit_mol will be None). If False, raises an error if any residues do not match templates. Default is False. bonds_to_delete : list[tuple[str, str]], optional A list of tuples of residue IDs to delete bonds between. If None, no bonds will be deleted. Default is None (no bonds will be deleted). blunt_ends : list[tuple[str, int]], optional A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol). If None, no blunt ends will be added. Default is None (no blunt ends will be added). wanted_altloc : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue. default_altloc : str, optional A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified. Returns ------- Polymer An instance of the Polymer class constructed from the PDB string. Raises ------ NotImplementedError If bonds_to_delete includes residue pairs with more than bond2 between them. PolymerCreationError If there are residues that could not be parsed or matched to templates, and allow_bad_res is False. """ tmp_raw_input_mols = cls._prody_to_residue_mols( prody_obj, wanted_altloc, default_altloc, ) # from here on it duplicates self.from_pdb_string(), but extracting # this out into a function felt like it sacrificed readibility # so I decided to keep the duplication. _delete_residues(residues_to_delete, tmp_raw_input_mols) raw_input_mols = {} res_needed_altloc = [] res_missed_altloc = [] unparsed_res = [] for res_id, stuff in tmp_raw_input_mols.items(): mol, resname, missed_altloc, needed_altloc = stuff if mol is None and missed_altloc: res_missed_altloc.append(res_id) elif mol is None and needed_altloc: res_needed_altloc.append(res_id) elif mol is None: unparsed_res.append(res_id) else: raw_input_mols[res_id] = (mol, resname) bonds = find_inter_mols_bonds(raw_input_mols) if bonds_to_delete is not None: for res1, res2 in bonds_to_delete: popped = () if (res1, res2) in bonds: popped = bonds.pop((res1, res2)) elif (res2, res1) in bonds: popped = bonds.pop((res2, res1)) if len(popped) >= 2: msg = ( "can't delete bonds for residue pairs that have more" " than one bond between them" ) raise NotImplementedError(msg) polymer = cls( raw_input_mols, bonds, chem_templates, mk_prep, set_template, blunt_ends, ) unmatched_res = polymer.get_ignored_monomers() handle_parsing_situations( unmatched_res, unparsed_res, allow_bad_res, res_missed_altloc, res_needed_altloc, ) return polymer
[docs] def parameterize(self, mk_prep, get_atomprop_from_raw = None): """ Parameterize the monomers in the polymer using the provided MoleculePreparation instance. Parameters ---------- mk_prep : MoleculePreparation An instance of the MoleculePreparation class to parameterize the monomers. get_atomprop_from_raw : dict, optional A dictionary mapping atom properties to pass from raw_input_mols. The keys are the property names and the values are the default values. Returns ------- None """ for residue_id in self.get_valid_monomers(): self.monomers[residue_id].parameterize(mk_prep, residue_id, get_atomprop_from_raw = get_atomprop_from_raw)
@staticmethod def _build_rdkit_mol(raw_mol, template, mapping, nr_missing_H): """ Build an RDKit molecule from a raw molecule according to a ResidueTemplate. Parameters ---------- raw_mol : Chem.Mol The raw molecule matched to the template. template : ResidueTemplate The template to match the raw molecule to. mapping : dict[int, int] A dictionary mapping atom indices from the raw molecule to the template. nr_missing_H : int The number of missing hydrogen atoms in the raw molecule compared to the template. Returns ------- Chem.Mol The RDKit molecule built from the raw molecule and template. """ rdkit_mol = Chem.Mol(template.mol) # making a copy conf = Chem.Conformer(rdkit_mol.GetNumAtoms()) input_conf = raw_mol.GetConformer() for i, j in mapping.items(): conf.SetAtomPosition(i, input_conf.GetAtomPosition(j)) rdkit_mol.AddConformer(conf, assignId=True) if nr_missing_H: # add positions to Hs missing in raw_mol if rdkit_mol.GetNumAtoms() != len(mapping) + nr_missing_H: raise RuntimeError( f"nr of atoms ({rdkit_mol.GetNumAtoms()}) != " f"{len(mapping)=} + {nr_missing_H=}" ) idxs = [i for i in range(rdkit_mol.GetNumAtoms()) if i not in mapping] update_H_positions(rdkit_mol, idxs) return rdkit_mol @staticmethod def _get_best_missing_Hs(results): """ Get the best results based on the number of missing H atoms. Parameters ---------- results : list[dict] A list of dictionaries containing the results of matching raw molecules to templates. Returns ------- best_idxs : list[int] A list of indices of the best results based on the number of missing H atoms. fail_log : list[list[str]] A list of lists containing failure messages for each result. """ min_missing_H = 999999 best_idxs = [] fail_log = [] for i, result in enumerate(results): fail_log.append([]) if result["heavy"]["missing"] > 0: fail_log[-1].append("heavy missing") if result["heavy"]["excess"] > 0: fail_log[-1].append("heavy excess") if len(result["H"]["excess"]) > 0: fail_log[-1].append("H excess") if len(result["bonds"]["excess"]) > 0: fail_log[-1].append("bonds excess") if len(result["bonds"]["missing"]) > 0: fail_log[-1].append(f"bonds missing at {result['bonds']['missing']}") if len(fail_log[-1]): continue if result["H"]["missing"] < min_missing_H: best_idxs = [] min_missing_H = result["H"]["missing"] if result["H"]["missing"] == min_missing_H: best_idxs.append(i) return best_idxs, fail_log @classmethod def _get_monomers( cls, raw_input_mols, ambiguous, residue_chem_templates, set_template, bonds, blunt_ends, ): """ Process, parameterize and populate monomers of a Polymer from raw_input_mols and residue_chem_templates. Parameters ---------- raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names. ambiguous : dict[str, list[str]] Mapping between input residue names (e.g. the three-letter residue name from PDB files) and IDs (strings) of ResidueTemplates. residue_chem_templates : ResidueChemTemplates An instance of the ResidueChemTemplates class to construct the polymer. set_template : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the user-specified ResidueTemplate names. bonds : dict[tuple[str], list[tuple[int]]] A dictionary of bonds between residues. blunt_ends : list[tuple[str, int]], optional A list of tuples where each tuple is a residue ID and a 0-based atom index (in raw_mol). If None, no blunt ends will be added. Default is None (no blunt ends will be added). Returns ------- monomers : dict[str, Monomer] A dictionary mapping residue IDs to Monomer objects. log : dict A dictionary containing logs of the matching process, including information about chosen templates, missing atoms, and unmatched residues. """ residue_templates = residue_chem_templates.residue_templates monomers = {} log = { "chosen_by_fewest_missing_H": {}, "chosen_by_default": {}, "matched_with_H_anomaly": {}, "matched_with_excess_bond": [], "no_match": [], "no_mol": [], "msg": "", } for residue_key, (raw_mol, input_resname) in raw_input_mols.items(): if raw_mol is None: monomers[residue_key] = Monomer( None, None, None, input_resname, None ) log["no_mol"].append(residue_key) logger.warning(f"molecule for {residue_key=} is None") continue raw_mol_has_H = sum([a.GetAtomicNum() == 1 for a in raw_mol.GetAtoms()]) > 0 excess_H_ok = False if set_template is not None and residue_key in set_template: excess_H_ok = True # e.g. allow set LYN (NH2) from LYS (NH3+) template_key = set_template[residue_key] # e.g. HID, NALA if template_key not in residue_templates: if template_key in ambiguous: raise RuntimeError(f"Can't assign an ambiguous tamplate_key ({template_key}) to residue ({residue_key}). ") raise RuntimeError(f"Assigned tamplate_key ({template_key}) for residue ({residue_key}) is not in residue_templates. ") template = residue_templates[template_key] candidate_template_keys = [set_template[residue_key]] candidate_templates = [template] elif input_resname not in ambiguous: template_key = input_resname template = residue_templates[template_key] candidate_template_keys = [template_key] candidate_templates = [template] elif len(ambiguous[input_resname]) == 1: template_key = ambiguous[input_resname][0] template = residue_templates[template_key] candidate_template_keys = [template_key] candidate_templates = [template] else: candidate_template_keys = [] candidate_templates = [] for key in ambiguous[input_resname]: template = residue_templates[key] candidate_templates.append(template) candidate_template_keys.append(key) # gather raw_mol atoms that have bonds or blunt ends if blunt_ends is None: blunt_ends = [] raw_atoms_with_bonds = [] for (r1, r2), bond_list in bonds.items(): for i, j in bond_list: if r1 == residue_key: raw_atoms_with_bonds.append(i) if r2 == residue_key: raw_atoms_with_bonds.append(j) all_stats = { "heavy_missing": [], "heavy_excess": [], "H_excess": [], "H_missing": [], "bonded_atoms_missing": [], "bonded_atoms_excess": [], } mappings = [] for index, template in enumerate(candidate_templates): # match intra-residue graph match_stats, mapping = template.match(raw_mol) mappings.append(mapping) # match inter-residue bonds atoms_with_bonds = set() from_raw = {value: key for (key, value) in mapping.items()} for raw_index in raw_atoms_with_bonds: if raw_index in from_raw: # bonds can occur on atoms the template does not have atom_index = from_raw[raw_index] atoms_with_bonds.add(atom_index) # we treat blunt ends like bonds for res_id, atom_idx in blunt_ends: if res_id == residue_key: atoms_with_bonds.add(from_raw[atom_idx]) expected = set(template.link_labels) bonded_atoms_found = atoms_with_bonds.intersection(expected) bonded_atoms_missing = expected.difference(atoms_with_bonds) bonded_atoms_excess = atoms_with_bonds.difference(expected) all_stats["heavy_missing"].append(match_stats["heavy"]["missing"]) all_stats["heavy_excess"].append(match_stats["heavy"]["excess"]) all_stats["H_excess"].append(match_stats["H"]["excess"]) all_stats["H_missing"].append(match_stats["H"]["missing"]) all_stats["bonded_atoms_missing"].append(bonded_atoms_missing) all_stats["bonded_atoms_excess"].append(bonded_atoms_excess) passed = [] embedded_indices = [index for index, template in enumerate(candidate_templates) if len(template.link_labels) >= 2] # 1st round for i in embedded_indices: if ( all_stats["heavy_missing"][i] or all_stats["heavy_excess"][i] or all_stats["H_excess"][i] or all_stats["bonded_atoms_missing"][i] or len(all_stats["bonded_atoms_excess"][i]) ): continue passed.append(i) # 2nd round if len(passed) == 0: for i in embedded_indices: auto_blunt = set() for j, padder_label in candidate_templates[i].link_labels.items(): if residue_chem_templates.padders[padder_label].auto_blunt: auto_blunt.add(j) if ( all_stats["heavy_missing"][i] or all_stats["heavy_excess"][i] or (not set(all_stats["H_excess"][i]) <= set(candidate_templates[i].link_labels) and not excess_H_ok) or not all_stats["bonded_atoms_missing"][i] <= auto_blunt ): continue passed.append(i) # 3rd round if len(passed) == 0 or any(all_stats["H_excess"][i] for i in passed): for i in range(len(candidate_templates)): if ( all_stats["heavy_missing"][i] or all_stats["heavy_excess"][i] or (all_stats["H_excess"][i] and not excess_H_ok) or len(all_stats["bonded_atoms_missing"][i]) ): continue if i not in passed: passed.append(i) if len(passed) == 0: template_key = None template = None mapping = None m = f"No template matched for {residue_key=}" + eol m += f"tried {len(candidate_templates)} templates for {residue_key=}" m += f"{excess_H_ok=}" m += eol for i in range(len(all_stats["H_excess"])): heavy_miss = all_stats["heavy_missing"][i] heavy_excess = all_stats["heavy_excess"][i] H_excess = all_stats["H_excess"][i] bond_miss = all_stats["bonded_atoms_missing"][i] bond_excess = all_stats["bonded_atoms_excess"][i] tkey = candidate_template_keys[i] m += ( f"{tkey:10} {heavy_miss=} {heavy_excess=} {H_excess=} {bond_miss=} {bond_excess=}" + eol ) logger.warning(m) elif len(passed) == 1 or not raw_mol_has_H: index = passed[0] template_key = candidate_template_keys[index] template = candidate_templates[index] mapping = mappings[index] H_miss = all_stats["H_missing"][index] else: min_missing_H = 999999 for i, index in enumerate(passed): H_missed = all_stats["H_missing"][index] if H_missed < min_missing_H: best_idxs = [] min_missing_H = H_missed if H_missed == min_missing_H: best_idxs.append(index) if len(best_idxs) > 1: number_excess_H = [len(all_stats["H_excess"][index]) for index in passed] min_excess_H = min(number_excess_H) best_idxs = [index for index in passed if len(all_stats["H_excess"][index]) == min_excess_H] if len(best_idxs) > 1: tied = " ".join(candidate_template_keys[i] for i in best_idxs) m = f"for {residue_key=}, {len(passed)} have passed: " tkeys = [candidate_template_keys[i] for i in passed] m += f"{tkeys} and tied for fewest missing and excess H: {tied} " raise RuntimeError(m) index = best_idxs[0] template_key = candidate_template_keys[index] template = residue_templates[template_key] mapping = mappings[index] H_miss = all_stats["H_missing"][index] log["chosen_by_fewest_missing_H"][residue_key] = template_key H_miss = all_stats["H_missing"][index] H_excess = all_stats["H_excess"][index] if H_miss or H_excess: log["matched_with_H_anomaly"][residue_key] = ( template_key, {"H_miss": H_miss, "H_excess": len(H_excess)} ) bond_excess = all_stats["bonded_atoms_excess"][index] if bond_excess: log["matched_with_excess_bond"].append(residue_key) logger.warning(f"matched with excess inter-residue bond(s): {residue_key}") if template is None: rdkit_mol = None atom_names = None mapping = None else: rdkit_mol = cls._build_rdkit_mol( raw_mol, template, mapping, H_miss, ) atom_names = template.atom_names monomers[residue_key] = Monomer( raw_mol, rdkit_mol, mapping, input_resname, template_key, atom_names, ) monomers[residue_key].template = template if template is not None and template.link_labels is not None: mapping_inv = monomers[ residue_key ].mapidx_from_raw # {j: i for (i, j) in mapping.items()} # TODO check here mapping_inv unnused link_labels = {i: label for i, label in template.link_labels.items()} monomers[residue_key].link_labels = link_labels return monomers, log @staticmethod def _build_padded_mols(monomers, bonds, padders): """ Build padded molecules from monomers and bonds. Parameters ---------- monomers : dict[str, Monomer] A dictionary mapping residue IDs to Monomer objects. bonds : dict[tuple[str], list[tuple[int]]] A dictionary of bonds between residues. padders : dict[str, function] A dictionary mapping link labels to functions that pad the molecules. Returns ------- padded_mols : dict[str, tuple[Chem.Mol, dict[int, int]]] A dictionary mapping residue IDs to tuples of padded molecules and their corresponding mapping dictionaries. """ padded_mols = {} bond_use_count = {key: 0 for key in bonds} for ( residue_id, monomer, ) in monomers.items(): if monomer.rdkit_mol is None: continue padded_mol = monomer.rdkit_mol mapidx_pad = { atom.GetIdx(): atom.GetIdx() for atom in padded_mol.GetAtoms() } for atom_index, link_label in monomer.link_labels.items(): adjacent_rid = None adjacent_mol = None adjacent_atom_index = None for (r1_id, r2_id), bond_list in bonds.items(): # TODO the second and subsequent bonds between a pair of # residues will not update the padding atoms with the # positions of the adjacent residues. This is OK, the same # happens for blunt residues, because the adjacent residue # is missing. i1, i2 = bond_list[0] if r1_id == residue_id and i1 == atom_index: adjacent_rid = r2_id adjacent_atom_index = i2 break elif r2_id == residue_id and i2 == atom_index: adjacent_rid = r1_id adjacent_atom_index = i1 break if adjacent_rid is not None: adjacent_mol = monomers[adjacent_rid].rdkit_mol bond_use_count[(r1_id, r2_id)] += 1 padded_mol, mapidx = padders[link_label]( padded_mol, adjacent_mol, atom_index, adjacent_atom_index ) tmp = {} for i, j in enumerate(mapidx): if j is None: continue # new padding atom if j not in mapidx_pad: continue # padding atom from previous iteration for another link_label tmp[i] = mapidx_pad[j] mapidx_pad = tmp # update position of hydrogens bonded to link atoms inv = {j: i for (i, j) in mapidx_pad.items()} padded_idxs_to_update = [] no_pad_idxs_to_update = [] for atom_index in monomer.link_labels: heavy_atom = monomer.rdkit_mol.GetAtomWithIdx(atom_index) for neighbor in heavy_atom.GetNeighbors(): if neighbor.GetAtomicNum() != 1: continue if neighbor.GetIdx() in monomer.mapidx_to_raw: # index of H exists in mapidx_to_raw, which means that # the raw_input_mol had the hydrogen. Thus, we do not # want to update its coordiantes. continue no_pad_idxs_to_update.append(neighbor.GetIdx()) padded_idxs_to_update.append(inv[neighbor.GetIdx()]) update_H_positions(padded_mol, padded_idxs_to_update) source = padded_mol.GetConformer() destination = monomer.rdkit_mol.GetConformer() for i, j in zip(no_pad_idxs_to_update, padded_idxs_to_update): destination.SetAtomPosition(i, source.GetAtomPosition(j)) # can invert chirality in 3D positions padded_mols[residue_id] = (padded_mol, mapidx_pad) # verify that all bonds resulted in padding err_msg = "" for key, count in bond_use_count.items(): if count != 2: err_msg += ( f"expected two paddings for {key} {bonds[key]}, padded {count}" + eol ) if len(err_msg): raise RuntimeError(err_msg) return padded_mols
[docs] def flexibilize_sidechain(self, residue_id, mk_prep): """ Set the sidechain of a residue as flexible. The Monomer must have been processed and have valid attributes before calling this method. Parameters ---------- residue_id : str The ID of the residue to be made flexible. mk_prep : MoleculePreparation An instance of the MoleculePreparation class to construct the polymer. Returns ------- None """ monomer = self.monomers[residue_id] inv = {j: i for i, j in monomer.molsetup_mapidx.items()} link_atoms = [inv[i] for i in monomer.template.link_labels] if len(link_atoms) == 0: raise RuntimeError( "can't define a sidechain without bonds to other residues" ) # TODO: rewrite this to work better with new MoleculeSetups graph = {atom.index: atom.graph for atom in monomer.molsetup.atoms} for i in range(len(link_atoms) - 1): start_node = link_atoms[i] end_nodes = [k for (j, k) in enumerate(link_atoms) if j != i] backbone_paths = find_graph_paths(graph, start_node, end_nodes) for path in backbone_paths: for x in range(len(path) - 1): idx1 = min(path[x], path[x + 1]) idx2 = max(path[x], path[x + 1]) monomer.molsetup.bond_info[(idx1, idx2)].rotatable = False monomer.is_movable = True mk_prep.calc_flex( monomer.molsetup, root_atom_index=link_atoms[0], ) molsetup = monomer.molsetup is_rigid_atom = [False for _ in molsetup.atoms] graph = molsetup.flexibility_model["rigid_body_graph"] root_body_idx = molsetup.flexibility_model["root"] conn = molsetup.flexibility_model["rigid_body_connectivity"] rigid_index_by_atom = molsetup.flexibility_model["rigid_index_by_atom"] # from the root, use only the atom that is bonded to the only rotatable bond for other_body_idx in graph[root_body_idx]: root_link_atom_idx = conn[(root_body_idx, other_body_idx)][0] for atom_idx, body_idx in rigid_index_by_atom.items(): if body_idx != root_body_idx or atom_idx == root_link_atom_idx: monomer.is_flexres_atom[atom_idx] = True return
@staticmethod def _add_if_new(to_dict, key, value, repeat_log): if key in to_dict: repeat_log.add(key) else: to_dict[key] = value return @staticmethod def _pdb_to_residue_mols( pdb_string, wanted_altloc: Optional[dict[str, str]]=None, default_altloc: Optional[str]=None, ): """ Convert a PDB string to residue molecules. Parameters ---------- pdb_string : str The PDB string to convert. wanted_altloc : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue. default_altloc : str, optional A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified. Returns ------- raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names. Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs. Raises ------ ValueError If there are interrupted residues or if each residue key does not have exactly one resname. """ blocks_by_residue = {} reskey_to_resname = {} reskey = None buffered_reskey = None buffered_resname = None # residues in non-consecutive lines due to TER or another res interrupted_residues = set() pdb_block = [] for line in pdb_string.splitlines(True): if line.startswith("TER") and reskey is not None: Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues) blocks_by_residue[reskey] = pdb_block pdb_block = [] reskey = None buffered_reskey = None if line.startswith("ATOM") or line.startswith("HETATM"): atomname = line[12:16].strip() altloc = line[16:17].strip() resname = line[17:20].strip() chainid = line[21:22].strip() resnum = int(line[22:26].strip()) icode = line[26:27].strip() x = float(line[30:38]) y = float(line[38:46]) z = float(line[46:54]) element = line[76:78].strip() reskey = f"{chainid}:{resnum}{icode}" # e.g. ":42", "A:42B" reskey_to_resname.setdefault(reskey, set()) reskey_to_resname[reskey].add(resname) atom = AtomField( atomname, altloc, resname, chainid, resnum, icode, x, y, z, element, ) if reskey == buffered_reskey: # this line continues existing residue pdb_block.append(atom) else: if buffered_reskey is not None: Polymer._add_if_new( blocks_by_residue, buffered_reskey, pdb_block, interrupted_residues, ) buffered_reskey = reskey pdb_block = [atom] if pdb_block: # there was not a TER line Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues) if interrupted_residues: msg = f"interrupted residues in PDB: {interrupted_residues}" raise ValueError(msg) # verify that each identifier (e.g. "A:17" has a single resname violations = {k: v for k, v in reskey_to_resname.items() if len(v) != 1} if len(violations): msg = "each residue key must have exactly 1 resname" + eol msg += f"but got {violations=}" raise ValueError(msg) if wanted_altloc is None: wanted_altloc = {} raw_input_mols = {} for reskey, atom_field_list in blocks_by_residue.items(): requested_altloc = wanted_altloc.get(reskey, None) pdbmol, _, missed_altloc, needed_altloc = _aux_altloc_mol_build( atom_field_list, requested_altloc, default_altloc, ) resname = list(reskey_to_resname[reskey])[0] # verified length 1 raw_input_mols[reskey] = (pdbmol, resname, missed_altloc, needed_altloc) return raw_input_mols @staticmethod def _pqr_to_residue_mols( pqr_string ): """ Convert a PQR string to residue molecules. Parameters ---------- pqr_string : str The PQR string to convert. Returns ------- raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names. Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs. Raises ------ ValueError If there are interrupted residues or if each residue key does not have exactly one resname. """ blocks_by_residue = {} blocks_qr = {} reskey_to_resname = {} reskey = None buffered_reskey = None # residues in non-consecutive lines due to TER or another res interrupted_residues = set() pdb_block = [] def get_pqr_atom_items(pqr_line): """based on pdb2pqr.structures.Atom.from_pqr_line""" items = [w.strip() for w in pqr_line.split()] token = items.pop(0) if token in [ "REMARK", "TER", "END", "HEADER", "TITLE", "COMPND", "SOURCE", "KEYWDS", "EXPDTA", "AUTHOR", "REVDAT", "JRNL", ]: return None elif token in ["ATOM", "HETATM"]: return items elif token[:4] == "ATOM": return token[4:] + items elif token[:6] == "HETATM": return token[6:] + items else: err = f"Unable to parse PQR line: {pqr_line}" raise ValueError(err) def atom_from_pqr_items(atom_pqr_items: list[str]) -> tuple[AtomField, float]: """get AtomField from items read from a PQR line""" if not atom_pqr_items: return None atom_serial = int(atom_pqr_items.pop(0)) # Meeko doesn't need atom_serial (ID) atomname = atom_pqr_items.pop(0) element = next((char for char in atomname if char.isalpha()), None) if element is None: err = f"Unable to parse element from PQR atomname: {atomname}" raise ValueError(err) element = element.upper() altloc = "" # PQR doesn't have altloc resname = atom_pqr_items.pop(0) token = atom_pqr_items.pop(0) chainid = "" # Optional in PQR try: resnum = int(token) # Must be int in PQR except ValueError: chainid = token resnum = int(atom_pqr_items.pop(0)) token = atom_pqr_items.pop(0) icode = "" # Optional in PQR try: x = float(token) except ValueError: icode = token x = float(atom_pqr_items.pop(0)) y = float(atom_pqr_items.pop(0)) z = float(atom_pqr_items.pop(0)) charge = float(atom_pqr_items.pop(0)) radius = float(atom_pqr_items.pop(0)) return ( AtomField( atomname, altloc, resname, chainid, resnum, icode, x, y, z, element, ), charge, radius, ) # region adapted from _pdb_to_residue_mols for line in pqr_string.splitlines(True): pqr_items = get_pqr_atom_items(line) if pqr_items is None and reskey is not None: Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues) blocks_by_residue[reskey] = pdb_block blocks_qr[reskey] = block_qr pdb_block = [] block_qr = [] reskey = None buffered_reskey = None if pqr_items: atom, pqr_charge, pqr_radius = atom_from_pqr_items(pqr_items) reskey = f"{atom.chain}:{atom.resnum}{atom.icode}" resname = atom.resname reskey_to_resname.setdefault(reskey, set()) reskey_to_resname[reskey].add(resname) if reskey == buffered_reskey: # this line continues existing residue pdb_block.append(atom) block_qr.append((pqr_charge, pqr_radius)) else: if buffered_reskey is not None: Polymer._add_if_new( blocks_by_residue, buffered_reskey, pdb_block, interrupted_residues, ) blocks_qr[buffered_reskey] = block_qr buffered_reskey = reskey pdb_block = [atom] block_qr = [(pqr_charge, pqr_radius)] if pdb_block: # there was not a TER line Polymer._add_if_new(blocks_by_residue, reskey, pdb_block, interrupted_residues) blocks_qr[reskey] = block_qr if interrupted_residues: msg = f"interrupted residues in PDB: {interrupted_residues}" raise ValueError(msg) # verify that each identifier (e.g. "A:17" has a single resname violations = {k: v for k, v in reskey_to_resname.items() if len(v) != 1} if len(violations): msg = "each residue key must have exactly 1 resname" + eol msg += f"but got {violations=}" raise ValueError(msg) # endregion raw_input_mols = {} # PQR shouldn't have altlocs requested_altloc = None default_altloc = "" for reskey, atom_field_list in blocks_by_residue.items(): requested_altloc = None pdbmol, _, missed_altloc, needed_altloc = _aux_altloc_mol_build( atom_field_list, requested_altloc, default_altloc, ) for atom, pqr_prop in zip(pdbmol.GetAtoms(), blocks_qr[reskey]): atom.SetDoubleProp("PQRCharge", pqr_prop[0]) atom.SetDoubleProp("PQRRadius", pqr_prop[1]) resname = list(reskey_to_resname[reskey])[0] # verified length 1 raw_input_mols[reskey] = (pdbmol, resname, missed_altloc, needed_altloc) return raw_input_mols @staticmethod def _prody_to_residue_mols( prody_obj: ALLOWED_PRODY_TYPES, wanted_altloc_dict: Optional[dict] = None, default_altloc: Optional[str] = None, ) -> dict: """ Convert a ProDy object to residue molecules. Parameters ---------- prody_obj : ALLOWED_PRODY_TYPES The ProDy object to convert (can be one of the allowed types in .utils.prodyutils.ALLOWED_PRODY_TYPES). wanted_altloc_dict : dict[str, str], optional A dictionary mapping residue IDs in the format <chain>:<resnum> such as "A:42" to the desired alternate location (altloc) for that residue. default_altloc : str, optional A string representing the default alternate location (altloc) to be used for residues that do not have a specific altloc specified. Returns ------- raw_input_mols : dict[str, tuple[Chem.Mol, str]] A dictionary mapping residue IDs to tuples of raw molecules and their corresponding residue names. Each tuple contains the raw molecule, the residue name, a list of missing altlocs, and a list of needed altlocs. """ if wanted_altloc_dict is None: wanted_altloc_dict = {} raw_input_mols = {} reskey_to_resname = {} # generate macromolecule hierarchy iterator hierarchy = prody_obj.getHierView() # iterate chains for chain in hierarchy.iterChains(): # iterate residues for res in chain.iterResidues(): # gather residue info chain_id = str(res.getChid()).strip() res_name = str(res.getResname()).strip() res_num = int(res.getResnum()) icode = str(res.getIcode()).strip() reskey = f"{chain_id}:{res_num}{icode}" reskey_to_resname.setdefault(reskey, set()) reskey_to_resname[reskey].add(res_name) requested_altloc = wanted_altloc_dict.get(reskey, None) # we are not sanitizing because protonated LYS don't have the # formal charge set on the N and Chem.SanitizeMol raises error # Chem.SanitizeMol(prody_mol) prody_mol, missed_altloc, needed_altloc = prody_to_rdkit( res, sanitize=False, requested_altloc=requested_altloc, default_altloc=default_altloc, ) raw_input_mols[reskey] = (prody_mol, res_name, missed_altloc, needed_altloc) return raw_input_mols
[docs] def to_pdb(self, new_positions: Optional[dict]=None): """ Convert the polymer to a PDB string while (optionally) updating the coordinates of specified monomers. Parameters ---------- new_positions: dict[str, dict[int, tuple[float, float, float]]], optional A dictionary mapping residue IDs to dictionaries of atom indices and their new coordinates. Returns ------- pdb_string: str The PDB string representation of the polymer. Raises ------ ValueError If any residue IDs in new_positions are not valid monomers. """ if new_positions is None: new_positions = {} valid_monomers = self.get_valid_monomers() # check that residue IDs passed in new_positions are valid unknown_res_ids = set() for res_id in new_positions: if res_id not in valid_monomers: unknown_res_ids.add(res_id) if unknown_res_ids: msg = f"Residue IDs not in valid monomers: {unknown_res_ids}" raise ValueError(msg) pdbout = "" atom_count = 0 pdb_line = "{:6s}{:5d} {:^4s} {:3s} {:1s}{:4d}{:1s} {:8.3f}{:8.3f}{:8.3f} {:>2s} " pdb_line += eol for res_id in self.get_valid_monomers(): rdkit_mol = self.monomers[res_id].rdkit_mol if res_id in new_positions: positions = get_updated_positions( self.monomers[res_id], new_positions[res_id], ) else: positions = rdkit_mol.GetConformer().GetPositions() chain, resnum = res_id.split(":") if resnum[-1].isalpha(): icode = resnum[-1] resnum = resnum[:-1] else: icode = "" resnum = int(resnum) for i, atom in enumerate(rdkit_mol.GetAtoms()): atom_count += 1 props = atom.GetPropsAsDict() atom_name = self.monomers[res_id].atom_names[i] x, y, z = positions[i] element = mini_periodic_table[atom.GetAtomicNum()] pdbout += pdb_line.format( "ATOM", atom_count, atom_name, self.monomers[res_id].input_resname, chain, resnum, icode, x, y, z, element, ) return pdbout
[docs] def export_static_atom_params(self): """ Export static atom parameters from the polymer. Returns ------- atom_params: dict A dictionary containing atom parameters. coords: list A list of coordinates for the atoms. """ atom_params = {} counter_atoms = 0 coords = [] dedicated_attribute = ( "charge", "atom_type", ) # molsetup has a dedicated attribute for res_id in self.get_valid_monomers(): molsetup = self.monomers[res_id].molsetup wanted_atom_indices = [] for atom in molsetup.atoms: if not atom.is_ignore and not self.monomers[res_id].is_flexres_atom[atom.index]: wanted_atom_indices.append(atom.index) coords.append(molsetup.get_coord(atom.index)) for key, values in molsetup.atom_params.items(): atom_params.setdefault(key, [None] * counter_atoms) # add new "column" for i in wanted_atom_indices: atom_params[key].append(values[i]) # This was reworked to specifically address the new MoleculeSetup structure. Needs re-thinking charge_dict = {atom.index: atom.charge for atom in molsetup.atoms} atom_type_dict = {atom.index: atom.atom_type for atom in molsetup.atoms} for key in dedicated_attribute: atom_params.setdefault(key, [None] * counter_atoms) # add new "column" if key == "charge": values_dict = charge_dict else: values_dict = atom_type_dict for i in wanted_atom_indices: atom_params[key].append(values_dict[i]) counter_atoms += len(wanted_atom_indices) added_keys = set(molsetup.atom_params).union(dedicated_attribute) for key in set(atom_params).difference( added_keys ): # <key> missing in current molsetup atom_params[key].extend( [None] * len(wanted_atom_indices) ) # fill in incomplete "row" if hasattr(self, "param_rename"): # e.g. "gasteiger" -> "q" for key, new_key in self.param_rename.items(): atom_params[new_key] = atom_params.pop(key) return atom_params, coords
# region Filtering Residues
[docs] def get_ignored_monomers(self): """ Get monomers that are ignored. Returns ------- dict[str, Monomer] A dictionary of ignored monomers. The keys are the residue IDs and the values are the corresponding Monomer objects. """ return {k: v for k, v in self.monomers.items() if v.rdkit_mol is None}
[docs] def get_valid_monomers(self): """ Get monomers that are valid (not ignored). Returns ------- dict[str, Monomer] A dictionary of valid monomers. The keys are the residue IDs and the values are the corresponding Monomer objects. """ return {k: v for k, v in self.monomers.items() if v.rdkit_mol is not None}
# endregion
[docs] def add_rotamers_to_polymer_molsetups(rotamer_states_list, polymer): """ Add rotamer states to the monomers' molecule setups in a polymer and get the state indices. Parameters ---------- rotamer_states_list : list[dict[str, list[float]]] A list of dictionaries, where each dictionary contains residue IDs as keys and lists of angles as values. polymer : Polymer The polymer object to which the rotamer states will be added. Returns ------- state_indices_list : list[dict[str, int]] A list of dictionaries, where each dictionary contains residue IDs as keys and the corresponding rotamer state indices as values. """ rotamer_res_disambiguate = {} for ( primary_res, specific_res_list, ) in polymer.residue_chem_templates.ambiguous.items(): for specific_res in specific_res_list: rotamer_res_disambiguate[specific_res] = primary_res no_resname_to_resname = {} for res_with_resname in polymer.monomers: chain, resname, resnum = res_with_resname.split(":") no_resname_key = f"{chain}:{resnum}" if no_resname_key in no_resname_to_resname: errmsg = "both %s and %s would be keyed by %s" % ( res_with_resname, no_resname_to_resname[no_resname_key], no_resname_key, ) raise RuntimeError(errmsg) no_resname_to_resname[no_resname_key] = res_with_resname state_indices_list = [] for state_index, state_dict in enumerate(rotamer_states_list): logger.info(f"adding rotamer state {state_index + 1}") state_indices = {} for res_no_resname, angles in state_dict.items(): res_with_resname = no_resname_to_resname[res_no_resname] if polymer.monomers[res_with_resname].molsetup is None: raise RuntimeError( "no molsetup for %s, can't add rotamers" % (res_with_resname) ) # next block is inefficient for large rotamer_states_list # refactored polymers could help by having the following # data readily available molsetup = polymer.monomers[res_with_resname].molsetup name_to_molsetup_idx = {} for atom in molsetup.atoms: atom_name = atom.pdbinfo.name name_to_molsetup_idx[atom_name] = atom.index resname = res_with_resname.split(":")[1] resname = rotamer_res_disambiguate.get(resname, resname) atom_names = residues_rotamers[resname] if len(atom_names) != len(angles): raise RuntimeError( f"expected {len(atom_names)} angles for {resname}, got {len(angles)}" ) atom_idxs = [] for names in atom_names: tmp = [name_to_molsetup_idx[name] for name in names] atom_idxs.append(tmp) state_indices[res_with_resname] = len(molsetup.rotamers) molsetup.add_rotamer(atom_idxs, np.radians(angles)) state_indices_list.append(state_indices) return state_indices_list
[docs] class Monomer(BaseJSONParsable): """ Individual subunit in a Polymer. Often called residue. Attributes ---------- raw_rdkit_mol : Chem.Mol An RDKit Mol that defines element and connectivity within a residue. Bond orders and formal charges may be incorrect, and hydrogens may be missing. This molecule may originate from a PDB string and it defines also the positions of the atoms. rdkit_mol : Chem.Mol Copy of the molecule from a ResidueTemplate, with positions from raw_rdkit_mol. All hydrogens are real atoms except for those at connections with adjacent residues. mapidx_to_raw : dict[int, int] Mapping of atom indices in rdkit_mol to raw_rdkit_mol. residue_template_key : str The matched residue template key of this Monomer. input_resname : str The input residue name of this Monomer. atom_names : list[str] List of atom names in the same order as rdkit_mol. padded_mol : Chem.Mol Padded molecule with additional atoms around link atoms from adjacent residues. molsetup : RDKitMoleculeSetup An RDKitMoleculeSetup associated with this residue. molsetup_mapidx : dict[int, int] Mapping of atom indices in padded_mol to rdkit_mol. is_flexres_atom : list[bool] List indicating whether each atom is a flexible residue atom. is_movable : bool Indicates whether the residue is movable. mapidx_from_raw : dict[int, int] Mapping of atom indices in raw_rdkit_mol to rdkit_mol. template : ResidueTemplate provides access to link_labels in the template """ def __init__( self, raw_input_mol, rdkit_mol, mapidx_to_raw, input_resname=None, template_key=None, atom_names=None, ): """ Initialize a Monomer instance. Parameters ---------- raw_input_mol : Chem.Mol An RDKit Mol that defines element and connectivity within a residue. Bond orders and formal charges may be incorrect, and hydrogens may be missing. rdkit_mol : Chem.Mol Copy of the molecule from a ResidueTemplate, with positions from raw_rdkit_mol. All hydrogens are real atoms except for those at connections with adjacent residues. mapidx_to_raw : dict[int, int] Mapping of atom indices in rdkit_mol to raw_rdkit_mol. input_resname : str, optional The input residue name of this Monomer. template_key : str, optional The matched residue template key of this Monomer. atom_names : list[str], optional List of atom names in the same order as rdkit_mol. """ # Initializer attributes self.raw_rdkit_mol = raw_input_mol self.rdkit_mol = rdkit_mol self.mapidx_to_raw = mapidx_to_raw self.residue_template_key = template_key self.input_resname = input_resname # exists even in openmm topology self.atom_names = ( atom_names # same order as atoms in rdkit_mol, used in rotamers ) # (JSON-bound) computed attributes self.padded_mol = None self.molsetup = None self.molsetup_mapidx = None self.is_flexres_atom = None # Check about these data types/Do we want the default to be None or empty self.is_movable = False self.mapidx_from_raw = self._invert_mapping(self.mapidx_to_raw) # (JSON-unbound) computed attributes # TODO convert link indices/labels in template to rdkit_mol indices herein # self.link_labels = {} self.template = None @staticmethod def _invert_mapping(mapping): if mapping is None: return None inverted = {} for key, value in mapping.items(): if value in inverted: raise RuntimeError(f"Mapping is not invertible: {mapping}") inverted[value] = key return inverted # region JSON-interchange functions
[docs] @classmethod def json_encoder(cls, obj: "Monomer") -> Optional[dict[str, Any]]: try: molsetup = serialize_optional(RDKitMoleculeSetup.json_encoder, obj.molsetup) except KeyError: molsetup = serialize_optional(MoleculeSetup.json_encoder, obj.molsetup) return { "raw_rdkit_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.raw_rdkit_mol), "rdkit_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.rdkit_mol), "mapidx_to_raw": obj.mapidx_to_raw, "residue_template_key": obj.residue_template_key, "input_resname": obj.input_resname, "atom_name": obj.atom_names, "mapidx_from_raw": obj.mapidx_from_raw, "padded_mol": serialize_optional(rdMolInterchange.MolToJSON, obj.padded_mol), "molsetup": molsetup, "is_flexres_atom": obj.is_flexres_atom, "is_movable": obj.is_movable, "molsetup_mapidx": obj.molsetup_mapidx, }
# Keys to check for deserialized JSON expected_json_keys = frozenset({ "raw_rdkit_mol", "rdkit_mol", "mapidx_to_raw", "residue_template_key", "input_resname", "atom_name", "padded_mol", "molsetup", "molsetup_mapidx", "is_flexres_atom", "is_movable", "mapidx_from_raw", }) @classmethod def _decode_object(cls, obj: dict[str, Any]): raw_rdkit_mol = rdkit_mol_from_json(obj["raw_rdkit_mol"]) rdkit_mol = rdkit_mol_from_json(obj["rdkit_mol"]) padded_mol = rdkit_mol_from_json(obj["padded_mol"]) molsetup = RDKitMoleculeSetup.from_dict(obj["molsetup"]) if not isinstance(molsetup, RDKitMoleculeSetup): molsetup = MoleculeSetup.from_dict(obj["molsetup"]) mapidx_to_raw = convert_to_int_keyed_dict(obj["mapidx_to_raw"]) molsetup_mapidx = convert_to_int_keyed_dict(obj["molsetup_mapidx"]) mapidx_from_raw = convert_to_int_keyed_dict(obj["mapidx_from_raw"]) atom_name = cls.access_with_deprecated_key( obj, old_key="atom_names", new_key="atom_name" ) monomer = cls( raw_input_mol=raw_rdkit_mol, rdkit_mol=rdkit_mol, mapidx_to_raw=mapidx_to_raw, input_resname=obj["input_resname"], template_key=obj["residue_template_key"], atom_names=atom_name, ) monomer.padded_mol=padded_mol monomer.molsetup=molsetup monomer.molsetup_mapidx=molsetup_mapidx monomer.is_flexres_atom=obj["is_flexres_atom"] monomer.is_movable=obj["is_movable"] monomer.mapidx_from_raw = mapidx_from_raw return monomer # endregion
[docs] def set_atom_names(self, atom_names_list): """ Set the atom names for the monomer. Parameters ---------- atom_names_list : list[str] A list of atom names to set for the monomer. The length of this list must match the number of atoms in the RDKit molecule. Returns ------- None """ if self.rdkit_mol is None: raise RuntimeError("can't set atom_names if rdkit_mol is not set yet") if len(atom_names_list) != self.rdkit_mol.GetNumAtoms(): raise ValueError( f"{len(atom_names_list)=} differs from {self.rdkit_mol.GetNumAtoms()=}" ) name_types = set([type(name) for name in atom_names_list]) if name_types != {str}: raise ValueError(f"atom names must be str but {name_types=}") self.atom_names = atom_names_list return
[docs] def parameterize(self, mk_prep, residue_id, get_atomprop_from_raw: dict = None): """ Parameterize the monomer using the provided mk_prep object. Parameters ---------- mk_prep : MoleculePreparation A MoleculePreparation object that provides the parameterization method. residue_id : str The residue ID to be used for parameterization. get_atomprop_from_raw : dict[str, any], optional A dictionary mapping atom property names to default values. If provided, these properties will be set on the atoms in the raw RDKit molecule. The default is None (not passing any properties). Raises ------ ValueError If the atom property names in get_atomprop_from_raw are not strings. NotImplementedError If the number of molsetups is not equal to 1. """ if get_atomprop_from_raw: if any(not isinstance(prop_name, str) for prop_name in get_atomprop_from_raw.keys()): raise ValueError(f"Atom property name must be str. Got {prop_name} ({type(prop_name)}) instead! ") raw_mol = self.raw_rdkit_mol atoms_in_raw_mol = [atom for atom in raw_mol.GetAtoms()] mapidx_to_raw = self.mapidx_to_raw molsetup_mapidx = self.molsetup_mapidx for atom in self.padded_mol.GetAtoms(): atom_idx_in_raw = mapidx_to_raw.get(molsetup_mapidx.get(atom.GetIdx(), None), None) for prop_name, default_value in get_atomprop_from_raw.items(): if atom_idx_in_raw is not None: prop_value = atoms_in_raw_mol[atom_idx_in_raw].GetProp(prop_name) else: prop_value = str(default_value) atom.SetProp(prop_name, prop_value) molsetups = mk_prep(self.padded_mol) if len(molsetups) != 1: raise NotImplementedError(f"need 1 molsetup but got {len(molsetups)}") molsetup = molsetups[0] self.molsetup = molsetup self.is_flexres_atom = [False for _ in molsetup.atoms] # set ignore to True for atoms that are padding for atom in molsetup.atoms: if atom.index not in self.molsetup_mapidx: atom.is_ignore = True # recalculate flexibility tree after setting ignored atoms mk_prep.calc_flex(molsetup) # rectify charges to sum to integer (because of padding) if mk_prep.charge_model == "zero": net_charge = 0 else: rdkit_mol = self.rdkit_mol net_charge = sum( [atom.GetFormalCharge() for atom in rdkit_mol.GetAtoms()] ) not_ignored_idxs = [] charges = [] for atom in molsetup.atoms: if atom.index in self.molsetup_mapidx: # TODO offsite not in mapidx charges.append(atom.charge) not_ignored_idxs.append(atom.index) charges = rectify_charges(charges, net_charge, decimals=3) for i, j in enumerate(not_ignored_idxs): molsetup.atoms[j].charge = charges[i] self._set_pdbinfo(residue_id) return
def _set_pdbinfo(self, residue_id): not_ignored_idxs = [] for atom in self.molsetup.atoms: if atom.index in self.molsetup_mapidx: # TODO offsite not in mapidx not_ignored_idxs.append(atom.index) chain, resnum = residue_id.split(":") if resnum[-1].isalpha(): icode = resnum[-1] resnum = resnum[:-1] else: icode = "" if self.atom_names is None: atom_names = ["" for _ in not_ignored_idxs] else: atom_names = self.atom_names for i, j in enumerate(not_ignored_idxs): atom_name = atom_names[self.molsetup_mapidx[j]] self.molsetup.atoms[j].pdbinfo = PDBAtomInfo( atom_name, self.input_resname, int(resnum), icode, chain ) return
[docs] class NoAtomMapWarning(logging.Filter): """ A logging filter to exclude warnings about missing atom mapping numbers in RDKit. """
[docs] def filter(self, record): """ Filter out warnings about missing atom mapping numbers in RDKit. Parameters ---------- record : logging.LogRecord The log record to be filtered. Returns ------- bool True if the record should be logged, False otherwise. """ fields = record.getMessage().split() a = " ".join(fields[1:4]) == "product atom-mapping number" b = " ".join(fields[5:]) == "not found in reactants." is_atom_map_warning = a and b return not is_atom_map_warning
[docs] class ResiduePadder(BaseJSONParsable): """ A class for padding RDKit molecules of residues with parts from adjacent residues. Attributes ---------- rxn : rdChemReactions.ChemicalReaction Reaction SMARTS of a single-reactant, single-product reaction for padding. adjacent_smartsmol : Chem.Mol SMARTS molecule with mapping numbers to copy atom positions from part of adjacent residue. adjacent_smartsmol_mapidx : list Mapping for atoms in adjacent_smartsmol, from mapping numbers to atom indicies. """ # Replacing ResidueConnection by ResiduePadding # Why have two ResiduePadding instances per connection between two-residues? # - three-way merge: if three carbons joined in cyclopropare, we can still pad # - defines padding in the reaction for blunt residues # - all bonds will be defined in the input topology after a future refactor # reaction should not delete atoms, not even Hs # reaction should create bonds at non-real Hs (implicit or explicit rdktt H) def __init__(self, rxn_smarts: str, adjacent_res_smarts: str = None, auto_blunt:bool=False): """ Initialize the ResiduePadder with reaction SMARTS and optional adjacent residue SMARTS. Parameters ---------- rxn_smarts: str Reaction SMARTS to pad a link atom of a Monomer molecule. Product atoms that are not mapped in the reactants will have their coordinates set from an adjacent residue molecule, given that adjacent_res_smarts is provided and the atom labels match the unmapped product atoms of rxn_smarts. adjacent_res_smarts: str SMARTS pattern to identify atoms in molecule of adjacent residue and copy their positions to padding atoms. The SMARTS atom labels must match those of the product atoms of rxn_smarts that are unmapped in the reagents. auto_blunt: bool missing bonds of Monomers will automatically be blunt if this parameter is true, and raise an error otherwise """ # Ensure rxn_smarts has single reactant and single product self.rxn = self._validate_rxn_smarts(rxn_smarts) self.auto_blunt = auto_blunt # Fill in adjacent_smartsmol_mapidx if adjacent_res_smarts is None: self.adjacent_smartsmol = None self.adjacent_smartsmol_mapidx = None return # Ensure adjacent_res_smarts is None or a valid SMARTS self.adjacent_smartsmol = self._initialize_adj_smartsmol(adjacent_res_smarts) # Ensure the mapping numbers are the same in adjacent_smartsmol and rxn_smarts's product self._check_adj_smarts(self.rxn, self.adjacent_smartsmol) self.adjacent_smartsmol_mapidx = { atom.GetIntProp("molAtomMapNumber"): atom.GetIdx() for atom in self.adjacent_smartsmol.GetAtoms() if atom.HasProp("molAtomMapNumber") } return @staticmethod def _validate_rxn_smarts(rxn_smarts: str) -> rdChemReactions.ChemicalReaction: """Validate rxn_smarts and return rxn""" rxn = rdChemReactions.ReactionFromSmarts(rxn_smarts) if rxn.GetNumReactantTemplates() != 1: raise ValueError(f"Expected 1 reactant, got {rxn.GetNumReactantTemplates()}.") if rxn.GetNumProductTemplates() != 1: raise ValueError(f"Expected 1 product, got {rxn.GetNumProductTemplates()}.") return rxn @staticmethod def _initialize_adj_smartsmol(adjacent_res_smarts: str) -> Chem.Mol: """Validate adjacent_res_smarts and return adjacent_smartsmol""" adjacent_smartsmol = Chem.MolFromSmarts(adjacent_res_smarts) if adjacent_smartsmol is None: raise RuntimeError("Invalid SMARTS pattern in adjacent_res_smarts") return adjacent_smartsmol @staticmethod def _check_adj_smarts(rxn: rdChemReactions.ChemicalReaction, adjacent_smartsmol: Chem.Mol): """Ensure the atom mapping numbers are the same in adjacent_smartsmol and rxn_smarts's product""" # Assumes single reactant, single product reactant_ids = get_molAtomMapNumbers(rxn.GetReactantTemplate(0)) product_ids = get_molAtomMapNumbers(rxn.GetProductTemplate(0)) adjacent_ids = get_molAtomMapNumbers(adjacent_smartsmol) padding_ids = product_ids.difference(reactant_ids) is_ok = padding_ids == adjacent_ids if not is_ok: raise ValueError(f"SMARTS labels in adjacent_smartsmol ({adjacent_ids}) differ from \ unmapped product labels in reaction ({padding_ids})") def __call__(self, target_mol: Chem.Mol, adjacent_mol = None, target_required_atom_index = None, adjacent_required_atom_index = None): """ Pad the target_mol with atoms from the adjacent_mol using the defined reaction. The target_mol must contain the reactant of the reaction, and the adjacent_mol must contain the expected adjacent_smartsmol. The function returns the padded molecule and the mapping index of the adjacent_smartsmol. Parameters ---------- target_mol : Chem.Mol The target molecule to be padded. It must contain the reactant of the reaction. adjacent_mol : Chem.Mol, optional The adjacent molecule from which atoms will be copied. It must contain the expected adjacent_smartsmol. target_required_atom_index : int, optional The atom index in the target_mol that must be present in the product. adjacent_required_atom_index : int, optional The atom index in the adjacent_mol that must be present in the product. Returns ------- padded_mol : Chem.Mol The padded molecule with additional atoms from the adjacent_mol. adjacent_smartsmol_mapidx : dict[int, int] Mapping of atom indices in the adjacent_smartsmol to the target_mol. Raises RuntimeError If the target_mol does not contain the reactant of the reaction or if the adjacent_mol does not contain the expected adjacent_smartsmol. ValueError If the target_mol does not contain the expected atom index or if the adjacent_mol does not contain the expected atom index. NotImplementedError If the reaction does not produce a single product or if the adjacent_mol does not contain the expected adjacent_smartsmol. """ # add Hs only to padding atoms # copy coordinates if adjacent res has Hs bound to heavy atoms # labels have been checked upstream # Ensure target_mol contains self.rxn's reactant rxn = self.rxn if not self._check_target_mol(target_mol): logger.info(f"target_mol ({Chem.MolToSmiles(target_mol)}) is not fully compliant with the template rxn ({rdChemReactions.ReactionToSmarts(self.rxn)})...") # Assumes single reactant and single product reactant_smartsmol = rxn.GetReactantTemplate(0) reactant_ids = get_molAtomMapNumbers(reactant_smartsmol) # Generate fallback options for reactants fallback_reactant_smartsmol = Chem.MolFromSmarts(rdFMCS.FindMCS([reactant_smartsmol, target_mol]).smartsString) if fallback_reactant_smartsmol is None: raise RuntimeError(f"There is no common substructure between target_mol and the expected reactant. ") # Add mapping number to fallback reactants and filter the fallback options # To be accepted, the fallback reactant needs to at least have a match with target_mol # containing target_mol's atom with target_required_atom_index fallback_reactants = [ reactant_mol for reactant_mol in apply_atom_mappings(fallback_reactant_smartsmol, reactant_smartsmol) if any(target_required_atom_index in match for match in target_mol.GetSubstructMatches(reactant_mol)) ] if len(fallback_reactants) == 0: raise RuntimeError(f"The maximum common substructure between target_mol and the expected reactant does not contain the expected linker atom with target_required_atom_index.") # Take any fallback reactant; actually, they're the same reactant mols having different mapping numbers fallback_reactant = fallback_reactants[0] # Modify rxn smarts and update rxn fallback_reactant_ids = get_molAtomMapNumbers(fallback_reactant) skipping_ids = reactant_ids.difference(fallback_reactant_ids) fallback_product = remove_atoms_with_mapping(rxn.GetProductTemplate(0), skipping_ids) fallback_rxnsmarts = f"{Chem.MolToSmarts(fallback_reactant)}>>{Chem.MolToSmarts(fallback_product)}" rxn = rdChemReactions.ReactionFromSmarts(fallback_rxnsmarts) logger.info(f"Switched from Template rxn ({rdChemReactions.ReactionToSmarts(self.rxn)}) to Fallback rxn ({fallback_rxnsmarts})") # Get adjacent_mol's reacting part that contains adjacent_required_atom_index if adjacent_mol is not None: # Ensure adjacent_mol contains expected_adjacent_smartsmol, and # there's exactly one match that includes atom with adjacent_required_atom_index if self._check_adjacent_mol(self.adjacent_smartsmol, adjacent_mol, adjacent_required_atom_index): adjacent_smartsmol = self.adjacent_smartsmol # Remove unmapped atoms from Template adjacent mol SMARTS as the fallback option; # The unmapped atoms aren't needed for positions anyways else: logger.info(f"adjacent_mol ({Chem.MolToSmiles(adjacent_mol)}) is not fully compliant with the template adjacent_smarts ({Chem.MolToSmarts(self.adjacent_smartsmol)})...") adjacent_smartsmol = remove_unmapped_atoms_from_mol(self.adjacent_smartsmol) # Evaluate adjacent mol against the fallback adjacent mol SMARTS if self._check_adjacent_mol(adjacent_smartsmol, adjacent_mol, adjacent_required_atom_index): logger.info(f"Switched from Template adjacent mol ({Chem.MolToSmarts(self.adjacent_smartsmol)}) to Fallback adjacent mol ({Chem.MolToSmarts(adjacent_smartsmol)})") else: raise RuntimeError(f"adjacent_mol doesn't contain the mapped atoms in adjacent_smartsmol.") # Update hit and adjacent_smartsmol_mapidx hit = adjacent_mol.GetSubstructMatches(adjacent_smartsmol)[0] adjacent_smartsmol_mapidx = { atom.GetIntProp("molAtomMapNumber"): atom.GetIdx() for atom in adjacent_smartsmol.GetAtoms() if atom.HasProp("molAtomMapNumber") } # suppress rdkit warning about product atom map not found in reactants # e.g. in "[C:1]>>[C:1][O:2]" label :2 is missing in reactants filtr = NoAtomMapWarning() rdkit_logger.addFilter(filtr) # Get padded mol and index map from the rxn outcomes = react_and_map((target_mol,), rxn) rdkit_logger.removeFilter(filtr) # Filter outcomes by target_required_atom_index if target_required_atom_index is not None: outcomes = [ (product, index_map) for (product, index_map) in outcomes if target_required_atom_index in index_map["atom_idx"] ] # Ensure single outcome if len(outcomes) == 0: raise RuntimeError(f"The padding reaction of target_mol has no outcome that contains the atom with target_required_atom_index") elif len(outcomes) > 1: raise RuntimeError(f"The padding reaction of target_mol has multiple outcomes that contain the atom with target_required_atom_index") padded_mol, idxmap = outcomes[0] padding_heavy_atoms = [ i for i, j in enumerate(idxmap["atom_idx"]) if j is None and padded_mol.GetAtomWithIdx(i).GetAtomicNum() != 1 ] mapidx = idxmap["atom_idx"] # Add Hs to padded_mol and update mapidx if adjacent_mol is None: padded_mol.UpdatePropertyCache() # avoids getNumImplicitHs() called without preceding call to calcImplicitValence() Chem.SanitizeMol(padded_mol) # just in case padded_h = Chem.AddHs(padded_mol, onlyOnAtoms=padding_heavy_atoms) mapidx += [None] * (padded_h.GetNumAtoms() - padded_mol.GetNumAtoms()) else: # Get coordinates of existing atoms adjacent_coords = adjacent_mol.GetConformer().GetPositions() for atom in adjacent_smartsmol.GetAtoms(): if not atom.HasProp("molAtomMapNumber"): continue j = atom.GetIntProp("molAtomMapNumber") k = idxmap["new_atom_label"].index(j) l = adjacent_smartsmol_mapidx[j] padded_mol.GetConformer().SetAtomPosition(k, adjacent_coords[hit[l]]) padded_mol.UpdatePropertyCache() # avoids getNumImplicitHs() called without preceding call to calcImplicitValence() Chem.SanitizeMol(padded_mol) # got crooked Hs without this padded_h = Chem.AddHs( padded_mol, onlyOnAtoms=padding_heavy_atoms, addCoords=True ) return padded_h, mapidx @staticmethod def _check_adjacent_mol(expected_adjacent_smartsmol: Chem.Mol, adjacent_mol: Chem.Mol, adjacent_required_atom_index: str): """ Ensure adjacent_mol contains expected_adjacent_smartsmol, and there's exactly one match that includes atom with adjacent_required_atom_index. """ if expected_adjacent_smartsmol is None: raise RuntimeError("adjacent_res_smarts must be initialized to support adjacent_mol.") hits = adjacent_mol.GetSubstructMatches(expected_adjacent_smartsmol) if adjacent_required_atom_index is not None: hits = [hit for hit in hits if adjacent_required_atom_index in hit] if len(hits) > 1: raise RuntimeError(f"adjacent_mol has multiple matches for adjacent_smartsmol.") elif len(hits) == 0: return False return True def _check_target_mol(self, target_mol: Chem.Mol): """Ensure target_mol contains self.rxn's reactant.""" # Assumes single reactant if target_mol.GetSubstructMatches(self.rxn.GetReactantTemplate(0)): return True else: return False # region JSON-interchange functions
[docs] @classmethod def json_encoder(cls, obj: "ResiduePadder") -> Optional[dict[str, Any]]: output_dict = { "rxn_smarts": rdChemReactions.ReactionToSmarts(obj.rxn), "adjacent_res_smarts": serialize_optional(Chem.MolToSmarts, obj.adjacent_smartsmol), "auto_blunt": obj.auto_blunt, } # we are not serializing the adjacent_smartsmol_mapidx as that will # be rebuilt by the ResiduePadder init return output_dict
# Keys to check for deserialized JSON expected_json_keys = { "rxn_smarts", "adjacent_res_smarts", "auto_blunt", } @classmethod def _decode_object(cls, obj: dict[str, Any]): adjacent_res_smarts = cls.access_with_deprecated_key( obj, old_key="adjacent_smarts", new_key="adjacent_res_smarts" ) residue_padder = cls(obj["rxn_smarts"], adjacent_res_smarts, obj.get("auto_blunt", False)) return residue_padder
# endregion # Utility Functions
[docs] def get_molAtomMapNumbers(mol: Chem.Mol) -> set[int]: """Return the set of mapping numbers in a molecule.""" return {atom.GetIntProp("molAtomMapNumber") for atom in mol.GetAtoms() if atom.HasProp("molAtomMapNumber")}
[docs] def remove_unmapped_atoms_from_mol(mol: Chem.Mol) -> Chem.Mol: """Remove atoms without mapping numbers from a molecule.""" atoms_to_remove = [ atom.GetIdx() for atom in mol.GetAtoms() if not atom.HasProp("molAtomMapNumber") ] if len(atoms_to_remove) > 0: mol = Chem.RWMol(mol) for idx in sorted(atoms_to_remove, reverse=True): mol.RemoveAtom(idx) mol = mol.GetMol() return mol
[docs] def apply_atom_mappings(mcs_mol: Chem.Mol, original_mol: Chem.Mol) -> list[Chem.Mol]: """ Apply atom mappings from the original molecule to the MCS molecule by substructure match. Be prepared for multiple matches, return a list for further evaluation. """ # Assumes original_mol contains mcs_mol matches = original_mol.GetSubstructMatches(mcs_mol) mapped_mcs_molecules = [] for match in matches: rw_mcs_mol = Chem.RWMol(mcs_mol) for i, mcs_atom in enumerate(rw_mcs_mol.GetAtoms()): original_atom_idx = match[i] original_atom = original_mol.GetAtomWithIdx(original_atom_idx) if original_atom.HasProp("molAtomMapNumber"): mcs_atom.SetProp("molAtomMapNumber", original_atom.GetProp("molAtomMapNumber")) mapped_mcs_molecules.append(rw_mcs_mol.GetMol()) return mapped_mcs_molecules
[docs] def remove_atoms_with_mapping(product: Chem.Mol, mapping_numbers: set) -> Chem.Mol: """Remove atoms with specific atom mapping numbers from a molecule.""" editable_product = Chem.RWMol(product) atoms_to_remove = [ atom.GetIdx() for atom in editable_product.GetAtoms() if atom.HasProp("molAtomMapNumber") and int(atom.GetProp("molAtomMapNumber")) in mapping_numbers ] for idx in sorted(atoms_to_remove, reverse=True): editable_product.RemoveAtom(idx) return editable_product.GetMol()
[docs] class ResidueTemplate(BaseJSONParsable): """ Data and methods to pad rdkit molecules of polymer residues with parts of adjacent residues. Attributes ---------- mol : Chem.Mol molecule with the exact atoms that constitute the system. All Hs are explicit, but atoms bonded to adjacent residues miss an H. link_labels: dict[int, str] Keys are indices of atoms that need padding Values are strings to identify instances of ResiduePadder atom_names: list[str] list of atom names, matching order of atoms in rdkit mol """ def __init__(self, smiles, link_labels=None, atom_names=None): """ Initialize a ResidueTemplate object. Parameters ---------- smiles : str SMILES representation of the molecule. link_labels : dict[int,str], optional Keys are indices of atoms that need padding Values are strings to identify instances of ResiduePadder If None, no link atoms. The default is None. atom_names : list[str], optional List of atom names in the same order as the atoms in the given smiles. """ # Initializer attributes self.link_labels = link_labels self.atom_names = atom_names # (JSON-bound) computed attributes ps = Chem.SmilesParserParams() ps.removeHs = False mol = Chem.MolFromSmiles(smiles, ps) self.check(mol, link_labels, atom_names) self.mol = mol # region JSON-interchange functions
[docs] @classmethod def json_encoder(cls, obj: "ResidueTemplate") -> Optional[dict[str, Any]]: output_dict = { "mol": rdMolInterchange.MolToJSON(obj.mol), "link_labels": obj.link_labels, "atom_name": obj.atom_names, } return output_dict
# Keys to check for deserialized JSON expected_json_keys = {"mol", "link_labels", "atom_name"} @classmethod def _decode_object(cls, obj: dict[str, Any]): # Converting ResidueTemplate init values that need conversion deserialized_mol = rdkit_mol_from_json(obj.get("mol")) # do not write canonical smiles to preserve original atom order if deserialized_mol: mol_smiles = rdkit.Chem.MolToSmiles(deserialized_mol, canonical=False) # if dry json (data) is supplied else: mol_smiles = obj.get("smiles") link_labels = convert_to_int_keyed_dict(obj.get("link_labels")) atom_name = cls.access_with_deprecated_key(obj, old_key="atom_names", new_key="atom_name") # Construct a ResidueTemplate object residue_template = cls(mol_smiles, None, atom_name) # Separately ensure that link_labels is restored to the value we expect it to be so there are not errors in # the constructor residue_template.link_labels = link_labels return residue_template # endregion
[docs] def check(self, mol, link_labels, atom_names): """ Check the validity of a ResidueTemplate using the rdkit mol. Parameters ---------- mol : Chem.Mol The molecule to check. link_labels : dict[int, str], optional Keys are indices of atoms that need padding Values are strings to identify instances of ResiduePadder atom_names : list[str], optional List of atom names in the same order as the atoms in the given smiles. Raises ------ ValueError If the number of atoms in the molecule does not match the length of atom_names. RuntimeError If the molecule is not a valid SMILES representation. """ have_implicit_hs = set() for atom in mol.GetAtoms(): if atom.GetTotalNumHs() > 0: have_implicit_hs.add(atom.GetIdx()) if link_labels is not None and set(link_labels) != have_implicit_hs: raise ValueError( f"expected any atom with non-real Hs ({have_implicit_hs}) to be in {link_labels=}" ) if atom_names is None: return # data_lengths = set([len(values) for (_, values) in data.items()]) # if len(data_lengths) != 1: # raise ValueError(f"each array in data must have the same length, but got {data_lengths=}") # data_length = data_lengths.pop() if len(atom_names) != mol.GetNumAtoms(): raise ValueError(f"{len(atom_names)=} differs from {mol.GetNumAtoms()=}") return
[docs] def match(self, input_mol): """ Match the input molecule with the template molecule and return the mapping. Parameters ---------- input_mol : Chem.Mol The input molecule to be matched with the template. Returns ------- tuple[dict, dict] A tuple containing two dictionaries: - The first dictionary contains the results of the matching, including the number of found and missing atoms. - The second dictionary contains the mapping between the template and input molecule atom indices. Raises ------ RuntimeError If there are repeated values with different keys in the mapping. """ mapping = mapping_by_mcs(self.mol, input_mol) mapping_inv = {value: key for (key, value) in mapping.items()} if len(mapping_inv) != len(mapping): raise RuntimeError( f"bug in atom indices, repeated value different keys? {mapping=}" ) # atoms "missing" exist in self.mol but not in input_mol # "excess" atoms exist in input_mol but not in self.mol result = { "H": {"found": 0, "missing": 0, "excess": []}, "heavy": {"found": 0, "missing": 0, "excess": 0}, } for atom in self.mol.GetAtoms(): element = "H" if atom.GetAtomicNum() == 1 else "heavy" key = "found" if atom.GetIdx() in mapping else "missing" result[element][key] += 1 for atom in input_mol.GetAtoms(): element = "H" if atom.GetAtomicNum() == 1 else "heavy" if atom.GetIdx() not in mapping_inv: if element == "H": if atom.GetNeighbors(): nei_idx = atom.GetNeighbors()[0].GetIdx() if nei_idx in mapping_inv: result[element]["excess"].append(mapping_inv[nei_idx]) else: result[element]["excess"].append(-1) else: # lone hydrogen found in monomer monomer_info = getPdbInfoNoNull(atom) if monomer_info: logger.warning(f"WARNING: Lone hydrogen is ignored: \n" f" {monomer_info} \n") else: logger.warning(f"WARNING: A lone hydrogen is ignored during monomer-template matching. \n") else: result[element]["excess"] += 1 return result, mapping