ctb_rdkit_descriptors: dimorphite

comparison dimorphite_dl.py @ 4:414edd9ea77d draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit 20df7e562341cd30e89a14d6bde9054956fadc06"

author	bgruening
date	Tue, 10 Mar 2020 12:58:24 -0400
parents
children	a1c53f0533b0

comparison

equal deleted inserted replaced

-:617d4555d8d3
+:414edd9ea77d
+# Copyright 2018 Jacob D. Durrant
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+This script identifies and enumerates the possible protonation sites of SMILES
+strings.
+"""
+from __future__ import print_function
+import copy
+import os
+import argparse
+import sys
+try:
+# Python2
+from StringIO import StringIO
+except ImportError:
+# Python3
+from io import StringIO
+# Always let the user know a help file is available.
+print("\nFor help, use: python dimorphite_dl.py --help")
+# And always report citation information.
+print("\nIf you use Dimorphite-DL in your research, please cite:")
+print("Ropp PJ, Kaminsky JC, Yablonski S, Durrant JD (2019) Dimorphite-DL: An")
+print("open-source program for enumerating the ionization states of drug-like small")
+print("molecules. J Cheminform 11:14. doi:10.1186/s13321-019-0336-9.\n")
+try:
+import rdkit
+from rdkit import Chem
+from rdkit.Chem import AllChem
+except:
+msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/"
+print(msg)
+raise Exception(msg)
+def main(params=None):
+"""The main definition run when you call the script from the commandline.
+:param params: The parameters to use. Entirely optional. If absent,
+defaults to None, in which case argments will be taken from
+those given at the command line.
+:param params: dict, optional
+:return: Returns a list of the SMILES strings return_as_list parameter is
+True. Otherwise, returns None.
+"""
+parser = ArgParseFuncs.get_args()
+args = vars(parser.parse_args())
+# Add in any parameters in params.
+if params is not None:
+for k, v in params.items():
+args[k] = v
+# If being run from the command line, print out all parameters.
+if __name__ == "__main__":
+print("\nPARAMETERS:\n")
+for k in sorted(args.keys()):
+print(k.rjust(13) + ": " + str(args[k]))
+print("")
+if args["test"]:
+# Run tests.
+TestFuncs.test()
+else:
+# Run protonation
+if "output_file" in args and args["output_file"] is not None:
+# An output file was specified, so write to that.
+with open(args["output_file"], "w") as file:
+for protonated_smi in Protonate(args):
+file.write(protonated_smi + "\n")
+elif "return_as_list" in args and args["return_as_list"] == True:
+return list(Protonate(args))
+else:
+# No output file specified. Just print it to the screen.
+for protonated_smi in Protonate(args):
+print(protonated_smi)
+class MyParser(argparse.ArgumentParser):
+"""Overwrite default parse so it displays help file on error. See
+https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu"""
+def error(self, message):
+"""Overwrites the default error message.
+:param message: The default error message.
+"""
+self.print_help()
+msg = "ERROR: %s\n\n" % message
+print(msg)
+raise Exception(msg)
+def print_help(self, file=None):
+"""Overwrite the default print_help function
+:param file: Output file, defaults to None
+"""
+print("")
+if file is None:
+file = sys.stdout
+self._print_message(self.format_help(), file)
+print("""
+examples:
+python dimorphite_dl.py --smiles_file sample_molecules.smi
+python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0
+python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi
+python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states
+python dimorphite_dl.py --test""")
+print("")
+class ArgParseFuncs:
+"""A namespace for storing functions that are useful for processing
+command-line arguments. To keep things organized."""
+@staticmethod
+def get_args():
+"""Gets the arguments from the command line.
+:return: A parser object.
+"""
+parser = MyParser(description="Dimorphite 1.2: Creates models of " +
+"appropriately protonated small moleucles. " +
+"Apache 2.0 License. Copyright 2018 Jacob D. " +
+"Durrant.")
+parser.add_argument('--min_ph', metavar='MIN', type=float, default=6.4,
+help='minimum pH to consider (default: 6.4)')
+parser.add_argument('--max_ph', metavar='MAX', type=float, default=8.4,
+help='maximum pH to consider (default: 8.4)')
+parser.add_argument('--pka_precision', metavar='PRE', type=float, default=1.0,
+help='pKa precision factor (number of standard devations, default: 1.0)')
+parser.add_argument('--smiles', metavar='SMI', type=str,
+help='SMILES string to protonate')
+parser.add_argument('--smiles_file', metavar="FILE", type=str,
+help='file that contains SMILES strings to protonate')
+parser.add_argument('--output_file', metavar="FILE", type=str,
+help='output file to write protonated SMILES (optional)')
+parser.add_argument('--label_states', action="store_true",
+help='label protonated SMILES with target state ' + \
+'(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").')
+parser.add_argument('--test', action="store_true",
+help='run unit tests (for debugging)')
+return parser
+@staticmethod
+def clean_args(args):
+"""Cleans and normalizes input parameters
+:param args: A dictionary containing the arguments.
+:type args: dict
+:raises Exception: No SMILES in params.
+"""
+defaults = {'min_ph' : 6.4,
+'max_ph' : 8.4,
+'pka_precision' : 1.0,
+'label_states' : False,
+'test' : False}
+for key in defaults:
+if key not in args:
+args[key] = defaults[key]
+keys = list(args.keys())
+for key in keys:
+if args[key] is None:
+del args[key]
+if not "smiles" in args and not "smiles_file" in args:
+msg = "Error: No SMILES in params. Use the -h parameter for help."
+print(msg)
+raise Exception(msg)
+# If the user provides a smiles string, turn it into a file-like StringIO
+# object.
+if "smiles" in args:
+if isinstance(args["smiles"], str):
+args["smiles_file"]  = StringIO(args["smiles"])
+args["smiles_and_data"] = LoadSMIFile(args["smiles_file"])
+return args
+class UtilFuncs:
+"""A namespace to store functions for manipulating mol objects. To keep
+things organized."""
+@staticmethod
+def neutralize_mol(mol):
+"""All molecules should be neuralized to the extent possible. The user
+should not be allowed to specify the valence of the atoms in most cases.
+:param rdkit.Chem.rdchem.Mol mol: The rdkit Mol objet to be neutralized.
+:return: The neutralized Mol object.
+"""
+# Get the reaction data
+rxn_data = [
+['[Ov1-1:1]', '[Ov2+0:1]-[H]'],  # To handle O- bonded to only one atom (add hydrogen).
+['[#7v4+1:1]-[H]', '[#7v3+0:1]'],  # To handle N+ bonded to a hydrogen (remove hydrogen).
+['[Ov2-:1]', '[Ov2+0:1]'],  # To handle O- bonded to two atoms. Should not be Negative.
+['[#7v3+1:1]', '[#7v3+0:1]'],  # To handle N+ bonded to three atoms. Should not be positive.
+['[#7v2-1:1]', '[#7+0:1]-[H]'],  # To handle N- Bonded to two atoms. Add hydrogen.
+# ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'],  # To
+# handle bad azide. Must be protonated. (Now handled elsewhere, before
+# SMILES converted to Mol object.)
+['[H]-[N:1]-[N:2]#[N:3]', '[N:1]=[N+1:2]=[N:3]-[H]']  # To handle bad azide. R-N-N#N should be R-N=[N+]=N
+]
+# Add substructures and reactions (initially none)
+for i, rxn_datum in enumerate(rxn_data):
+rxn_data[i].append(Chem.MolFromSmarts(rxn_datum[0]))
+rxn_data[i].append(None)
+# Add hydrogens (respects valence, so incomplete).
+# Chem.calcImplicitValence(mol)
+mol.UpdatePropertyCache(strict=False)
+mol = Chem.AddHs(mol)
+while True:  # Keep going until all these issues have been resolved.
+current_rxn = None  # The reaction to perform.
+current_rxn_str = None
+for i, rxn_datum in enumerate(rxn_data):
+reactant_smarts, product_smarts, substruct_match_mol, rxn_placeholder = rxn_datum
+if mol.HasSubstructMatch(substruct_match_mol):
+if rxn_placeholder is None:
+current_rxn_str = reactant_smarts + '>>' + product_smarts
+current_rxn = AllChem.ReactionFromSmarts(current_rxn_str)
+rxn_data[i][3] = current_rxn  # Update the placeholder.
+else:
+current_rxn = rxn_data[i][3]
+break
+# Perform the reaction if necessary
+if current_rxn is None:  # No reaction left, so break out of while loop.
+break
+else:
+mol = current_rxn.RunReactants((mol,))[0][0]
+mol.UpdatePropertyCache(strict=False)  # Update valences
+# The mols have been altered from the reactions described above, we need
+# to resanitize them. Make sure aromatic rings are shown as such This
+# catches all RDKit Errors. without the catchError and sanitizeOps the
+# Chem.SanitizeMol can crash the program.
+sanitize_string =  Chem.SanitizeMol(
+mol,
+sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL,
+catchErrors = True
+)
+return mol if sanitize_string.name == "SANITIZE_NONE" else None
+@staticmethod
+def convert_smiles_str_to_mol(smiles_str):
+"""Given a SMILES string, check that it is actually a string and not a
+None. Then try to convert it to an RDKit Mol Object.
+:param string smiles_str: The SMILES string.
+:return: A rdkit.Chem.rdchem.Mol object, or None if it is the wrong type or
+if it fails to convert to a Mol Obj
+"""
+# Check that there are no type errors, ie Nones or non-string
+# A non-string type will cause RDKit to hard crash
+if smiles_str is None or type(smiles_str) is not str:
+return None
+# Try to fix azides here. They are just tricky to deal with.
+smiles_str = smiles_str.replace("N=N=N", "N=[N+]=N")
+smiles_str = smiles_str.replace("NN#N", "N=[N+]=N")
+# Now convert to a mol object. Note the trick that is necessary to
+# capture RDKit error/warning messages. See
+# https://stackoverflow.com/questions/24277488/in-python-how-to-capture-the-stdout-from-a-c-shared-library-to-a-variable
+stderr_fileno = sys.stderr.fileno()
+stderr_save = os.dup(stderr_fileno)
+stderr_pipe = os.pipe()
+os.dup2(stderr_pipe[1], stderr_fileno)
+os.close(stderr_pipe[1])
+mol = Chem.MolFromSmiles(smiles_str)
+os.close(stderr_fileno)
+os.close(stderr_pipe[0])
+os.dup2(stderr_save, stderr_fileno)
+os.close(stderr_save)
+# Check that there are None type errors Chem.MolFromSmiles has sanitize on
+# which means if there is even a small error in the SMILES (kekulize,
+# nitrogen charge...) then mol=None. ie.
+# Chem.MolFromSmiles("C[N]=[N]=[N]") = None this is an example of an
+# nitrogen charge error. It is cased in a try statement to be overly
+# cautious.
+return None if mol is None else mol
+@staticmethod
+def eprint(*args, **kwargs):
+"""Error messages should be printed to STDERR. See
+https://stackoverflow.com/questions/5574702/how-to-print-to-stderr-in-python"""
+print(*args, file=sys.stderr, **kwargs)
+class LoadSMIFile(object):
+"""A generator class for loading in the SMILES strings from a file, one at
+a time."""
+def __init__(self, filename):
+"""Initializes this class.
+:param filename: The filename or file object (i.e., StringIO).
+:type filename: str or StringIO
+"""
+if type(filename) is str:
+# It's a filename
+self.f = open(filename, "r")
+else:
+# It's a file object (i.e., StringIO)
+self.f = filename
+def __iter__(self):
+"""Returns this generator object.
+:return: This generator object.
+:rtype: LoadSMIFile
+"""
+return self
+def __next__(self):
+"""Ensure Python3 compatibility.
+:return: A dict, where the "smiles" key contains the canonical SMILES
+string and the "data" key contains the remaining information
+(e.g., the molecule name).
+:rtype: dict
+"""
+return self.next()
+def next(self):
+"""Get the data associated with the next line.
+:raises StopIteration: If there are no more lines left iin the file.
+:return: A dict, where the "smiles" key contains the canonical SMILES
+string and the "data" key contains the remaining information
+(e.g., the molecule name).
+:rtype: dict
+"""
+line = self.f.readline()
+if line == "":
+# EOF
+self.f.close()
+raise StopIteration()
+return
+# Divide line into smi and data
+splits = line.split()
+if len(splits) != 0:
+# Generate mol object
+smiles_str = splits[0]
+# Convert from SMILES string to RDKIT Mol. This series of tests is
+# to make sure the SMILES string is properly formed and to get it
+# into a canonical form. Filter if failed.
+mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str)
+if mol is None:
+UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+return self.next()
+# Handle nuetralizing the molecules. Filter if failed.
+mol = UtilFuncs.neutralize_mol(mol)
+if mol is None:
+UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+return self.next()
+# Remove the hydrogens.
+try:
+mol = Chem.RemoveHs(mol)
+except:
+UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+return self.next()
+if mol is None:
+UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+return self.next()
+# Regenerate the smiles string (to standardize).
+new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True)
+return {
+"smiles": new_mol_string,
+"data": splits[1:]
+}
+else:
+# Blank line? Go to next one.
+return self.next()
+class Protonate(object):
+"""A generator class for protonating SMILES strings, one at a time."""
+def __init__(self, args):
+"""Initialize the generator.
+:param args: A dictionary containing the arguments.
+:type args: dict
+"""
+# Make the args an object variable variable.
+self.args = args
+# A list to store the protonated SMILES strings associated with a
+# single input model.
+self.cur_prot_SMI = []
+# Clean and normalize the args
+self.args = ArgParseFuncs.clean_args(args)
+# Load the substructures that can be protonated.
+self.subs = ProtSubstructFuncs.load_protonation_substructs_calc_state_for_ph(
+self.args["min_ph"], self.args["max_ph"], self.args["pka_precision"]
+)
+def __iter__(self):
+"""Returns this generator object.
+:return: This generator object.
+:rtype: Protonate
+"""
+return self
+def __next__(self):
+"""Ensure Python3 compatibility.
+:return: A dict, where the "smiles" key contains the canonical SMILES
+string and the "data" key contains the remaining information
+(e.g., the molecule name).
+:rtype: dict
+"""
+return self.next()
+def next(self):
+"""Get the next protonated SMILES string.
+:raises StopIteration: If there are no more lines left iin the file.
+:return: TODO A dict, where the "smiles" key contains the canonical SMILES
+string and the "data" key contains the remaining information
+(e.g., the molecule name).
+:rtype: dict
+"""
+# If there are any SMILES strings in self.cur_prot_SMI, just return
+# the first one and update the list to include only the remaining.
+if len(self.cur_prot_SMI) > 0:
+first, self.cur_prot_SMI = self.cur_prot_SMI[0], self.cur_prot_SMI[1:]
+return first
+# self.cur_prot_SMI is empty, so try to add more to it.
+# Get the next SMILES string from the input file.
+try:
+smile_and_datum = self.args["smiles_and_data"].next()
+except StopIteration:
+# There are no more input smiles strings...
+raise StopIteration()
+smi = smile_and_datum["smiles"]
+data = smile_and_datum["data"]  # Everything on SMILES line but the
+# SMILES string itself (e.g., the
+# molecule name).
+# Collect the data associated with this smiles (e.g., the molecule
+# name).
+tag = " ".join(data)
+# sites is a list of (atom index, "PROTONATED|DEPROTONATED|BOTH").
+# Note that the second entry indicates what state the site SHOULD be
+# in (not the one it IS in per the SMILES string). It's calculated
+# based on the probablistic distributions obtained during training.
+sites = ProtSubstructFuncs.get_prot_sites_and_target_states(smi, self.subs)
+new_smis = [smi]
+for site in sites:
+# Make a new smiles with the correct protonation state. Note that
+# new_smis is a growing list. This is how multiple protonation
+# sites are handled.
+# new_smis_to_perhaps_add = ProtSubstructFuncs.protonate_site(new_smis, site)
+new_smis = ProtSubstructFuncs.protonate_site(new_smis, site)
+# print(site, new_smis)  # Good for debugging.
+# Only add new smiles if not already in the list.
+# for s in new_smis_to_perhaps_add:
+# if not s in new_smis:
+# new_smis.append(s)
+# In some cases, the script might generate redundant molecules.
+# Phosphonates, when the pH is between the two pKa values and the
+# stdev value is big enough, for example, will generate two identical
+# BOTH states. Let's remove this redundancy.
+new_smis = list(set(new_smis))
+# Deprotonating protonated aromatic nitrogen gives [nH-]. Change this
+# to [n-]. This is a hack.
+new_smis = [s.replace("[nH-]", "[n-]") for s in new_smis]
+# Sometimes Dimorphite-DL generates molecules that aren't actually
+# possible. Simply convert these to mol objects to eliminate the bad
+# ones (that are None).
+new_smis = [s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None]
+# If there are no smi left, return the input one at the very least.
+# All generated forms have apparently been judged
+# inappropriate/mal-formed.
+if len(new_smis) == 0:
+new_smis = [smi]
+# If the user wants to see the target states, add those
+# to the ends of each line.
+if self.args["label_states"]:
+states = '\t'.join([x[1] for x in sites])
+new_lines = [x + "\t" + tag + "\t" + states for x in new_smis]
+else:
+new_lines = [x + "\t" + tag for x in new_smis]
+self.cur_prot_SMI = new_lines
+return self.next()
+class ProtSubstructFuncs:
+"""A namespace to store functions for loading the substructures that can
+be protonated. To keep things organized."""
+@staticmethod
+def load_protonation_substructs_calc_state_for_ph(min_ph=6.4, max_ph=8.4, pka_std_range=1):
+"""A pre-calculated list of R-groups with protonation sites, with their
+likely pKa bins.
+:param float min_ph:  The lower bound on the pH range, defaults to 6.4.
+:param float max_ph:  The upper bound on the pH range, defaults to 8.4.
+:param pka_std_range: Basically the precision (stdev from predicted pKa to
+consider), defaults to 1.
+:return: A dict of the protonation substructions for the specified pH
+range.
+"""
+subs = []
+pwd = os.path.dirname(os.path.realpath(__file__))
+site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts")
+with open(site_structures_file, 'r') as substruct:
+for line in substruct:
+line = line.strip()
+sub = {}
+if line is not "":
+splits = line.split()
+sub["name"] = splits[0]
+sub["smart"] = splits[1]
+sub["mol"] = Chem.MolFromSmarts(sub["smart"])
+# NEED TO DIVIDE THIS BY 3s
+pka_ranges = [splits[i:i+3] for i in range(2, len(splits)-1, 3)]
+prot = []
+for pka_range in pka_ranges:
+site = pka_range[0]
+std = float(pka_range[2]) * pka_std_range
+mean = float(pka_range[1])
+protonation_state = ProtSubstructFuncs.define_protonation_state(
+mean, std, min_ph, max_ph
+)
+prot.append([site, protonation_state])
+sub["prot_states_for_pH"] = prot
+subs.append(sub)
+return subs
+@staticmethod
+def define_protonation_state(mean, std, min_ph, max_ph):
+"""Updates the substructure definitions to include the protonation state
+based on the user-given pH range. The size of the pKa range is also based
+on the number of standard deviations to be considered by the user param.
+:param float mean:   The mean pKa.
+:param float std:    The precision (stdev).
+:param float min_ph: The min pH of the range.
+:param float max_ph: The max pH of the range.
+:return: A string describing the protonation state.
+"""
+min_pka = mean - std
+max_pka = mean + std
+# This needs to be reassigned, and 'ERROR' should never make it past the
+# next set of checks.
+if min_pka <= max_ph and min_ph <= max_pka:
+protonation_state = 'BOTH'
+elif mean > max_ph:
+protonation_state = 'PROTONATED'
+else:
+protonation_state = 'DEPROTONATED'
+return protonation_state
+@staticmethod
+def get_prot_sites_and_target_states(smi, subs):
+"""For a single molecule, find all possible matches in the protonation
+R-group list, subs. Items that are higher on the list will be matched
+first, to the exclusion of later items.
+:param string smi: A SMILES string.
+:param list subs: Substructure information.
+:return: A list of protonation sites and their pKa bin. ('PROTONATED',
+'BOTH', or  'DEPROTONATED')
+"""
+# Convert the Smiles string (smi) to an RDKit Mol Obj
+mol = UtilFuncs.convert_smiles_str_to_mol(smi)
+# Check Conversion worked
+if mol is None:
+UtilFuncs.eprint("ERROR:   ", smi)
+return []
+# Try to Add hydrogens. if failed return []
+try:
+mol =  Chem.AddHs(mol)
+except:
+UtilFuncs.eprint("ERROR:   ", smi)
+return []
+# Check adding Hs worked
+if mol is None:
+UtilFuncs.eprint("ERROR:   ", smi)
+return []
+ProtectUnprotectFuncs.unprotect_molecule(mol)
+protonation_sites = []
+for item in subs:
+smart = item["mol"]
+if mol.HasSubstructMatch(smart):
+matches = ProtectUnprotectFuncs.get_unprotected_matches(mol, smart)
+prot = item["prot_states_for_pH"]
+for match in matches:
+# We want to move the site from being relative to the
+# substructure, to the index on the main molecule.
+for site in prot:
+proton = int(site[0])
+category = site[1]
+new_site = (match[proton], category, item["name"])
+if not new_site in protonation_sites:
+# Because sites must be unique.
+protonation_sites.append(new_site)
+ProtectUnprotectFuncs.protect_molecule(mol, match)
+return protonation_sites
+@staticmethod
+def protonate_site(smis, site):
+"""Given a list of SMILES strings, we protonate the site.
+:param list smis:  The list of SMILES strings.
+:param tuple site: Information about the protonation site.
+(idx, target_prot_state, prot_site_name)
+:return: A list of the appropriately protonated SMILES.
+"""
+# Decouple the atom index and its target protonation state from the site
+# tuple
+idx, target_prot_state, prot_site_name = site
+# Initialize the output list
+output_smis = []
+state_to_charge = {"DEPROTONATED": [-1],
+"PROTONATED": [0],
+"BOTH": [-1, 0]}
+charges = state_to_charge[target_prot_state]
+# Now make the actual smiles match the target protonation state.
+output_smis = ProtSubstructFuncs.set_protonation_charge(smis, idx, charges, prot_site_name)
+return output_smis
+@staticmethod
+def set_protonation_charge(smis, idx, charges, prot_site_name):
+"""Sets the atomic charge on a particular site for a set of SMILES.
+:param list smis:             A list of the SMILES strings.
+:param int idx:               The index of the atom to consider.
+:param list charges:          A list of the charges (ints) to assign at
+this site.
+:param string prot_site_name: The name of the protonation site.
+:return: A list of the processed SMILES strings.
+"""
+# Sets up the output list and the Nitrogen charge
+output = []
+for charge in charges:
+# The charge for Nitrogens is 1 higher than others (i.e., protonated
+# state is positively charged).
+nitro_charge = charge + 1
+# But there are a few nitrogen moieties where the acidic group is the
+# neutral one. Amides are a good example. I gave some thought re. how
+# to best flag these. I decided that those nitrogen-containing
+# moieties where the acidic group is neutral (rather than positively
+# charged) will have "*" in the name.
+if "*" in prot_site_name:
+nitro_charge = nitro_charge - 1  # Undo what was done previously.
+for smi in smis:
+# Convert smilesstring (smi) into a RDKit Mol
+mol = UtilFuncs.convert_smiles_str_to_mol(smi)
+# Check that the conversion worked, skip if it fails
+if mol is None:
+continue
+atom = mol.GetAtomWithIdx(idx)
+# Assign the protonation charge, with special care for Nitrogens
+element = atom.GetAtomicNum()
+if element == 7:
+atom.SetFormalCharge(nitro_charge)
+else:
+atom.SetFormalCharge(charge)
+# Convert back to SMILE and add to output
+out_smile = Chem.MolToSmiles(mol, isomericSmiles=True,canonical=True)
+output.append(out_smile)
+return output
+class ProtectUnprotectFuncs:
+"""A namespace for storing functions that are useful for protecting and
+unprotecting molecules. To keep things organized. We need to identify and
+mark groups that have been matched with a substructure."""
+@staticmethod
+def unprotect_molecule(mol):
+"""Sets the protected property on all atoms to 0. This also creates the
+property for new molecules.
+:param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object.
+:type mol: The rdkit Mol object with atoms unprotected.
+"""
+for atom in mol.GetAtoms():
+atom.SetProp('_protected', '0')
+@staticmethod
+def protect_molecule(mol, match):
+"""Given a 'match', a list of molecules idx's, we set the protected status
+of each atom to 1. This will prevent any matches using that atom in the
+future.
+:param rdkit.Chem.rdchem.Mol mol: The rdkit Mol object to protect.
+:param list match: A list of molecule idx's.
+"""
+for idx in match:
+atom = mol.GetAtomWithIdx(idx)
+atom.SetProp('_protected', '1')
+@staticmethod
+def get_unprotected_matches(mol, substruct):
+"""Finds substructure matches with atoms that have not been protected.
+Returns list of matches, each match a list of atom idxs.
+:param rdkit.Chem.rdchem.Mol mol: The Mol object to consider.
+:param string substruct: The SMARTS string of the substructure ot match.
+:return: A list of the matches. Each match is itself a list of atom idxs.
+"""
+matches = mol.GetSubstructMatches(substruct)
+unprotected_matches = []
+for match in matches:
+if ProtectUnprotectFuncs.is_match_unprotected(mol, match):
+unprotected_matches.append(match)
+return unprotected_matches
+@staticmethod
+def is_match_unprotected(mol, match):
+"""Checks a molecule to see if the substructure match contains any
+protected atoms.
+:param rdkit.Chem.rdchem.Mol mol: The Mol object to check.
+:param list match: The match to check.
+:return: A boolean, whether the match is present or not.
+"""
+for idx in match:
+atom = mol.GetAtomWithIdx(idx)
+protected = atom.GetProp("_protected")
+if protected == "1":
+return False
+return True
+class TestFuncs:
+"""A namespace for storing functions that perform tests on the code. To
+keep things organized."""
+@staticmethod
+def test():
+"""Tests all the 38 groups."""
+smis = [
+# [input smiles, pka, protonated, deprotonated, category]
+["C#CCO",                  "C#CCO",                     "C#CC[O-]",                 "Alcohol"],
+["C(=O)N",                 "NC=O",                      "[NH-]C=O",                 "Amide"],
+["CC(=O)NOC(C)=O",         "CC(=O)NOC(C)=O",            "CC(=O)[N-]OC(C)=O",        "Amide_electronegative"],
+["COC(=N)N",               "COC(N)=[NH2+]",             "COC(=N)N",                 "AmidineGuanidine2"],
+["Brc1ccc(C2NCCS2)cc1",    "Brc1ccc(C2[NH2+]CCS2)cc1",  "Brc1ccc(C2NCCS2)cc1",      "Amines_primary_secondary_tertiary"],
+["CC(=O)[n+]1ccc(N)cc1",   "CC(=O)[n+]1ccc([NH3+])cc1", "CC(=O)[n+]1ccc(N)cc1",     "Anilines_primary"],
+["CCNc1ccccc1",            "CC[NH2+]c1ccccc1",          "CCNc1ccccc1",              "Anilines_secondary"],
+["Cc1ccccc1N(C)C",         "Cc1ccccc1[NH+](C)C",        "Cc1ccccc1N(C)C",           "Anilines_tertiary"],
+["BrC1=CC2=C(C=C1)NC=C2",  "Brc1ccc2[nH]ccc2c1",        "Brc1ccc2[n-]ccc2c1",       "Indole_pyrrole"],
+["O=c1cc[nH]cc1",          "O=c1cc[nH]cc1",             "O=c1cc[n-]cc1",            "Aromatic_nitrogen_protonated"],
+["C-N=[N+]=[N@H]",         "CN=[N+]=N",                 "CN=[N+]=[N-]",             "Azide"],
+["BrC(C(O)=O)CBr",         "O=C(O)C(Br)CBr",            "O=C([O-])C(Br)CBr",        "Carboxyl"],
+["NC(NN=O)=N",             "NC(=[NH2+])NN=O",           "N=C(N)NN=O",               "AmidineGuanidine1"],
+["C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F",      "CC(=O)[N-]C(=O)C(F)(F)F",  "Imide"],
+["O=C(C)NC(C)=O",          "CC(=O)NC(C)=O",             "CC(=O)[N-]C(C)=O",         "Imide2"],
+["CC(C)(C)C(N(C)O)=O",     "CN(O)C(=O)C(C)(C)C",        "CN([O-])C(=O)C(C)(C)C",    "N-hydroxyamide"],
+["C[N+](O)=O",             "C[N+](=O)O",                "C[N+](=O)[O-]",            "Nitro"],
+["O=C1C=C(O)CC1",          "O=C1C=C(O)CC1",             "O=C1C=C([O-])CC1",         "O=C-C=C-OH"],
+["C1CC1OO",                "OOC1CC1",                   "[O-]OC1CC1",               "Peroxide2"],
+["C(=O)OO",                "O=COO",                     "O=CO[O-]",                 "Peroxide1"],
+["Brc1cc(O)cc(Br)c1",      "Oc1cc(Br)cc(Br)c1",         "[O-]c1cc(Br)cc(Br)c1",     "Phenol"],
+["CC(=O)c1ccc(S)cc1",      "CC(=O)c1ccc(S)cc1",         "CC(=O)c1ccc([S-])cc1",     "Phenyl_Thiol"],
+["C=CCOc1ccc(C(=O)O)cc1",  "C=CCOc1ccc(C(=O)O)cc1",     "C=CCOc1ccc(C(=O)[O-])cc1", "Phenyl_carboxyl"],
+["COP(=O)(O)OC",           "COP(=O)(O)OC",              "COP(=O)([O-])OC",          "Phosphate_diester"],
+["CP(C)(=O)O",             "CP(C)(=O)O",                "CP(C)(=O)[O-]",            "Phosphinic_acid"],
+["CC(C)OP(C)(=O)O",        "CC(C)OP(C)(=O)O",           "CC(C)OP(C)(=O)[O-]",       "Phosphonate_ester"],
+["CC1(C)OC(=O)NC1=O",      "CC1(C)OC(=O)NC1=O",         "CC1(C)OC(=O)[N-]C1=O",     "Ringed_imide1"],
+["O=C(N1)C=CC1=O",         "O=C1C=CC(=O)N1",            "O=C1C=CC(=O)[N-]1",        "Ringed_imide2"],
+["O=S(OC)(O)=O",           "COS(=O)(=O)O",              "COS(=O)(=O)[O-]",          "Sulfate"],
+["COc1ccc(S(=O)O)cc1",     "COc1ccc(S(=O)O)cc1",        "COc1ccc(S(=O)[O-])cc1",    "Sulfinic_acid"],
+["CS(N)(=O)=O",            "CS(N)(=O)=O",               "CS([NH-])(=O)=O",          "Sulfonamide"],
+["CC(=O)CSCCS(O)(=O)=O",   "CC(=O)CSCCS(=O)(=O)O",      "CC(=O)CSCCS(=O)(=O)[O-]",  "Sulfonate"],
+["CC(=O)S",                "CC(=O)S",                   "CC(=O)[S-]",               "Thioic_acid"],
+["C(C)(C)(C)(S)",          "CC(C)(C)S",                 "CC(C)(C)[S-]",             "Thiol"],
+["Brc1cc[nH+]cc1",         "Brc1cc[nH+]cc1",            "Brc1ccncc1",               "Aromatic_nitrogen_unprotonated"],
+["C=C(O)c1c(C)cc(C)cc1C",  "C=C(O)c1c(C)cc(C)cc1C",     "C=C([O-])c1c(C)cc(C)cc1C", "Vinyl_alcohol"],
+["CC(=O)ON",               "CC(=O)O[NH3+]",             "CC(=O)ON",                 "Primary_hydroxyl_amine"]
+]
+smis_phos = [
+["O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)([O-])O", "CCCCOP(=O)([O-])[O-]", "Phosphate"],
+["CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O", "CC(C)P(=O)([O-])O", "CC(C)P(=O)([O-])[O-]", "Phosphonate"]
+]
+# Load the average pKa values.
+average_pkas = {l.split()[0].replace("*", ""):float(l.split()[3]) for l in open("site_substructures.smarts") if l.split()[0] not in ["Phosphate", "Phosphonate"]}
+average_pkas_phos = {l.split()[0].replace("*", ""):[float(l.split()[3]), float(l.split()[6])] for l in open("site_substructures.smarts") if l.split()[0] in ["Phosphate", "Phosphonate"]}
+print("Running Tests")
+print("=============")
+print("")
+print("Very Acidic (pH -10000000)")
+print("--------------------------")
+print("")
+args = {
+"min_ph": -10000000,
+"max_ph": -10000000,
+"pka_precision": 0.5,
+"smiles": "",
+"label_states": True
+}
+for smi, protonated, deprotonated, category in smis:
+args["smiles"] = smi
+TestFuncs.test_check(args, [protonated], ["PROTONATED"])
+for smi, protonated, mix, deprotonated, category in smis_phos:
+args["smiles"] = smi
+TestFuncs.test_check(args, [protonated], ["PROTONATED"])
+args["min_ph"] = 10000000
+args["max_ph"] = 10000000
+print("")
+print("Very Basic (pH 10000000)")
+print("------------------------")
+print("")
+for smi, protonated, deprotonated, category in smis:
+args["smiles"] = smi
+TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
+for smi, protonated, mix, deprotonated, category in smis_phos:
+args["smiles"] = smi
+TestFuncs.test_check(args, [deprotonated], ["DEPROTONATED"])
+print("")
+print("pH is Category pKa")
+print("------------------")
+print("")
+for smi, protonated, deprotonated, category in smis:
+avg_pka = average_pkas[category]
+args["smiles"] = smi
+args["min_ph"] = avg_pka
+args["max_ph"] = avg_pka
+TestFuncs.test_check(args, [protonated, deprotonated], ["BOTH"])
+for smi, protonated, mix, deprotonated, category in smis_phos:
+args["smiles"] = smi
+avg_pka = average_pkas_phos[category][0]
+args["min_ph"] = avg_pka
+args["max_ph"] = avg_pka
+TestFuncs.test_check(args, [mix, protonated], ["BOTH"])
+avg_pka = average_pkas_phos[category][1]
+args["min_ph"] = avg_pka
+args["max_ph"] = avg_pka
+TestFuncs.test_check(args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"])
+avg_pka = 0.5 * (average_pkas_phos[category][0] + average_pkas_phos[category][1])
+args["min_ph"] = avg_pka
+args["max_ph"] = avg_pka
+args["pka_precision"] = 5  # Should give all three
+TestFuncs.test_check(args, [mix, deprotonated, protonated], ["BOTH", "BOTH"])
+@staticmethod
+def test_check(args, expected_output, labels):
+"""Tests most ionizable groups. The ones that can only loose or gain a single proton.
+:param args: The arguments to pass to protonate()
+:param expected_output: A list of the expected SMILES-strings output.
+:param labels: The labels. A list containing combo of BOTH, PROTONATED,
+DEPROTONATED.
+:raises Exception: Wrong number of states produced.
+:raises Exception: Unexpected output SMILES.
+:raises Exception: Wrong labels.
+"""
+output = list(Protonate(args))
+output = [o.split() for o in output]
+num_states = len(expected_output)
+if (len(output) != num_states):
+msg = args["smiles"] + " should have " + str(num_states) + \
+" states at at pH " + str(args["min_ph"]) + ": " + str(output)
+print(msg)
+raise Exception(msg)
+if (len(set([l[0] for l in output]) - set(expected_output)) != 0):
+msg = args["smiles"] + " is not " + " AND ".join(expected_output) + \
+" at pH " + str(args["min_ph"]) + " - " + str(args["max_ph"]) + \
+"; it is " + " AND ".join([l[0] for l in output])
+print(msg)
+raise Exception(msg)
+if (len(set([l[1] for l in output]) - set(labels)) != 0):
+msg = args["smiles"] + " not labeled as " + " AND ".join(labels) + \
+"; it is " + " AND ".join([l[1] for l in output])
+print(msg)
+raise Exception(msg)
+ph_range = sorted(list(set([args["min_ph"], args["max_ph"]])))
+ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")"
+print("(CORRECT) " + ph_range_str.ljust(10) + " " + args["smiles"] + " => " + " AND ".join([l[0] for l in output]))
+def run(**kwargs):
+"""A helpful, importable function for those who want to call Dimorphite-DL
+from another Python script rather than the command line. Note that this
+function accepts keyword arguments that match the command-line parameters
+exactly. If you want to pass and return a list of RDKit Mol objects, import
+run_with_mol_list() instead.
+:param **kwargs: For a complete description, run dimorphite_dl.py from the
+command line with the -h option.
+:type kwargs: dict
+"""
+# Run the main function with the specified arguments.
+main(kwargs)
+def run_with_mol_list(mol_lst, **kwargs):
+"""A helpful, importable function for those who want to call Dimorphite-DL
+from another Python script rather than the command line. Note that this
+function is for passing Dimorphite-DL a list of RDKit Mol objects, together
+with command-line parameters. If you want to use only the same parameters
+that you would use from the command line, import run() instead.
+:param mol_lst: A list of rdkit.Chem.rdchem.Mol objects.
+:type mol_lst: list
+:raises Exception: If the **kwargs includes "smiles", "smiles_file",
+"output_file", or "test" parameters.
+:return: A list of properly protonated rdkit.Chem.rdchem.Mol objects.
+:rtype: list
+"""
+# Do a quick check to make sure the user input makes sense.
+for bad_arg in ["smiles", "smiles_file", "output_file", "test"]:
+if bad_arg in kwargs:
+msg = "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + \
+"**kwargs) function, but you also passed the \"" + \
+bad_arg + "\" argument. Did you mean to use the " + \
+"run(**kwargs) function instead?"
+print(msg)
+raise Exception(msg)
+# Set the return_as_list flag so main() will return the protonated smiles
+# as a list.
+kwargs["return_as_list"] = True
+# Having reviewed the code, it will be very difficult to rewrite it so
+# that a list of Mol objects can be used directly. Intead, convert this
+# list of mols to smiles and pass that. Not efficient, but it will work.
+protonated_smiles_and_props = []
+for m in mol_lst:
+props = m.GetPropsAsDict()
+kwargs["smiles"] = Chem.MolToSmiles(m, isomericSmiles=True)
+protonated_smiles_and_props.extend(
+[(s.split("\t")[0], props) for s in main(kwargs)]
+)
+# Now convert the list of protonated smiles strings back to RDKit Mol
+# objects. Also, add back in the properties from the original mol objects.
+mols = []
+for s, props in protonated_smiles_and_props:
+m = Chem.MolFromSmiles(s)
+if m:
+for prop, val in props.items():
+if type(val) is int:
+m.SetIntProp(prop, val)
+elif type(val) is float:
+m.SetDoubleProp(prop, val)
+elif type(val) is bool:
+m.SetBoolProp(prop, val)
+else:
+m.SetProp(prop, str(val))
+mols.append(m)
+else:
+UtilFuncs.eprint("WARNING: Could not process molecule with SMILES string " + s + " and properties " + str(props))
+return mols
+if __name__ == "__main__":
+main()

Mercurial > repos > bgruening > ctb_rdkit_descriptors

comparison dimorphite_dl.py @ 4:414edd9ea77d draft