# HG changeset patch
# User bgruening
# Date 1638635945 0
# Node ID 4beb3e026bbb3cc245ce31610aa2b9f605cffa71
# Parent  351fbd750a6d20bd17ea4eeae09a9d92ccadfd0d
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/rdkit commit c1d813d3f0fec60ea6efe8a11e59d98bfdc1636f"

diff -r 351fbd750a6d -r 4beb3e026bbb dimorphite_dl.py
--- a/dimorphite_dl.py	Wed Feb 17 13:00:12 2021 +0000
+++ b/dimorphite_dl.py	Sat Dec 04 16:39:05 2021 +0000
@@ -19,8 +19,9 @@
 """
 
 from __future__ import print_function
+
+import argparse
 import os
-import argparse
 import sys
 
 try:
@@ -43,11 +44,12 @@
     import rdkit
     from rdkit import Chem
     from rdkit.Chem import AllChem
-except:
+except Exception:
     msg = "Dimorphite-DL requires RDKit. See https://www.rdkit.org/"
     print(msg)
     raise Exception(msg)
 
+
 def main(params=None):
     """The main definition run when you call the script from the commandline.
 
@@ -84,13 +86,14 @@
             with open(args["output_file"], "w") as file:
                 for protonated_smi in Protonate(args):
                     file.write(protonated_smi + "\n")
-        elif "return_as_list" in args and args["return_as_list"] == True:
+        elif "return_as_list" in args and args["return_as_list"]:
             return list(Protonate(args))
         else:
             # No output file specified. Just print it to the screen.
             for protonated_smi in Protonate(args):
                 print(protonated_smi)
 
+
 class MyParser(argparse.ArgumentParser):
     """Overwrite default parse so it displays help file on error. See
     https://stackoverflow.com/questions/4042452/display-help-message-with-python-argparse-when-script-is-called-without-any-argu"""
@@ -117,15 +120,18 @@
         if file is None:
             file = sys.stdout
         self._print_message(self.format_help(), file)
-        print("""
+        print(
+            """
 examples:
   python dimorphite_dl.py --smiles_file sample_molecules.smi
   python dimorphite_dl.py --smiles "CCC(=O)O" --min_ph -3.0 --max_ph -2.0
   python dimorphite_dl.py --smiles "CCCN" --min_ph -3.0 --max_ph -2.0 --output_file output.smi
   python dimorphite_dl.py --smiles_file sample_molecules.smi --pka_precision 2.0 --label_states
-  python dimorphite_dl.py --test""")
+  python dimorphite_dl.py --test"""
+        )
         print("")
 
+
 class ArgParseFuncs:
     """A namespace for storing functions that are useful for processing
     command-line arguments. To keep things organized."""
@@ -137,27 +143,57 @@
         :return: A parser object.
         """
 
-        parser = MyParser(description="Dimorphite 1.2: Creates models of " +
-                                    "appropriately protonated small moleucles. " +
-                                    "Apache 2.0 License. Copyright 2018 Jacob D. " +
-                                    "Durrant.")
-        parser.add_argument('--min_ph', metavar='MIN', type=float, default=6.4,
-                            help='minimum pH to consider (default: 6.4)')
-        parser.add_argument('--max_ph', metavar='MAX', type=float, default=8.4,
-                            help='maximum pH to consider (default: 8.4)')
-        parser.add_argument('--pka_precision', metavar='PRE', type=float, default=1.0,
-                            help='pKa precision factor (number of standard devations, default: 1.0)')
-        parser.add_argument('--smiles', metavar='SMI', type=str,
-                            help='SMILES string to protonate')
-        parser.add_argument('--smiles_file', metavar="FILE", type=str,
-                            help='file that contains SMILES strings to protonate')
-        parser.add_argument('--output_file', metavar="FILE", type=str,
-                            help='output file to write protonated SMILES (optional)')
-        parser.add_argument('--label_states', action="store_true",
-                            help='label protonated SMILES with target state ' + \
-                                '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").')
-        parser.add_argument('--test', action="store_true",
-                            help='run unit tests (for debugging)')
+        parser = MyParser(
+            description="Dimorphite 1.2: Creates models of "
+            + "appropriately protonated small moleucles. "
+            + "Apache 2.0 License. Copyright 2018 Jacob D. "
+            + "Durrant."
+        )
+        parser.add_argument(
+            "--min_ph",
+            metavar="MIN",
+            type=float,
+            default=6.4,
+            help="minimum pH to consider (default: 6.4)",
+        )
+        parser.add_argument(
+            "--max_ph",
+            metavar="MAX",
+            type=float,
+            default=8.4,
+            help="maximum pH to consider (default: 8.4)",
+        )
+        parser.add_argument(
+            "--pka_precision",
+            metavar="PRE",
+            type=float,
+            default=1.0,
+            help="pKa precision factor (number of standard devations, default: 1.0)",
+        )
+        parser.add_argument(
+            "--smiles", metavar="SMI", type=str, help="SMILES string to protonate"
+        )
+        parser.add_argument(
+            "--smiles_file",
+            metavar="FILE",
+            type=str,
+            help="file that contains SMILES strings to protonate",
+        )
+        parser.add_argument(
+            "--output_file",
+            metavar="FILE",
+            type=str,
+            help="output file to write protonated SMILES (optional)",
+        )
+        parser.add_argument(
+            "--label_states",
+            action="store_true",
+            help="label protonated SMILES with target state "
+            + '(i.e., "DEPROTONATED", "PROTONATED", or "BOTH").',
+        )
+        parser.add_argument(
+            "--test", action="store_true", help="run unit tests (for debugging)"
+        )
 
         return parser
 
@@ -170,11 +206,13 @@
         :raises Exception: No SMILES in params.
         """
 
-        defaults = {'min_ph' : 6.4,
-                    'max_ph' : 8.4,
-                    'pka_precision' : 1.0,
-                    'label_states' : False,
-                    'test' : False}
+        defaults = {
+            "min_ph": 6.4,
+            "max_ph": 8.4,
+            "pka_precision": 1.0,
+            "label_states": False,
+            "test": False,
+        }
 
         for key in defaults:
             if key not in args:
@@ -194,12 +232,13 @@
         # object.
         if "smiles" in args:
             if isinstance(args["smiles"], str):
-                args["smiles_file"]  = StringIO(args["smiles"])
+                args["smiles_file"] = StringIO(args["smiles"])
 
         args["smiles_and_data"] = LoadSMIFile(args["smiles_file"])
 
         return args
 
+
 class UtilFuncs:
     """A namespace to store functions for manipulating mol objects. To keep
     things organized."""
@@ -215,15 +254,33 @@
 
         # Get the reaction data
         rxn_data = [
-            ['[Ov1-1:1]', '[Ov2+0:1]-[H]'],  # To handle O- bonded to only one atom (add hydrogen).
-            ['[#7v4+1:1]-[H]', '[#7v3+0:1]'],  # To handle N+ bonded to a hydrogen (remove hydrogen).
-            ['[Ov2-:1]', '[Ov2+0:1]'],  # To handle O- bonded to two atoms. Should not be Negative.
-            ['[#7v3+1:1]', '[#7v3+0:1]'],  # To handle N+ bonded to three atoms. Should not be positive.
-            ['[#7v2-1:1]', '[#7+0:1]-[H]'],  # To handle N- Bonded to two atoms. Add hydrogen.
+            [
+                "[Ov1-1:1]",
+                "[Ov2+0:1]-[H]",
+            ],  # To handle O- bonded to only one atom (add hydrogen).
+            [
+                "[#7v4+1:1]-[H]",
+                "[#7v3+0:1]",
+            ],  # To handle N+ bonded to a hydrogen (remove hydrogen).
+            [
+                "[Ov2-:1]",
+                "[Ov2+0:1]",
+            ],  # To handle O- bonded to two atoms. Should not be Negative.
+            [
+                "[#7v3+1:1]",
+                "[#7v3+0:1]",
+            ],  # To handle N+ bonded to three atoms. Should not be positive.
+            [
+                "[#7v2-1:1]",
+                "[#7+0:1]-[H]",
+            ],  # To handle N- Bonded to two atoms. Add hydrogen.
             # ['[N:1]=[N+0:2]=[N:3]-[H]', '[N:1]=[N+1:2]=[N+0:3]-[H]'],  # To
             # handle bad azide. Must be protonated. (Now handled elsewhere, before
             # SMILES converted to Mol object.)
-            ['[H]-[N:1]-[N:2]#[N:3]', '[N:1]=[N+1:2]=[N:3]-[H]']  # To handle bad azide. R-N-N#N should be R-N=[N+]=N
+            [
+                "[H]-[N:1]-[N:2]#[N:3]",
+                "[N:1]=[N+1:2]=[N:3]-[H]",
+            ],  # To handle bad azide. R-N-N#N should be R-N=[N+]=N
         ]
 
         # Add substructures and reactions (initially none)
@@ -241,10 +298,15 @@
             current_rxn_str = None
 
             for i, rxn_datum in enumerate(rxn_data):
-                reactant_smarts, product_smarts, substruct_match_mol, rxn_placeholder = rxn_datum
+                (
+                    reactant_smarts,
+                    product_smarts,
+                    substruct_match_mol,
+                    rxn_placeholder,
+                ) = rxn_datum
                 if mol.HasSubstructMatch(substruct_match_mol):
                     if rxn_placeholder is None:
-                        current_rxn_str = reactant_smarts + '>>' + product_smarts
+                        current_rxn_str = reactant_smarts + ">>" + product_smarts
                         current_rxn = AllChem.ReactionFromSmarts(current_rxn_str)
                         rxn_data[i][3] = current_rxn  # Update the placeholder.
                     else:
@@ -262,10 +324,10 @@
         # to resanitize them. Make sure aromatic rings are shown as such This
         # catches all RDKit Errors. without the catchError and sanitizeOps the
         # Chem.SanitizeMol can crash the program.
-        sanitize_string =  Chem.SanitizeMol(
+        sanitize_string = Chem.SanitizeMol(
             mol,
             sanitizeOps=rdkit.Chem.rdmolops.SanitizeFlags.SANITIZE_ALL,
-            catchErrors = True
+            catchErrors=True,
         )
 
         return mol if sanitize_string.name == "SANITIZE_NONE" else None
@@ -321,6 +383,7 @@
 
         print(*args, file=sys.stderr, **kwargs)
 
+
 class LoadSMIFile(object):
     """A generator class for loading in the SMILES strings from a file, one at
     a time."""
@@ -388,37 +451,43 @@
             # into a canonical form. Filter if failed.
             mol = UtilFuncs.convert_smiles_str_to_mol(smiles_str)
             if mol is None:
-                UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+                UtilFuncs.eprint(
+                    "WARNING: Skipping poorly formed SMILES string: " + line
+                )
                 return self.next()
 
             # Handle nuetralizing the molecules. Filter if failed.
             mol = UtilFuncs.neutralize_mol(mol)
             if mol is None:
-                UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+                UtilFuncs.eprint(
+                    "WARNING: Skipping poorly formed SMILES string: " + line
+                )
                 return self.next()
 
             # Remove the hydrogens.
             try:
                 mol = Chem.RemoveHs(mol)
-            except:
-                UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+            except Exception:
+                UtilFuncs.eprint(
+                    "WARNING: Skipping poorly formed SMILES string: " + line
+                )
                 return self.next()
 
             if mol is None:
-                UtilFuncs.eprint("WARNING: Skipping poorly formed SMILES string: " + line)
+                UtilFuncs.eprint(
+                    "WARNING: Skipping poorly formed SMILES string: " + line
+                )
                 return self.next()
 
             # Regenerate the smiles string (to standardize).
             new_mol_string = Chem.MolToSmiles(mol, isomericSmiles=True)
 
-            return {
-                "smiles": new_mol_string,
-                "data": splits[1:]
-            }
+            return {"smiles": new_mol_string, "data": splits[1:]}
         else:
             # Blank line? Go to next one.
             return self.next()
 
+
 class Protonate(object):
     """A generator class for protonating SMILES strings, one at a time."""
 
@@ -491,8 +560,8 @@
 
         smi = smile_and_datum["smiles"]
         data = smile_and_datum["data"]  # Everything on SMILES line but the
-                                        # SMILES string itself (e.g., the
-                                        # molecule name).
+        # SMILES string itself (e.g., the
+        # molecule name).
 
         # Collect the data associated with this smiles (e.g., the molecule
         # name).
@@ -516,8 +585,8 @@
 
             # Only add new smiles if not already in the list.
             # for s in new_smis_to_perhaps_add:
-                # if not s in new_smis:
-                    # new_smis.append(s)
+            # if not s in new_smis:
+            # new_smis.append(s)
 
         # In some cases, the script might generate redundant molecules.
         # Phosphonates, when the pH is between the two pKa values and the
@@ -532,7 +601,9 @@
         # Sometimes Dimorphite-DL generates molecules that aren't actually
         # possible. Simply convert these to mol objects to eliminate the bad
         # ones (that are None).
-        new_smis = [s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None]
+        new_smis = [
+            s for s in new_smis if UtilFuncs.convert_smiles_str_to_mol(s) is not None
+        ]
 
         # If there are no smi left, return the input one at the very least.
         # All generated forms have apparently been judged
@@ -543,7 +614,7 @@
         # If the user wants to see the target states, add those
         # to the ends of each line.
         if self.args["label_states"]:
-            states = '\t'.join([x[1] for x in sites])
+            states = "\t".join([x[1] for x in sites])
             new_lines = [x + "\t" + tag + "\t" + states for x in new_smis]
         else:
             new_lines = [x + "\t" + tag for x in new_smis]
@@ -552,12 +623,15 @@
 
         return self.next()
 
+
 class ProtSubstructFuncs:
     """A namespace to store functions for loading the substructures that can
     be protonated. To keep things organized."""
 
     @staticmethod
-    def load_protonation_substructs_calc_state_for_ph(min_ph=6.4, max_ph=8.4, pka_std_range=1):
+    def load_protonation_substructs_calc_state_for_ph(
+        min_ph=6.4, max_ph=8.4, pka_std_range=1
+    ):
         """A pre-calculated list of R-groups with protonation sites, with their
         likely pKa bins.
 
@@ -573,7 +647,7 @@
         pwd = os.path.dirname(os.path.realpath(__file__))
 
         site_structures_file = "{}/{}".format(pwd, "site_substructures.smarts")
-        with open(site_structures_file, 'r') as substruct:
+        with open(site_structures_file, "r") as substruct:
             for line in substruct:
                 line = line.strip()
                 sub = {}
@@ -584,7 +658,9 @@
                     sub["mol"] = Chem.MolFromSmarts(sub["smart"])
 
                     # NEED TO DIVIDE THIS BY 3s
-                    pka_ranges = [splits[i:i+3] for i in range(2, len(splits)-1, 3)]
+                    pka_ranges = [
+                        splits[i : i + 3] for i in range(2, len(splits) - 1, 3)
+                    ]
 
                     prot = []
                     for pka_range in pka_ranges:
@@ -620,11 +696,11 @@
         # This needs to be reassigned, and 'ERROR' should never make it past the
         # next set of checks.
         if min_pka <= max_ph and min_ph <= max_pka:
-            protonation_state = 'BOTH'
+            protonation_state = "BOTH"
         elif mean > max_ph:
-            protonation_state = 'PROTONATED'
+            protonation_state = "PROTONATED"
         else:
-            protonation_state = 'DEPROTONATED'
+            protonation_state = "DEPROTONATED"
 
         return protonation_state
 
@@ -650,8 +726,8 @@
 
         # Try to Add hydrogens. if failed return []
         try:
-            mol =  Chem.AddHs(mol)
-        except:
+            mol = Chem.AddHs(mol)
+        except Exception:
             UtilFuncs.eprint("ERROR:   ", smi)
             return []
 
@@ -701,14 +777,14 @@
         # Initialize the output list
         output_smis = []
 
-        state_to_charge = {"DEPROTONATED": [-1],
-                        "PROTONATED": [0],
-                        "BOTH": [-1, 0]}
+        state_to_charge = {"DEPROTONATED": [-1], "PROTONATED": [0], "BOTH": [-1, 0]}
 
         charges = state_to_charge[target_prot_state]
 
         # Now make the actual smiles match the target protonation state.
-        output_smis = ProtSubstructFuncs.set_protonation_charge(smis, idx, charges, prot_site_name)
+        output_smis = ProtSubstructFuncs.set_protonation_charge(
+            smis, idx, charges, prot_site_name
+        )
 
         return output_smis
 
@@ -759,11 +835,12 @@
                     atom.SetFormalCharge(charge)
 
                 # Convert back to SMILE and add to output
-                out_smile = Chem.MolToSmiles(mol, isomericSmiles=True,canonical=True)
+                out_smile = Chem.MolToSmiles(mol, isomericSmiles=True, canonical=True)
                 output.append(out_smile)
 
         return output
 
+
 class ProtectUnprotectFuncs:
     """A namespace for storing functions that are useful for protecting and
     unprotecting molecules. To keep things organized. We need to identify and
@@ -779,7 +856,7 @@
         """
 
         for atom in mol.GetAtoms():
-            atom.SetProp('_protected', '0')
+            atom.SetProp("_protected", "0")
 
     @staticmethod
     def protect_molecule(mol, match):
@@ -793,7 +870,7 @@
 
         for idx in match:
             atom = mol.GetAtomWithIdx(idx)
-            atom.SetProp('_protected', '1')
+            atom.SetProp("_protected", "1")
 
     @staticmethod
     def get_unprotected_matches(mol, substruct):
@@ -829,6 +906,7 @@
                 return False
         return True
 
+
 class TestFuncs:
     """A namespace for storing functions that perform tests on the code. To
     keep things organized."""
@@ -839,53 +917,158 @@
 
         smis = [
             # [input smiles, pka, protonated, deprotonated, category]
-            ["C#CCO",                  "C#CCO",                     "C#CC[O-]",                 "Alcohol"],
-            ["C(=O)N",                 "NC=O",                      "[NH-]C=O",                 "Amide"],
-            ["CC(=O)NOC(C)=O",         "CC(=O)NOC(C)=O",            "CC(=O)[N-]OC(C)=O",        "Amide_electronegative"],
-            ["COC(=N)N",               "COC(N)=[NH2+]",             "COC(=N)N",                 "AmidineGuanidine2"],
-            ["Brc1ccc(C2NCCS2)cc1",    "Brc1ccc(C2[NH2+]CCS2)cc1",  "Brc1ccc(C2NCCS2)cc1",      "Amines_primary_secondary_tertiary"],
-            ["CC(=O)[n+]1ccc(N)cc1",   "CC(=O)[n+]1ccc([NH3+])cc1", "CC(=O)[n+]1ccc(N)cc1",     "Anilines_primary"],
-            ["CCNc1ccccc1",            "CC[NH2+]c1ccccc1",          "CCNc1ccccc1",              "Anilines_secondary"],
-            ["Cc1ccccc1N(C)C",         "Cc1ccccc1[NH+](C)C",        "Cc1ccccc1N(C)C",           "Anilines_tertiary"],
-            ["BrC1=CC2=C(C=C1)NC=C2",  "Brc1ccc2[nH]ccc2c1",        "Brc1ccc2[n-]ccc2c1",       "Indole_pyrrole"],
-            ["O=c1cc[nH]cc1",          "O=c1cc[nH]cc1",             "O=c1cc[n-]cc1",            "Aromatic_nitrogen_protonated"],
-            ["C-N=[N+]=[N@H]",         "CN=[N+]=N",                 "CN=[N+]=[N-]",             "Azide"],
-            ["BrC(C(O)=O)CBr",         "O=C(O)C(Br)CBr",            "O=C([O-])C(Br)CBr",        "Carboxyl"],
-            ["NC(NN=O)=N",             "NC(=[NH2+])NN=O",           "N=C(N)NN=O",               "AmidineGuanidine1"],
-            ["C(F)(F)(F)C(=O)NC(=O)C", "CC(=O)NC(=O)C(F)(F)F",      "CC(=O)[N-]C(=O)C(F)(F)F",  "Imide"],
-            ["O=C(C)NC(C)=O",          "CC(=O)NC(C)=O",             "CC(=O)[N-]C(C)=O",         "Imide2"],
-            ["CC(C)(C)C(N(C)O)=O",     "CN(O)C(=O)C(C)(C)C",        "CN([O-])C(=O)C(C)(C)C",    "N-hydroxyamide"],
-            ["C[N+](O)=O",             "C[N+](=O)O",                "C[N+](=O)[O-]",            "Nitro"],
-            ["O=C1C=C(O)CC1",          "O=C1C=C(O)CC1",             "O=C1C=C([O-])CC1",         "O=C-C=C-OH"],
-            ["C1CC1OO",                "OOC1CC1",                   "[O-]OC1CC1",               "Peroxide2"],
-            ["C(=O)OO",                "O=COO",                     "O=CO[O-]",                 "Peroxide1"],
-            ["Brc1cc(O)cc(Br)c1",      "Oc1cc(Br)cc(Br)c1",         "[O-]c1cc(Br)cc(Br)c1",     "Phenol"],
-            ["CC(=O)c1ccc(S)cc1",      "CC(=O)c1ccc(S)cc1",         "CC(=O)c1ccc([S-])cc1",     "Phenyl_Thiol"],
-            ["C=CCOc1ccc(C(=O)O)cc1",  "C=CCOc1ccc(C(=O)O)cc1",     "C=CCOc1ccc(C(=O)[O-])cc1", "Phenyl_carboxyl"],
-            ["COP(=O)(O)OC",           "COP(=O)(O)OC",              "COP(=O)([O-])OC",          "Phosphate_diester"],
-            ["CP(C)(=O)O",             "CP(C)(=O)O",                "CP(C)(=O)[O-]",            "Phosphinic_acid"],
-            ["CC(C)OP(C)(=O)O",        "CC(C)OP(C)(=O)O",           "CC(C)OP(C)(=O)[O-]",       "Phosphonate_ester"],
-            ["CC1(C)OC(=O)NC1=O",      "CC1(C)OC(=O)NC1=O",         "CC1(C)OC(=O)[N-]C1=O",     "Ringed_imide1"],
-            ["O=C(N1)C=CC1=O",         "O=C1C=CC(=O)N1",            "O=C1C=CC(=O)[N-]1",        "Ringed_imide2"],
-            ["O=S(OC)(O)=O",           "COS(=O)(=O)O",              "COS(=O)(=O)[O-]",          "Sulfate"],
-            ["COc1ccc(S(=O)O)cc1",     "COc1ccc(S(=O)O)cc1",        "COc1ccc(S(=O)[O-])cc1",    "Sulfinic_acid"],
-            ["CS(N)(=O)=O",            "CS(N)(=O)=O",               "CS([NH-])(=O)=O",          "Sulfonamide"],
-            ["CC(=O)CSCCS(O)(=O)=O",   "CC(=O)CSCCS(=O)(=O)O",      "CC(=O)CSCCS(=O)(=O)[O-]",  "Sulfonate"],
-            ["CC(=O)S",                "CC(=O)S",                   "CC(=O)[S-]",               "Thioic_acid"],
-            ["C(C)(C)(C)(S)",          "CC(C)(C)S",                 "CC(C)(C)[S-]",             "Thiol"],
-            ["Brc1cc[nH+]cc1",         "Brc1cc[nH+]cc1",            "Brc1ccncc1",               "Aromatic_nitrogen_unprotonated"],
-            ["C=C(O)c1c(C)cc(C)cc1C",  "C=C(O)c1c(C)cc(C)cc1C",     "C=C([O-])c1c(C)cc(C)cc1C", "Vinyl_alcohol"],
-            ["CC(=O)ON",               "CC(=O)O[NH3+]",             "CC(=O)ON",                 "Primary_hydroxyl_amine"]
+            ["C#CCO", "C#CCO", "C#CC[O-]", "Alcohol"],
+            ["C(=O)N", "NC=O", "[NH-]C=O", "Amide"],
+            [
+                "CC(=O)NOC(C)=O",
+                "CC(=O)NOC(C)=O",
+                "CC(=O)[N-]OC(C)=O",
+                "Amide_electronegative",
+            ],
+            ["COC(=N)N", "COC(N)=[NH2+]", "COC(=N)N", "AmidineGuanidine2"],
+            [
+                "Brc1ccc(C2NCCS2)cc1",
+                "Brc1ccc(C2[NH2+]CCS2)cc1",
+                "Brc1ccc(C2NCCS2)cc1",
+                "Amines_primary_secondary_tertiary",
+            ],
+            [
+                "CC(=O)[n+]1ccc(N)cc1",
+                "CC(=O)[n+]1ccc([NH3+])cc1",
+                "CC(=O)[n+]1ccc(N)cc1",
+                "Anilines_primary",
+            ],
+            ["CCNc1ccccc1", "CC[NH2+]c1ccccc1", "CCNc1ccccc1", "Anilines_secondary"],
+            [
+                "Cc1ccccc1N(C)C",
+                "Cc1ccccc1[NH+](C)C",
+                "Cc1ccccc1N(C)C",
+                "Anilines_tertiary",
+            ],
+            [
+                "BrC1=CC2=C(C=C1)NC=C2",
+                "Brc1ccc2[nH]ccc2c1",
+                "Brc1ccc2[n-]ccc2c1",
+                "Indole_pyrrole",
+            ],
+            [
+                "O=c1cc[nH]cc1",
+                "O=c1cc[nH]cc1",
+                "O=c1cc[n-]cc1",
+                "Aromatic_nitrogen_protonated",
+            ],
+            ["C-N=[N+]=[N@H]", "CN=[N+]=N", "CN=[N+]=[N-]", "Azide"],
+            ["BrC(C(O)=O)CBr", "O=C(O)C(Br)CBr", "O=C([O-])C(Br)CBr", "Carboxyl"],
+            ["NC(NN=O)=N", "NC(=[NH2+])NN=O", "N=C(N)NN=O", "AmidineGuanidine1"],
+            [
+                "C(F)(F)(F)C(=O)NC(=O)C",
+                "CC(=O)NC(=O)C(F)(F)F",
+                "CC(=O)[N-]C(=O)C(F)(F)F",
+                "Imide",
+            ],
+            ["O=C(C)NC(C)=O", "CC(=O)NC(C)=O", "CC(=O)[N-]C(C)=O", "Imide2"],
+            [
+                "CC(C)(C)C(N(C)O)=O",
+                "CN(O)C(=O)C(C)(C)C",
+                "CN([O-])C(=O)C(C)(C)C",
+                "N-hydroxyamide",
+            ],
+            ["C[N+](O)=O", "C[N+](=O)O", "C[N+](=O)[O-]", "Nitro"],
+            ["O=C1C=C(O)CC1", "O=C1C=C(O)CC1", "O=C1C=C([O-])CC1", "O=C-C=C-OH"],
+            ["C1CC1OO", "OOC1CC1", "[O-]OC1CC1", "Peroxide2"],
+            ["C(=O)OO", "O=COO", "O=CO[O-]", "Peroxide1"],
+            [
+                "Brc1cc(O)cc(Br)c1",
+                "Oc1cc(Br)cc(Br)c1",
+                "[O-]c1cc(Br)cc(Br)c1",
+                "Phenol",
+            ],
+            [
+                "CC(=O)c1ccc(S)cc1",
+                "CC(=O)c1ccc(S)cc1",
+                "CC(=O)c1ccc([S-])cc1",
+                "Phenyl_Thiol",
+            ],
+            [
+                "C=CCOc1ccc(C(=O)O)cc1",
+                "C=CCOc1ccc(C(=O)O)cc1",
+                "C=CCOc1ccc(C(=O)[O-])cc1",
+                "Phenyl_carboxyl",
+            ],
+            ["COP(=O)(O)OC", "COP(=O)(O)OC", "COP(=O)([O-])OC", "Phosphate_diester"],
+            ["CP(C)(=O)O", "CP(C)(=O)O", "CP(C)(=O)[O-]", "Phosphinic_acid"],
+            [
+                "CC(C)OP(C)(=O)O",
+                "CC(C)OP(C)(=O)O",
+                "CC(C)OP(C)(=O)[O-]",
+                "Phosphonate_ester",
+            ],
+            [
+                "CC1(C)OC(=O)NC1=O",
+                "CC1(C)OC(=O)NC1=O",
+                "CC1(C)OC(=O)[N-]C1=O",
+                "Ringed_imide1",
+            ],
+            ["O=C(N1)C=CC1=O", "O=C1C=CC(=O)N1", "O=C1C=CC(=O)[N-]1", "Ringed_imide2"],
+            ["O=S(OC)(O)=O", "COS(=O)(=O)O", "COS(=O)(=O)[O-]", "Sulfate"],
+            [
+                "COc1ccc(S(=O)O)cc1",
+                "COc1ccc(S(=O)O)cc1",
+                "COc1ccc(S(=O)[O-])cc1",
+                "Sulfinic_acid",
+            ],
+            ["CS(N)(=O)=O", "CS(N)(=O)=O", "CS([NH-])(=O)=O", "Sulfonamide"],
+            [
+                "CC(=O)CSCCS(O)(=O)=O",
+                "CC(=O)CSCCS(=O)(=O)O",
+                "CC(=O)CSCCS(=O)(=O)[O-]",
+                "Sulfonate",
+            ],
+            ["CC(=O)S", "CC(=O)S", "CC(=O)[S-]", "Thioic_acid"],
+            ["C(C)(C)(C)(S)", "CC(C)(C)S", "CC(C)(C)[S-]", "Thiol"],
+            [
+                "Brc1cc[nH+]cc1",
+                "Brc1cc[nH+]cc1",
+                "Brc1ccncc1",
+                "Aromatic_nitrogen_unprotonated",
+            ],
+            [
+                "C=C(O)c1c(C)cc(C)cc1C",
+                "C=C(O)c1c(C)cc(C)cc1C",
+                "C=C([O-])c1c(C)cc(C)cc1C",
+                "Vinyl_alcohol",
+            ],
+            ["CC(=O)ON", "CC(=O)O[NH3+]", "CC(=O)ON", "Primary_hydroxyl_amine"],
         ]
 
         smis_phos = [
-            ["O=P(O)(O)OCCCC", "CCCCOP(=O)(O)O", "CCCCOP(=O)([O-])O", "CCCCOP(=O)([O-])[O-]", "Phosphate"],
-            ["CC(P(O)(O)=O)C", "CC(C)P(=O)(O)O", "CC(C)P(=O)([O-])O", "CC(C)P(=O)([O-])[O-]", "Phosphonate"]
+            [
+                "O=P(O)(O)OCCCC",
+                "CCCCOP(=O)(O)O",
+                "CCCCOP(=O)([O-])O",
+                "CCCCOP(=O)([O-])[O-]",
+                "Phosphate",
+            ],
+            [
+                "CC(P(O)(O)=O)C",
+                "CC(C)P(=O)(O)O",
+                "CC(C)P(=O)([O-])O",
+                "CC(C)P(=O)([O-])[O-]",
+                "Phosphonate",
+            ],
         ]
 
         # Load the average pKa values.
-        average_pkas = {l.split()[0].replace("*", ""):float(l.split()[3]) for l in open("site_substructures.smarts") if l.split()[0] not in ["Phosphate", "Phosphonate"]}
-        average_pkas_phos = {l.split()[0].replace("*", ""):[float(l.split()[3]), float(l.split()[6])] for l in open("site_substructures.smarts") if l.split()[0] in ["Phosphate", "Phosphonate"]}
+        average_pkas = {
+            l.split()[0].replace("*", ""): float(l.split()[3])
+            for l in open("site_substructures.smarts")
+            if l.split()[0] not in ["Phosphate", "Phosphonate"]
+        }
+        average_pkas_phos = {
+            l.split()[0].replace("*", ""): [float(l.split()[3]), float(l.split()[6])]
+            for l in open("site_substructures.smarts")
+            if l.split()[0] in ["Phosphate", "Phosphonate"]
+        }
 
         print("Running Tests")
         print("=============")
@@ -900,7 +1083,7 @@
             "max_ph": -10000000,
             "pka_precision": 0.5,
             "smiles": "",
-            "label_states": True
+            "label_states": True,
         }
 
         for smi, protonated, deprotonated, category in smis:
@@ -954,14 +1137,20 @@
             args["min_ph"] = avg_pka
             args["max_ph"] = avg_pka
 
-            TestFuncs.test_check(args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"])
+            TestFuncs.test_check(
+                args, [mix, deprotonated], ["DEPROTONATED", "DEPROTONATED"]
+            )
 
-            avg_pka = 0.5 * (average_pkas_phos[category][0] + average_pkas_phos[category][1])
+            avg_pka = 0.5 * (
+                average_pkas_phos[category][0] + average_pkas_phos[category][1]
+            )
             args["min_ph"] = avg_pka
             args["max_ph"] = avg_pka
             args["pka_precision"] = 5  # Should give all three
 
-            TestFuncs.test_check(args, [mix, deprotonated, protonated], ["BOTH", "BOTH"])
+            TestFuncs.test_check(
+                args, [mix, deprotonated, protonated], ["BOTH", "BOTH"]
+            )
 
     @staticmethod
     def test_check(args, expected_output, labels):
@@ -981,28 +1170,56 @@
 
         num_states = len(expected_output)
 
-        if (len(output) != num_states):
-            msg = args["smiles"] + " should have " + str(num_states) + \
-                " states at at pH " + str(args["min_ph"]) + ": " + str(output)
+        if len(output) != num_states:
+            msg = (
+                args["smiles"]
+                + " should have "
+                + str(num_states)
+                + " states at at pH "
+                + str(args["min_ph"])
+                + ": "
+                + str(output)
+            )
             print(msg)
             raise Exception(msg)
 
-        if (len(set([l[0] for l in output]) - set(expected_output)) != 0):
-            msg = args["smiles"] + " is not " + " AND ".join(expected_output) + \
-                " at pH " + str(args["min_ph"]) + " - " + str(args["max_ph"]) + \
-                "; it is " + " AND ".join([l[0] for l in output])
+        if len(set([l[0] for l in output]) - set(expected_output)) != 0:
+            msg = (
+                args["smiles"]
+                + " is not "
+                + " AND ".join(expected_output)
+                + " at pH "
+                + str(args["min_ph"])
+                + " - "
+                + str(args["max_ph"])
+                + "; it is "
+                + " AND ".join([l[0] for l in output])
+            )
             print(msg)
             raise Exception(msg)
 
-        if (len(set([l[1] for l in output]) - set(labels)) != 0):
-            msg = args["smiles"] + " not labeled as " + " AND ".join(labels) + \
-                "; it is " + " AND ".join([l[1] for l in output])
+        if len(set([l[1] for l in output]) - set(labels)) != 0:
+            msg = (
+                args["smiles"]
+                + " not labeled as "
+                + " AND ".join(labels)
+                + "; it is "
+                + " AND ".join([l[1] for l in output])
+            )
             print(msg)
             raise Exception(msg)
 
         ph_range = sorted(list(set([args["min_ph"], args["max_ph"]])))
         ph_range_str = "(" + " - ".join("{0:.2f}".format(n) for n in ph_range) + ")"
-        print("(CORRECT) " + ph_range_str.ljust(10) + " " + args["smiles"] + " => " + " AND ".join([l[0] for l in output]))
+        print(
+            "(CORRECT) "
+            + ph_range_str.ljust(10)
+            + " "
+            + args["smiles"]
+            + " => "
+            + " AND ".join([l[0] for l in output])
+        )
+
 
 def run(**kwargs):
     """A helpful, importable function for those who want to call Dimorphite-DL
@@ -1019,6 +1236,7 @@
     # Run the main function with the specified arguments.
     main(kwargs)
 
+
 def run_with_mol_list(mol_lst, **kwargs):
     """A helpful, importable function for those who want to call Dimorphite-DL
     from another Python script rather than the command line. Note that this
@@ -1037,10 +1255,13 @@
     # Do a quick check to make sure the user input makes sense.
     for bad_arg in ["smiles", "smiles_file", "output_file", "test"]:
         if bad_arg in kwargs:
-            msg = "You're using Dimorphite-DL's run_with_mol_list(mol_lst, " + \
-                   "**kwargs) function, but you also passed the \"" + \
-                   bad_arg + "\" argument. Did you mean to use the " + \
-                   "run(**kwargs) function instead?"
+            msg = (
+                "You're using Dimorphite-DL's run_with_mol_list(mol_lst, "
+                + '**kwargs) function, but you also passed the "'
+                + bad_arg
+                + '" argument. Did you mean to use the '
+                + "run(**kwargs) function instead?"
+            )
             print(msg)
             raise Exception(msg)
 
@@ -1076,9 +1297,15 @@
                     m.SetProp(prop, str(val))
             mols.append(m)
         else:
-            UtilFuncs.eprint("WARNING: Could not process molecule with SMILES string " + s + " and properties " + str(props))
+            UtilFuncs.eprint(
+                "WARNING: Could not process molecule with SMILES string "
+                + s
+                + " and properties "
+                + str(props)
+            )
 
     return mols
 
+
 if __name__ == "__main__":
     main()
diff -r 351fbd750a6d -r 4beb3e026bbb rdconf.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/rdconf.py	Sat Dec 04 16:39:05 2021 +0000
@@ -0,0 +1,229 @@
+#!/usr/bin/python3
+
+import gzip
+import os
+import sys
+from optparse import OptionParser
+
+from rdkit.Chem import AllChem as Chem
+
+"""
+This script was originally written by David Koes, University of Pittsburgh:
+https://github.com/dkoes/rdkit-scripts/blob/master/rdconf.py
+It is licensed under the MIT licence.
+
+Given a smiles file, generate 3D conformers in output sdf.
+Energy minimizes and filters conformers to meet energy window and rms constraints.
+
+Some time ago I compared this to alternative conformer generators and
+it was quite competitive (especially after RDKit's UFF implementation
+added OOP terms).
+"""
+
+
+# RMS between conformers c1 and c2 of mol, as reported by Chem.GetBestRMS
+def getRMS(mol, c1, c2):
+    rms = Chem.GetBestRMS(mol, mol, c1, c2)
+    return rms
+
+
+parser = OptionParser(usage="Usage: %prog [options] <input>.smi <output>.sdf")
+parser.add_option(
+    "--maxconfs",
+    dest="maxconfs",
+    action="store",
+    help="maximum number of conformers to generate per a molecule (default 20)",
+    default="20",
+    type="int",
+    metavar="CNT",
+)
+parser.add_option(
+    "--sample_multiplier",
+    dest="sample",
+    action="store",
+    help="sample N*maxconfs conformers and choose the maxconformers with lowest energy (default 1)",
+    default="1",
+    type="float",
+    metavar="N",
+)
+parser.add_option(
+    "--seed",
+    dest="seed",
+    action="store",
+    help="random seed (default 9162006)",
+    default="9162006",
+    type="int",
+    metavar="s",
+)
+parser.add_option(
+    "--rms_threshold",
+    dest="rms",
+    action="store",
+    help="filter based on rms (default 0.7)",
+    default="0.7",
+    type="float",
+    metavar="R",
+)
+parser.add_option(
+    "--energy_window",
+    dest="energy",
+    action="store",
+    help="filter based on energy difference with lowest energy conformer",
+    default="10",
+    type="float",
+    metavar="E",
+)
+parser.add_option(
+    "-v",
+    "--verbose",
+    dest="verbose",
+    action="store_true",
+    default=False,
+    help="verbose output",
+)
+parser.add_option(
+    "--mmff",
+    dest="mmff",
+    action="store_true",
+    default=False,
+    help="use MMFF forcefield instead of UFF",
+)
+parser.add_option(
+    "--nomin",
+    dest="nomin",
+    action="store_true",
+    default=False,
+    help="don't perform energy minimization (bad idea)",
+)
+parser.add_option(
+    "--etkdg",
+    dest="etkdg",
+    action="store_true",
+    default=False,
+    help="use new ETKDG knowledge-based method instead of distance geometry",
+)
+
+
+(options, args) = parser.parse_args()
+
+if len(args) < 2:
+    parser.error("Need input and output")
+    sys.exit(-1)  # not reached: OptionParser.error() exits on its own
+
+input = args[0]
+output = args[1]
+smifile = open(input)
+if options.verbose:
+    print("Generating a maximum of", options.maxconfs, "per a mol")
+
+if options.etkdg and not hasattr(Chem, "ETKDG"):  # absent in older RDKit
+    print("ETKDG does not appear to be implemented.  Please upgrade RDKit.")
+    sys.exit(1)
+
+split = os.path.splitext(output)
+if split[1] == ".gz":
+    outf = gzip.open(output, "wt+")
+    output = split[0]  # strip .gz
+else:
+    outf = open(output, "w+")
+
+# A ".pdb" name (after any ".gz" is stripped) selects PDB output, else SDF.
+if os.path.splitext(output)[1] == ".pdb":
+    sdwriter = Chem.PDBWriter(outf)
+else:
+    sdwriter = Chem.SDWriter(outf)
+
+if sdwriter is None:
+    print("Could not open " + output)
+    sys.exit(-1)
+
+for line in smifile:
+    toks = line.split()
+    smi = toks[0]
+    name = " ".join(toks[1:])
+
+    pieces = smi.split(".")
+    if len(pieces) > 1:
+        smi = max(pieces, key=len)  # take largest component by length
+        print("Taking largest component: %s\t%s" % (smi, name))
+
+    mol = Chem.MolFromSmiles(smi)
+    if mol is not None:
+        if options.verbose:
+            print(smi)
+        try:
+            Chem.SanitizeMol(mol)
+            mol = Chem.AddHs(mol)
+            mol.SetProp("_Name", name)
+
+            if options.etkdg:
+                cids = Chem.EmbedMultipleConfs(
+                    mol, int(options.sample * options.maxconfs), Chem.ETKDG()
+                )
+            else:
+                cids = Chem.EmbedMultipleConfs(
+                    mol, int(options.sample * options.maxconfs), randomSeed=options.seed
+                )
+            if options.verbose:
+                print(len(cids), "conformers found")
+            cenergy = []
+            for conf in cids:
+                # confId is required (default minimizes only the first); 0 == converged
+                if options.nomin:
+                    cenergy.append(conf)  # no energies: rank by conformer id
+                elif options.mmff:
+                    converged = not Chem.MMFFOptimizeMolecule(mol, confId=conf)
+                    mp = Chem.MMFFGetMoleculeProperties(mol)
+                    cenergy.append(
+                        Chem.MMFFGetMoleculeForceField(
+                            mol, mp, confId=conf
+                        ).CalcEnergy()
+                    )
+                else:
+                    converged = not Chem.UFFOptimizeMolecule(mol, confId=conf)
+                    cenergy.append(
+                        Chem.UFFGetMoleculeForceField(mol, confId=conf).CalcEnergy()
+                    )
+                if options.verbose and not options.nomin:
+                    print("Convergence of conformer", conf, converged)
+
+            mol = Chem.RemoveHs(mol)
+            sortedcids = sorted(cids, key=lambda cid: cenergy[cid])
+            if len(sortedcids) > 0:
+                mine = cenergy[sortedcids[0]]
+            else:
+                mine = 0
+            if options.rms == 0:
+                cnt = 0
+                for conf in sortedcids:
+                    if cnt >= options.maxconfs:
+                        break
+                    if (options.energy < 0) or cenergy[conf] - mine <= options.energy:
+                        sdwriter.write(mol, conf)
+                        cnt += 1
+            else:
+                written = {}
+                for conf in sortedcids:
+                    if len(written) >= options.maxconfs:
+                        break
+                    # check rmsd
+                    passed = True
+                    for seenconf in written.keys():
+                        rms = getRMS(mol, seenconf, conf)
+                        if (rms < options.rms) or (
+                            options.energy > 0 and cenergy[conf] - mine > options.energy
+                        ):
+                            passed = False
+                            break
+                    if passed:
+                        written[conf] = True
+                        sdwriter.write(mol, conf)
+        except (KeyboardInterrupt, SystemExit):
+            raise
+        except Exception as e:
+            print("Exception", e)
+    else:
+        print("ERROR:", smi)
+
+sdwriter.close()
+outf.close()
diff -r 351fbd750a6d -r 4beb3e026bbb rdkit_descriptors.py
--- a/rdkit_descriptors.py	Wed Feb 17 13:00:12 2021 +0000
+++ b/rdkit_descriptors.py	Sat Dec 04 16:39:05 2021 +0000
@@ -8,7 +8,7 @@
 from rdkit.Chem import Descriptors
 
 
-def get_supplier(infile, format='smiles'):
+def get_supplier(infile, format="smiles"):
     """
     Returns a generator over a SMILES or InChI file. Every element is of RDKit
     molecule and has its original string as _Name property.
@@ -16,14 +16,20 @@
     with open(infile) as handle:
         for line in handle:
             line = line.strip()
-            if format == 'smiles':
+            if format == "smiles":
                 mol = Chem.MolFromSmiles(line, sanitize=True)
-            elif format == 'inchi':
-                mol = Chem.inchi.MolFromInchi(line, sanitize=True, removeHs=True, logLevel=None, treatWarningAsError=False)
+            elif format == "inchi":
+                mol = Chem.inchi.MolFromInchi(
+                    line,
+                    sanitize=True,
+                    removeHs=True,
+                    logLevel=None,
+                    treatWarningAsError=False,
+                )
             if mol is None:
                 yield False
             else:
-                mol.SetProp('_Name', line.split('\t')[0])
+                mol.SetProp("_Name", line.split("\t")[0])
                 yield mol
 
 
@@ -31,9 +37,13 @@
     """
     Returns all descriptor functions under the Chem.Descriptors Module as tuple of (name, function)
     """
-    ret = [(name, f) for name, f in inspect.getmembers(Descriptors) if inspect.isfunction(f) and not name.startswith('_')]
+    ret = [
+        (name, f)
+        for name, f in inspect.getmembers(Descriptors)
+        if inspect.isfunction(f) and not name.startswith("_")
+    ]
     # some which are not in the official Descriptors module we need to add manually
-    ret.extend([('FormalCharge', Chem.GetFormalCharge), ('SSSR', Chem.GetSSSR)])
+    ret.extend([("FormalCharge", Chem.GetFormalCharge), ("SSSR", Chem.GetSSSR)])
     ret.sort()
     return ret
 
@@ -48,40 +58,54 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
+    parser.add_argument("-i", "--infile", required=True, help="Path to the input file.")
     parser.add_argument("--iformat", help="Specify the input file format.")
 
-    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
-                        default=sys.stdout,
-                        help="path to the result file, default is stdout")
+    parser.add_argument(
+        "-o",
+        "--outfile",
+        type=argparse.FileType("w+"),
+        default=sys.stdout,
+        help="path to the result file, default is stdout",
+    )
 
-    parser.add_argument('-s', '--select', default=None,
-                        help="select a subset of comma-separated descriptors to use")
+    parser.add_argument(
+        "-s",
+        "--select",
+        default=None,
+        help="select a subset of comma-separated descriptors to use",
+    )
 
-    parser.add_argument("--header", dest="header", action="store_true",
-                        default=False,
-                        help="Write header line.")
+    parser.add_argument(
+        "--header",
+        dest="header",
+        action="store_true",
+        default=False,
+        help="Write header line.",
+    )
 
     args = parser.parse_args()
 
-    if args.iformat == 'sdf':
+    if args.iformat == "sdf":
         supplier = Chem.SDMolSupplier(args.infile)
-    elif args.iformat == 'smi':
-        supplier = get_supplier(args.infile, format='smiles')
-    elif args.iformat == 'inchi':
-        supplier = get_supplier(args.infile, format='inchi')
-    elif args.iformat == 'pdb':
+    elif args.iformat == "smi":
+        supplier = get_supplier(args.infile, format="smiles")
+    elif args.iformat == "inchi":
+        supplier = get_supplier(args.infile, format="inchi")
+    elif args.iformat == "pdb":
         supplier = [Chem.MolFromPDBFile(args.infile)]
-    elif args.iformat == 'mol2':
+    elif args.iformat == "mol2":
         supplier = [Chem.MolFromMol2File(args.infile)]
 
     functions = get_rdkit_descriptor_functions()
-    if args.select and args.select != 'None':
-        selected = args.select.split(',')
+    if args.select and args.select != "None":
+        selected = args.select.split(",")
         functions = [(name, f) for name, f in functions if name in selected]
 
     if args.header:
-        args.outfile.write('%s\n' % '\t'.join(['MoleculeID'] + [name for name, f in functions]))
+        args.outfile.write(
+            "%s\n" % "\t".join(["MoleculeID"] + [name for name, f in functions])
+        )
 
     for mol in supplier:
         if not mol:
@@ -91,4 +115,7 @@
             molecule_id = mol.GetProp("_Name")
         except KeyError:
             molecule_id = Chem.MolToSmiles(mol)
-        args.outfile.write("%s\n" % '\t'.join([molecule_id] + [str(round(res, 6)) for name, res in descs]))
+        args.outfile.write(
+            "%s\n"
+            % "\t".join([molecule_id] + [str(round(res, 6)) for name, res in descs])
+        )
diff -r 351fbd750a6d -r 4beb3e026bbb sdf_to_tab.py
--- a/sdf_to_tab.py	Wed Feb 17 13:00:12 2021 +0000
+++ b/sdf_to_tab.py	Sat Dec 04 16:39:05 2021 +0000
@@ -13,36 +13,55 @@
         if mols[n]:
             d = mols[n].GetPropsAsDict()
             # filter dict for desired props
-            if vars.props.strip() == '':  # none specified, return all
-                d = {prop: val for (prop, val) in d.items() if not any(x in str(val) for x in ['\n', '\t'])}  # remove items containing newlines or tabs
+            if vars.props.strip() == "":  # none specified, return all
+                d = {
+                    prop: val
+                    for (prop, val) in d.items()
+                    if not any(x in str(val) for x in ["\n", "\t"])
+                }  # remove items containing newlines or tabs
             else:
-                d = {prop: val for (prop, val) in d.items() if prop in vars.props.replace(' ', '').split(',')}  # remove items not requested via CLI
+                d = {
+                    prop: val
+                    for (prop, val) in d.items()
+                    if prop in vars.props.replace(" ", "").split(",")
+                }  # remove items not requested via CLI
             if vars.name:
-                d['SDFMoleculeName'] = mols[n].GetProp('_Name')
+                d["SDFMoleculeName"] = mols[n].GetProp("_Name")
             if vars.smiles:
-                d['SMILES'] = Chem.MolToSmiles(mols[n], isomericSmiles=False)
-            d['Index'] = int(n)
+                d["SMILES"] = Chem.MolToSmiles(mols[n], isomericSmiles=False)
+            d["Index"] = int(n)
 
             df = df.append(d, ignore_index=True)
         else:
             print("Molecule could not be read - skipped.")
 
-    df = df.astype({'Index': int}).set_index('Index')
+    df = df.astype({"Index": int}).set_index("Index")
     sorted_cols = sorted(df.columns.values.tolist())
-    df.to_csv(vars.out, sep='\t', header=vars.header, columns=sorted_cols)
+    df.to_csv(vars.out, sep="\t", header=vars.header, columns=sorted_cols)
 
 
 def main():
     parser = argparse.ArgumentParser(description="Convert SDF to tabular")
-    parser.add_argument('--inp', '-i', help="The input file", required=True)
-    parser.add_argument('--out', '-o', help="The output file", required=True)
-    parser.add_argument('--props', '-p', help="Properties to filter (leave blank for all)", required=True)
-    parser.add_argument('--header', '-t', action='store_true',
-                        help="Write property name as the first row.")
-    parser.add_argument('--smiles', '-s', action='store_true',
-                        help="Include SMILES in output.")
-    parser.add_argument('--name', '-n', action='store_true',
-                        help="Include molecule name in output.")
+    parser.add_argument("--inp", "-i", help="The input file", required=True)
+    parser.add_argument("--out", "-o", help="The output file", required=True)
+    parser.add_argument(
+        "--props",
+        "-p",
+        help="Properties to filter (leave blank for all)",
+        required=True,
+    )
+    parser.add_argument(
+        "--header",
+        "-t",
+        action="store_true",
+        help="Write property name as the first row.",
+    )
+    parser.add_argument(
+        "--smiles", "-s", action="store_true", help="Include SMILES in output."
+    )
+    parser.add_argument(
+        "--name", "-n", action="store_true", help="Include molecule name in output."
+    )
     sdf_to_tab(parser.parse_args())
 
 
diff -r 351fbd750a6d -r 4beb3e026bbb test-data/rdconf_output.sdf
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rdconf_output.sdf	Sat Dec 04 16:39:05 2021 +0000
@@ -0,0 +1,166 @@
+staurosporine
+     RDKit          3D
+
+ 35 42  0  0  0  0  0  0  0  0999 V2000
+   -2.1656    1.4438   -2.0402 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.5064    0.5224   -1.0006 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -2.5333    0.2902    0.0771 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -3.0448   -1.1355   -0.1222 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -1.8499   -2.0325    0.1086 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.9248   -1.7530   -1.0664 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -1.2333   -0.6270   -1.7456 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.4794   -1.8256   -0.6986 N   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2443   -2.8970   -0.6134 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.9172   -4.2121   -0.8557 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.8835   -5.2113   -0.7023 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.1496   -4.8403   -0.3079 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4981   -3.5086   -0.0573 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.5336   -2.5530   -0.2153 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.5282   -1.1902   -0.0555 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.4701   -0.2506    0.3363 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.1276    1.0747    0.4177 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.8462    1.4665    0.1077 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2026    2.7102    0.1107 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.7041    3.9456    0.4421 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.8338    5.0265    0.3768 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.4727    4.8205   -0.0097 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.9490    3.5396   -0.3412 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.0910    2.4739   -0.2786 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.2989    1.1681   -0.5288 N   0  0  0  0  0  0  0  0  0  0  0  0
+    0.9108    0.5646   -0.2791 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2388   -0.7709   -0.3642 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.2788    1.9083    0.8642 C   0  0  0  0  0  0  0  0  0  0  0  0
+    5.3369    0.9521    1.0170 N   0  0  0  0  0  0  0  0  0  0  0  0
+    4.8631   -0.3694    0.7084 C   0  0  0  0  0  0  0  0  0  0  0  0
+    5.6279   -1.3464    0.7907 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.1928   -1.4531    0.6537 N   0  0  0  0  0  0  0  0  0  0  0  0
+   -5.2852   -0.5939    0.2385 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.8857    0.2845    1.3121 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.4801    1.0939    2.2570 C   0  0  0  0  0  0  0  0  0  0  0  0
+  2  1  1  6
+  2  3  1  0
+  3  4  1  0
+  4  5  1  0
+  6  5  1  1
+  6  7  1  0
+  6  8  1  0
+  8  9  1  0
+  9 10  2  0
+ 10 11  1  0
+ 11 12  2  0
+ 12 13  1  0
+ 13 14  2  0
+ 14 15  1  0
+ 15 16  2  0
+ 16 17  1  0
+ 17 18  2  0
+ 18 19  1  0
+ 19 20  2  0
+ 20 21  1  0
+ 21 22  2  0
+ 22 23  1  0
+ 23 24  2  0
+ 24 25  1  0
+ 25 26  1  0
+ 26 27  2  0
+ 17 28  1  0
+ 28 29  1  0
+ 29 30  1  0
+ 30 31  2  0
+  4 32  1  1
+ 32 33  1  0
+  3 34  1  1
+ 34 35  1  0
+  7  2  1  0
+ 25  2  1  0
+ 27  8  1  0
+ 14  9  1  0
+ 27 15  1  0
+ 30 16  1  0
+ 26 18  1  0
+ 24 19  1  0
+M  END
+$$$$
+staurosporine
+     RDKit          3D
+
+ 35 42  0  0  0  0  0  0  0  0999 V2000
+   -2.3068    0.9355   -2.4621 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.6484    0.1936   -1.2955 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -2.6628   -0.4491   -0.4739 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -2.1102   -1.3841    0.5829 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -1.3580   -2.4343   -0.1341 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6301   -1.9615   -1.3703 C   0  0  1  0  0  0  0  0  0  0  0  0
+   -1.0034   -0.8456   -1.9685 O   0  0  0  0  0  0  0  0  0  0  0  0
+    0.7323   -1.7671   -0.8854 N   0  0  0  0  0  0  0  0  0  0  0  0
+    1.6653   -2.7175   -0.8097 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.6030   -4.0416   -1.1929 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.7336   -4.8209   -0.9993 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.8672   -4.2655   -0.4408 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.9097   -2.9234   -0.0599 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.7941   -2.1557   -0.2513 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.5524   -0.8186    0.0257 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.2591    0.2074    0.5765 C   0  0  0  0  0  0  0  0  0  0  0  0
+    2.6138    1.4431    0.6965 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.3053    1.6277    0.2726 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.4639    2.7123    0.2701 C   0  0  0  0  0  0  0  0  0  0  0  0
+    0.6421    4.0028    0.7116 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.3869    4.9173    0.5986 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.5824    4.5165    0.0416 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.7175    3.2124   -0.3878 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.7245    2.2626   -0.2991 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -0.6365    0.9796   -0.6340 N   0  0  0  0  0  0  0  0  0  0  0  0
+    0.6078    0.6133   -0.2727 C   0  0  0  0  0  0  0  0  0  0  0  0
+    1.2251   -0.6244   -0.3999 C   0  0  0  0  0  0  0  0  0  0  0  0
+    3.5496    2.4028    1.3130 C   0  0  0  0  0  0  0  0  0  0  0  0
+    4.7624    1.6652    1.5409 N   0  0  0  0  0  0  0  0  0  0  0  0
+    4.5952    0.3088    1.0910 C   0  0  0  0  0  0  0  0  0  0  0  0
+    5.5219   -0.5061    1.1939 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -1.3679   -0.6619    1.5643 N   0  0  0  0  0  0  0  0  0  0  0  0
+   -2.2073    0.2673    2.2885 C   0  0  0  0  0  0  0  0  0  0  0  0
+   -3.6735    0.2555    0.0846 O   0  0  0  0  0  0  0  0  0  0  0  0
+   -4.9519   -0.0773   -0.2685 C   0  0  0  0  0  0  0  0  0  0  0  0
+  2  1  1  6
+  2  3  1  0
+  3  4  1  0
+  4  5  1  0
+  6  5  1  1
+  6  7  1  0
+  6  8  1  0
+  8  9  1  0
+  9 10  2  0
+ 10 11  1  0
+ 11 12  2  0
+ 12 13  1  0
+ 13 14  2  0
+ 14 15  1  0
+ 15 16  2  0
+ 16 17  1  0
+ 17 18  2  0
+ 18 19  1  0
+ 19 20  2  0
+ 20 21  1  0
+ 21 22  2  0
+ 22 23  1  0
+ 23 24  2  0
+ 24 25  1  0
+ 25 26  1  0
+ 26 27  2  0
+ 17 28  1  0
+ 28 29  1  0
+ 29 30  1  0
+ 30 31  2  0
+  4 32  1  1
+ 32 33  1  0
+  3 34  1  1
+ 34 35  1  0
+  7  2  1  0
+ 25  2  1  0
+ 27  8  1  0
+ 14  9  1  0
+ 27 15  1  0
+ 30 16  1  0
+ 26 18  1  0
+ 24 19  1  0
+M  END
+$$$$
diff -r 351fbd750a6d -r 4beb3e026bbb test-data/staurosporine.smi
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/staurosporine.smi	Sat Dec 04 16:39:05 2021 +0000
@@ -0,0 +1,1 @@
+C[C@@]12[C@@H]([C@@H](C[C@@H](O1)N3C4=CC=CC=C4C5=C6C(=C7C8=CC=CC=C8N2C7=C53)CNC6=O)NC)OC	staurosporine