Mercurial > repos > bgruening > mordred
diff mordred_descriptors.py @ 0:ea68b86303e2 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
author | bgruening |
---|---|
date | Thu, 23 May 2019 18:31:43 -0400 |
parents | |
children | e2f40a02f31a |
line wrap: on
line diff
"""Calculate Mordred molecular descriptors for a chemical data file.

Molecules are read from an SDF, SMILES or InChI file with RDKit, every
Mordred descriptor is calculated for them, and the result is written as a
tab-separated table with one row per input molecule.
"""
import argparse

import numpy as np
import pandas as pd

# NOTE: RDKit and Mordred are imported inside the functions that need them so
# that ``--help`` and the pure helpers below work without loading the toolkits.


def convert_errors_to_nan(el):
    """
    Reduce one cell of the Mordred dataframe to a plain number.

    Booleans (Python or NumPy) are returned as ``int``; integers and floats
    (Python or NumPy) are returned unchanged; anything else is replaced by
    ``None`` so that it is written out as an empty field.
    """
    # bool must be tested first: it is a subclass of int.
    if isinstance(el, (bool, np.bool_)):
        return int(el)
    if isinstance(el, (int, float, np.integer, np.floating)):
        return el
    return None


def mol_supplier(filename, ext):
    """
    Based on the file extension, use the appropriate RDKit function to
    load a chemical data file (SDF, SMILES or InChI) containing multiple
    molecules and return a list of RDKit Mol objects.

    For 'smi' and 'inchi' input each line is parsed as one molecule; a blank
    line yields ``None`` so that the list stays aligned with the input lines.

    Raises ValueError if ``ext`` is not 'sdf', 'smi' or 'inchi'.
    """
    if ext not in ('sdf', 'smi', 'inchi'):
        raise ValueError(
            "Unsupported input format %r (expected 'sdf', 'smi' or 'inchi')" % (ext,)
        )

    from rdkit import Chem
    from rdkit.Chem.rdmolfiles import SDMolSupplier

    if ext == 'sdf':
        return list(SDMolSupplier(filename))

    parse = Chem.MolFromSmiles if ext == 'smi' else Chem.inchi.MolFromInchi
    # Iterating the file (instead of read().split('\n')) avoids a trailing
    # empty string, which would otherwise be parsed as an extra empty molecule.
    with open(filename) as f:
        lines = [line.rstrip('\n') for line in f]
    return [parse(line, sanitize=True) if line.strip() else None for line in lines]


def mordred_descriptors(mols, output, header, use_3d):
    """
    Calculate Mordred descriptors and save as tabular.

    mols   -- list of RDKit Mol objects; ``None`` entries mark invalid input
              and produce an empty row
    output -- path or writable file object passed to ``DataFrame.to_csv``
    header -- whether to write the descriptor names as a header line
    use_3d -- whether 3D descriptors are calculated as well
    """
    from mordred import Calculator, descriptors
    from rdkit import Chem

    calc = Calculator(descriptors, ignore_3D=(not use_3d))
    invalid_rows = [i for i, mol in enumerate(mols) if mol is None]
    # Replace invalid mols with an empty placeholder so row positions are kept.
    mols = [Chem.MolFromSmiles('') if mol is None else mol for mol in mols]
    df = calc.pandas(mols, quiet=True)  # calculate descriptors
    if invalid_rows:
        df.iloc[invalid_rows] = np.nan  # blank out the placeholder rows
    # DataFrame.applymap is deprecated since pandas 2.1 in favour of DataFrame.map.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    df = elementwise(convert_errors_to_nan)  # remove descriptors which errored
    df = df.round(6)
    df.to_csv(output, na_rep='', sep='\t', index=False, header=header)  # write output


def main():
    """Parse the command line and write the descriptor table."""
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
    parser.add_argument("--iformat", help="Specify the input file format.")

    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
                        help="Path to the result file")

    parser.add_argument("--3d", dest="use_3d", action="store_true",
                        default=False,
                        help="Use 3d descriptors - only with SDF input.")

    parser.add_argument("--header", dest="header", action="store_true",
                        default=False,
                        help="Write header line.")
    args = parser.parse_args()

    try:
        mols = mol_supplier(args.infile, args.iformat)
        mordred_descriptors(mols, args.outfile, args.header, args.use_3d)
    finally:
        # argparse.FileType opens the file but never closes it.
        if args.outfile is not None:
            args.outfile.close()


if __name__ == "__main__":
    main()