Mercurial > repos > bgruening > mordred
comparison mordred_descriptors.py @ 0:ea68b86303e2 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
author | bgruening |
---|---|
date | Thu, 23 May 2019 18:31:43 -0400 |
parents | |
children | e2f40a02f31a |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:ea68b86303e2 |
---|---|
1 import argparse | |
2 import numpy as np | |
3 import pandas as pd | |
4 from mordred import Calculator, descriptors | |
5 from mordred.error import Missing, Error | |
6 from rdkit import Chem | |
7 from rdkit.Chem.rdmolfiles import SDMolSupplier, SmilesMolSupplier | |
8 | |
9 | |
def convert_errors_to_nan(el):
    """
    Map a single cell of the Mordred dataframe to a plain numeric value.

    Booleans (Python or NumPy) are converted to the integers 0/1, real
    numbers (Python or NumPy integers and floats) are returned unchanged,
    and every other kind of object is replaced by None so that it ends up
    as a missing value in the output table.
    """
    # bool has to be tested first because it is a subclass of int
    if isinstance(el, (bool, np.bool_)):
        return int(el)
    # isinstance (rather than an exact type match) also accepts NumPy
    # scalars such as np.int64 or np.float32
    if isinstance(el, (int, float, np.integer, np.floating)):
        return el
    return None
20 | |
21 | |
def mol_supplier(filename, ext):
    """
    Based on the file extension, use the appropriate RDKit function to
    load a chemical data file containing multiple molecules and return
    a list with one entry per molecule.

    filename -- path to the input file
    ext -- input format: 'sdf', 'smi' (one SMILES per line) or
           'inchi' (one InChI per line)

    Raises ValueError for any other format instead of silently
    returning None.
    """
    if ext == 'sdf':
        return list(SDMolSupplier(filename))
    if ext == 'smi':
        parse = Chem.MolFromSmiles
    elif ext == 'inchi':
        parse = Chem.inchi.MolFromInchi
    else:
        raise ValueError('Unsupported input format: %r' % (ext,))
    with open(filename) as f:
        # splitlines(), unlike split('\n'), yields no empty trailing entry
        # for a file that ends with a newline; that entry would otherwise
        # be parsed as an additional, empty molecule
        lines = f.read().splitlines()
    return [parse(line, sanitize=True) for line in lines]
36 | |
37 | |
def mordred_descriptors(mols, output, header, use_3d):
    """
    Calculate Mordred descriptors and save as tabular

    mols -- list of molecules; None entries are treated as invalid
            molecules and written as rows of missing values
    output -- target handed to DataFrame.to_csv
    header -- passed to DataFrame.to_csv to control the header line
    use_3d -- if false, the calculator is told to ignore 3D descriptors
    """
    calc = Calculator(descriptors, ignore_3D=(not use_3d))
    # indices of invalid SMILES/SDMols, found with an identity check
    # instead of an elementwise numpy `== None` comparison
    invalid_mols = [idx for idx, mol in enumerate(mols) if mol is None]
    # replace invalid mols with a placeholder so every input keeps its row
    mols = [Chem.MolFromSmiles('') if mol is None else mol for mol in mols]
    df = calc.pandas(mols, quiet=True)  # calculate descriptors
    for idx in invalid_mols:  # remove placeholders
        df.iloc[idx] = np.nan
    df = df.applymap(convert_errors_to_nan)  # remove descriptors which errored
    df = df.round(6)
    df.to_csv(output, na_rep='', sep='\t', index=False, header=header)  # write output
51 | |
52 | |
if __name__ == "__main__":
    # Command-line entry point: read the molecules from --infile and write
    # the descriptor table to --outfile.
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
    # restricted to the formats mol_supplier handles, so that a missing or
    # mistyped format is reported by argparse before any work is done
    parser.add_argument("--iformat", required=True, choices=['sdf', 'smi', 'inchi'],
                        help="Specify the input file format.")

    # required: with no output file the result would be handed to
    # DataFrame.to_csv as None and never written anywhere
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), required=True,
                        help="Path to the result file")

    parser.add_argument("--3d", dest="use_3d", action="store_true",
                        default=False,
                        help="Use 3d descriptors - only with SDF input.")

    parser.add_argument("--header", dest="header", action="store_true",
                        default=False,
                        help="Write header line.")
    args = parser.parse_args()

    mols = mol_supplier(args.infile, args.iformat)
    mordred_descriptors(mols, args.outfile, args.header, args.use_3d)