annotate mordred_descriptors.py @ 3:cc0f89287ecf draft default tip

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 944ea4bb8a9cd4244152a4a4fecd0485fabc2ad0"
author bgruening
date Tue, 28 Jul 2020 08:31:30 -0400
parents d074b0c2b54f
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
0
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
1 import argparse
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
2 import numpy as np
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
3 import pandas as pd
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
4 from mordred import Calculator, descriptors
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
5 from mordred.error import Missing, Error
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
6 from rdkit import Chem
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
7 from rdkit.Chem.rdmolfiles import SDMolSupplier, SmilesMolSupplier
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
8
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
9
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
def convert_errors_to_nan(el):
    """
    Normalise a single cell of the Mordred dataframe to a plain number.

    Booleans (Python ``bool`` or ``numpy.bool_``) are returned as ``int``
    (0 or 1).  Real numeric values (Python ``int``/``float`` or any NumPy
    integer/floating scalar) are returned unchanged.  Any other object
    (for example a string or an error placeholder) is replaced by ``None``.

    :param el: one dataframe element
    :return: an ``int`` for booleans, the original value for numbers,
             ``None`` for everything else
    """
    # Booleans are checked first because ``bool`` is a subclass of ``int``
    # and would otherwise be returned as True/False instead of 1/0.
    if isinstance(el, (bool, np.bool_)):
        return int(el)
    # isinstance (rather than an exact type match) keeps NumPy scalars such
    # as int64 or float32 instead of silently discarding them.
    if isinstance(el, (int, float, np.integer, np.floating)):
        return el
    return None
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
20
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
21
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
def mol_supplier(filename, ext):
    """
    Load a chemical data file containing multiple molecules and return a
    list of RDKit Mol objects.

    SDF files are read with RDKit's ``SDMolSupplier``.  SMILES and InChI
    files are read as text, one molecule per non-empty line, and each line
    is handed to the matching RDKit parser.  The parsers' results are
    returned as-is, so the list may contain ``None`` for records RDKit
    could not interpret (NOTE(review): confirm against the RDKit version
    in use).

    :param filename: path to the input file
    :param ext: input format, one of ``'sdf'``, ``'smi'`` or ``'inchi'``
    :return: list with one entry per molecule record
    :raises ValueError: if ``ext`` is not one of the supported formats
    """
    if ext == 'sdf':
        return list(SDMolSupplier(filename))

    # Pick the line parser up front so an unsupported format is reported
    # clearly instead of the function silently returning None.
    if ext == 'smi':
        parse = Chem.MolFromSmiles
    elif ext == 'inchi':
        parse = Chem.inchi.MolFromInchi
    else:
        raise ValueError(
            "Unsupported input format %r; expected 'sdf', 'smi' or 'inchi'." % (ext,)
        )

    mols = []
    with open(filename) as f:
        for line in f:
            record = line.rstrip('\n')
            if record != '':  # skip blank lines, e.g. a trailing newline
                mols.append(parse(record, sanitize=True))
    return mols
0
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
36
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
37
2
d074b0c2b54f "planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 7efc367809c29ff5939ea971bd00c69b7f9f5903"
bgruening
parents: 1
diff changeset
def mordred_descriptors(mols, output, header, use_3d, smi_as_col):
    """
    Calculate Mordred descriptors and save as tabular

    :param mols: list of RDKit Mol objects; ``None`` entries are treated as
                 invalid molecules and produce a row of empty descriptor
                 values
    :param output: path or writable file object handed to
                   ``DataFrame.to_csv``
    :param header: whether to write the header line
    :param use_3d: whether 3D descriptors are calculated (passed to the
                   Mordred ``Calculator`` as ``ignore_3D=not use_3d``)
    :param smi_as_col: whether to append a ``SMILES`` column generated with
                       ``Chem.MolToSmiles``
    """
    calc = Calculator(descriptors, ignore_3D=(not use_3d))

    # Row positions of invalid molecules.  A plain identity scan avoids
    # building a NumPy object array and comparing it with ``== None``.
    invalid_mols = [idx for idx, mol in enumerate(mols) if mol is None]

    # Replace invalid molecules with an empty placeholder so that every
    # input record still yields exactly one output row.
    mols = [Chem.MolFromSmiles('') if mol is None else mol for mol in mols]

    df = calc.pandas(mols, quiet=True)  # calculate descriptors

    for row in invalid_mols:  # blank out the placeholder rows
        df.iloc[row] = np.nan

    # DataFrame.applymap is deprecated since pandas 2.1 in favour of
    # DataFrame.map; check the class (not the instance) so a column that
    # happens to be called "map" cannot be mistaken for the method.
    elementwise = df.map if hasattr(pd.DataFrame, 'map') else df.applymap
    df = elementwise(convert_errors_to_nan)  # remove descriptors which errored
    df = df.round(6)

    if smi_as_col:
        df['SMILES'] = [Chem.MolToSmiles(mol) for mol in mols]

    df.to_csv(output, na_rep='', sep='\t', index=False, header=header)  # write output
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
55
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
56
ea68b86303e2 planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/mordred commit 4ce352d9c9f3f1808e2ab6c019c534fd3e805959
bgruening
parents:
diff changeset
def main():
    """
    Command-line entry point: parse the arguments, load the molecules from
    the input file and write the descriptor table to the output file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--infile', required=True, help='Path to the input file.')
    # The format drives which reader is used, so it must be given and must
    # be one of the formats the loader understands.
    parser.add_argument("--iformat", required=True,
                        choices=('sdf', 'smi', 'inchi'),
                        help="Specify the input file format.")

    # Without an output file the computed table would be discarded, so the
    # option is mandatory.
    parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
                        required=True,
                        help="Path to the result file")

    parser.add_argument("--3d", dest="use_3d", action="store_true",
                        default=False,
                        help="Use 3d descriptors - only with SDF input.")

    parser.add_argument("--header", dest="header", action="store_true",
                        default=False,
                        help="Write header line.")

    parser.add_argument("--smiles", dest="smiles", action="store_true",
                        default=False,
                        help="Add a column with compound SMILES.")
    args = parser.parse_args()

    # argparse has already opened the output file; the context manager
    # makes sure it is flushed and closed even if the calculation fails.
    with args.outfile as outfile:
        mols = mol_supplier(args.infile, args.iformat)
        mordred_descriptors(mols, outfile, args.header, args.use_3d, args.smiles)


if __name__ == "__main__":
    main()