Mercurial > repos > bgruening > sucos_clustering
comparison sucos.py @ 0:f80cfac80c53 draft
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author | bgruening |
---|---|
date | Wed, 02 Oct 2019 12:58:19 -0400 |
parents | |
children | 58d18838e244 |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:f80cfac80c53 |
---|---|
1 #!/usr/bin/env python | |
2 """ | |
3 Basic SuCOS scoring. Allows a set of molecules from a SD file to be overlayed to a reference molecule, | |
4 with the resulting scores being written as properties in the output SD file. | |
5 | |
6 SuCOS is the work of Susan Leung. | |
7 GitHub: https://github.com/susanhleung/SuCOS | |
8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | |
9 """ | |
10 | |
11 from __future__ import print_function | |
12 import argparse, os, sys, gzip | |
13 import numpy as np | |
14 from rdkit import Chem, rdBase, RDConfig | |
15 from rdkit.Chem import AllChem, rdShapeHelpers | |
16 from rdkit.Chem.FeatMaps import FeatMaps | |
17 import utils | |
18 | |
19 | |
20 ### start function definitions ######################################### | |
21 | |
22 # Setting up the features to use in FeatureMap | |
23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) | |
24 | |
25 fmParams = {} | |
26 for k in fdef.GetFeatureFamilies(): | |
27 fparams = FeatMaps.FeatMapParams() | |
28 fmParams[k] = fparams | |
29 | |
30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', | |
31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') | |
32 | |
33 def filterFeature(f): | |
34 result = f.GetFamily() in keep | |
35 # TODO - nothing ever seems to be filtered. Is this expected? | |
36 if not result: | |
37 utils.log("Filtered out feature type", f.GetFamily()) | |
38 return result | |
39 | |
40 def getRawFeatures(mol): | |
41 | |
42 rawFeats = fdef.GetFeaturesForMol(mol) | |
43 # filter that list down to only include the ones we're interested in | |
44 filtered = list(filter(filterFeature, rawFeats)) | |
45 return filtered | |
46 | |
47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | |
48 """ | |
49 Generate the feature map score. | |
50 | |
51 :param small_feats: | |
52 :param large_feats: | |
53 :param tani: | |
54 :return: | |
55 """ | |
56 | |
57 featLists = [] | |
58 for rawFeats in [small_feats, large_feats]: | |
59 # filter that list down to only include the ones we're interested in | |
60 featLists.append(rawFeats) | |
61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] | |
62 # set the score mode | |
63 fms[0].scoreMode = score_mode | |
64 | |
65 try: | |
66 if tani: | |
67 c = fms[0].ScoreFeats(featLists[1]) | |
68 A = fms[0].GetNumFeatures() | |
69 B = len(featLists[1]) | |
70 if B != fms[1].GetNumFeatures(): | |
71 utils.log("Why isn't B equal to number of features...?!") | |
72 tani_score = float(c) / (A+B-c) | |
73 return tani_score | |
74 else: | |
75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | |
76 return fm_score | |
77 except ZeroDivisionError: | |
78 utils.log("ZeroDivisionError") | |
79 return 0 | |
80 | |
81 if tani: | |
82 tani_score = float(c) / (A+B-c) | |
83 return tani_score | |
84 else: | |
85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | |
86 return fm_score | |
87 | |
88 | |
89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): | |
90 """ | |
91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. | |
92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having | |
93 to recalculate them. Use the getRawFeatures function to pre-calculate the features. | |
94 | |
95 :param ref_mol: The reference molecule to compare to | |
96 :param query_mol: The molecule to align to the reference | |
97 :param tani: Whether to calculate Tanimoto distances | |
98 :param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it. | |
99 :param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it. | |
100 :return: A tuple of 3 values. 1 the sucos score, 2 the feature map score, | |
101 3 the Tanimoto distance or 1 minus the protrude distance | |
102 """ | |
103 | |
104 if not ref_features: | |
105 ref_features = getRawFeatures(ref_mol) | |
106 if not query_features: | |
107 query_features = getRawFeatures(query_mol) | |
108 | |
109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) | |
110 fm_score = np.clip(fm_score, 0, 1) | |
111 | |
112 if tani: | |
113 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) | |
114 tani_sim = np.clip(tani_sim, 0, 1) | |
115 SuCOS_score = 0.5*fm_score + 0.5*tani_sim | |
116 return SuCOS_score, fm_score, tani_sim | |
117 else: | |
118 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) | |
119 protrude_dist = np.clip(protrude_dist, 0, 1) | |
120 protrude_val = 1.0 - protrude_dist | |
121 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val | |
122 return SuCOS_score, fm_score, protrude_val | |
123 | |
124 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, | |
125 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | |
126 | |
127 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) | |
128 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | |
129 ref_features = getRawFeatures(ref_mol) | |
130 | |
131 input_file = utils.open_file_for_reading(inputs_filename) | |
132 suppl = Chem.ForwardSDMolSupplier(input_file) | |
133 output_file = utils.open_file_for_writing(outputs_filename) | |
134 writer = Chem.SDWriter(output_file) | |
135 | |
136 count = 0 | |
137 total = 0 | |
138 errors = 0 | |
139 for mol in suppl: | |
140 count +=1 | |
141 if mol is None: | |
142 continue | |
143 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") | |
144 try: | |
145 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) | |
146 mol.SetDoubleProp("SuCOS_Score", sucos_score) | |
147 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) | |
148 if tani: | |
149 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) | |
150 else: | |
151 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) | |
152 utils.log("Scores:", sucos_score, fm_score, val3) | |
153 writer.write(mol) | |
154 total +=1 | |
155 except ValueError as e: | |
156 errors +=1 | |
157 utils.log("Molecule", count, "failed to score:", e.message) | |
158 | |
159 input_file.close() | |
160 writer.flush() | |
161 writer.close() | |
162 output_file.close() | |
163 | |
164 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") | |
165 | |
166 def parse_score_mode(value): | |
167 if value == None or value == 'all': | |
168 return FeatMaps.FeatMapScoreMode.All | |
169 elif value == 'closest': | |
170 return FeatMaps.FeatMapScoreMode.Closest | |
171 elif value == 'best': | |
172 return FeatMaps.FeatMapScoreMode.Best | |
173 else: | |
174 raise ValueError(value + " is not a valid scoring mode option") | |
175 | |
176 | |
177 ### start main execution ######################################### | |
178 | |
179 def main(): | |
180 | |
181 parser = argparse.ArgumentParser(description='SuCOS with RDKit') | |
182 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | |
183 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') | |
184 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + | |
185 "Only needed if files don't have the expected extensions") | |
186 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) | |
187 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | |
188 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') | |
189 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], | |
190 help="choose the scoring mode for the feature map, default is 'all'.") | |
191 | |
192 args = parser.parse_args() | |
193 utils.log("SuCOS Args: ", args) | |
194 | |
195 score_mode = parse_score_mode(args.score_mode) | |
196 | |
197 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, | |
198 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) | |
199 | |
200 | |
201 if __name__ == "__main__": | |
202 main() |