Mercurial > repos > bgruening > sucos_clustering
comparison sucos.py @ 6:b8725fec8c7b draft default tip
"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit 05dc325ce687441e5d3bdbdedcc0e3529cd5e070"
author | bgruening |
---|---|
date | Wed, 14 Apr 2021 09:30:48 +0000 |
parents | 58d18838e244 |
children |
comparison
equal
deleted
inserted
replaced
5:12725d4b90f3 | 6:b8725fec8c7b |
---|---|
7 GitHub: https://github.com/susanhleung/SuCOS | 7 GitHub: https://github.com/susanhleung/SuCOS |
8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 | 8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1 |
9 """ | 9 """ |
10 | 10 |
11 from __future__ import print_function | 11 from __future__ import print_function |
12 import argparse, os, sys, gzip | 12 |
13 import argparse | |
14 import os | |
15 | |
13 import numpy as np | 16 import numpy as np |
14 from rdkit import Chem, rdBase, RDConfig | 17 import utils |
18 from rdkit import Chem, RDConfig | |
15 from rdkit.Chem import AllChem, rdShapeHelpers | 19 from rdkit.Chem import AllChem, rdShapeHelpers |
16 from rdkit.Chem.FeatMaps import FeatMaps | 20 from rdkit.Chem.FeatMaps import FeatMaps |
17 import utils | 21 |
18 | 22 # start function definitions ######################################### |
19 | |
20 ### start function definitions ######################################### | |
21 | 23 |
22 # Setting up the features to use in FeatureMap | 24 # Setting up the features to use in FeatureMap |
23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef')) | 25 fdef = AllChem.BuildFeatureFactory( |
26 os.path.join(RDConfig.RDDataDir, "BaseFeatures.fdef") | |
27 ) | |
24 | 28 |
25 fmParams = {} | 29 fmParams = {} |
26 for k in fdef.GetFeatureFamilies(): | 30 for k in fdef.GetFeatureFamilies(): |
27 fparams = FeatMaps.FeatMapParams() | 31 fparams = FeatMaps.FeatMapParams() |
28 fmParams[k] = fparams | 32 fmParams[k] = fparams |
29 | 33 |
30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder', | 34 keep = ( |
31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe') | 35 "Donor", |
36 "Acceptor", | |
37 "NegIonizable", | |
38 "PosIonizable", | |
39 "ZnBinder", | |
40 "Aromatic", | |
41 "Hydrophobe", | |
42 "LumpedHydrophobe", | |
43 ) | |
44 | |
32 | 45 |
33 def filterFeature(f): | 46 def filterFeature(f): |
34 result = f.GetFamily() in keep | 47 result = f.GetFamily() in keep |
35 # TODO - nothing ever seems to be filtered. Is this expected? | 48 # TODO - nothing ever seems to be filtered. Is this expected? |
36 if not result: | 49 if not result: |
37 utils.log("Filtered out feature type", f.GetFamily()) | 50 utils.log("Filtered out feature type", f.GetFamily()) |
38 return result | 51 return result |
39 | 52 |
53 | |
40 def getRawFeatures(mol): | 54 def getRawFeatures(mol): |
41 | 55 |
42 rawFeats = fdef.GetFeaturesForMol(mol) | 56 rawFeats = fdef.GetFeaturesForMol(mol) |
43 # filter that list down to only include the ones we're interested in | 57 # filter that list down to only include the ones we're interested in |
44 filtered = list(filter(filterFeature, rawFeats)) | 58 filtered = list(filter(filterFeature, rawFeats)) |
45 return filtered | 59 return filtered |
46 | 60 |
47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | 61 |
62 def get_FeatureMapScore( | |
63 small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All | |
64 ): | |
48 """ | 65 """ |
49 Generate the feature map score. | 66 Generate the feature map score. |
50 | 67 |
51 :param small_feats: | 68 :param small_feats: |
52 :param large_feats: | 69 :param large_feats: |
56 | 73 |
57 featLists = [] | 74 featLists = [] |
58 for rawFeats in [small_feats, large_feats]: | 75 for rawFeats in [small_feats, large_feats]: |
59 # filter that list down to only include the ones we're interested in | 76 # filter that list down to only include the ones we're interested in |
60 featLists.append(rawFeats) | 77 featLists.append(rawFeats) |
61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists] | 78 fms = [ |
79 FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) | |
80 for x in featLists | |
81 ] | |
62 # set the score mode | 82 # set the score mode |
63 fms[0].scoreMode = score_mode | 83 fms[0].scoreMode = score_mode |
64 | 84 |
65 try: | 85 try: |
66 if tani: | 86 if tani: |
67 c = fms[0].ScoreFeats(featLists[1]) | 87 c = fms[0].ScoreFeats(featLists[1]) |
68 A = fms[0].GetNumFeatures() | 88 A = fms[0].GetNumFeatures() |
69 B = len(featLists[1]) | 89 B = len(featLists[1]) |
70 if B != fms[1].GetNumFeatures(): | 90 if B != fms[1].GetNumFeatures(): |
71 utils.log("Why isn't B equal to number of features...?!") | 91 utils.log("Why isn't B equal to number of features...?!") |
72 tani_score = float(c) / (A+B-c) | 92 tani_score = float(c) / (A + B - c) |
73 return tani_score | 93 return tani_score |
74 else: | 94 else: |
75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | 95 fm_score = fms[0].ScoreFeats(featLists[1]) / min( |
96 fms[0].GetNumFeatures(), len(featLists[1]) | |
97 ) | |
76 return fm_score | 98 return fm_score |
77 except ZeroDivisionError: | 99 except ZeroDivisionError: |
78 utils.log("ZeroDivisionError") | 100 utils.log("ZeroDivisionError") |
79 return 0 | 101 return 0 |
80 | 102 |
81 if tani: | 103 if tani: |
82 tani_score = float(c) / (A+B-c) | 104 tani_score = float(c) / (A + B - c) |
83 return tani_score | 105 return tani_score |
84 else: | 106 else: |
85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1])) | 107 fm_score = fms[0].ScoreFeats(featLists[1]) / min( |
108 fms[0].GetNumFeatures(), len(featLists[1]) | |
109 ) | |
86 return fm_score | 110 return fm_score |
87 | 111 |
88 | 112 |
89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All): | 113 def get_SucosScore( |
114 ref_mol, | |
115 query_mol, | |
116 tani=False, | |
117 ref_features=None, | |
118 query_features=None, | |
119 score_mode=FeatMaps.FeatMapScoreMode.All, | |
120 ): | |
90 """ | 121 """ |
91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. | 122 This is the key function that calculates the SuCOS scores and is expected to be called from other modules. |
92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having | 123 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having |
93 to recalculate them. Use the getRawFeatures function to pre-calculate the features. | 124 to recalculate them. Use the getRawFeatures function to pre-calculate the features. |
94 | 125 |
105 ref_features = getRawFeatures(ref_mol) | 136 ref_features = getRawFeatures(ref_mol) |
106 if not query_features: | 137 if not query_features: |
107 query_features = getRawFeatures(query_mol) | 138 query_features = getRawFeatures(query_mol) |
108 | 139 |
109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) | 140 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode) |
110 fm_score = np.clip(fm_score, 0, 1) | 141 fm_score = np.float(np.clip(fm_score, 0, 1)) |
111 | 142 |
112 try : | 143 try: |
113 if tani: | 144 if tani: |
114 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) | 145 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol)) |
115 tani_sim = np.clip(tani_sim, 0, 1) | 146 tani_sim = np.clip(tani_sim, 0, 1) |
116 SuCOS_score = 0.5*fm_score + 0.5*tani_sim | 147 SuCOS_score = 0.5 * fm_score + 0.5 * tani_sim |
117 return SuCOS_score, fm_score, tani_sim | 148 return SuCOS_score, fm_score, tani_sim |
118 else: | 149 else: |
119 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False) | 150 protrude_dist = rdShapeHelpers.ShapeProtrudeDist( |
151 ref_mol, query_mol, allowReordering=False | |
152 ) | |
120 protrude_dist = np.clip(protrude_dist, 0, 1) | 153 protrude_dist = np.clip(protrude_dist, 0, 1) |
121 protrude_val = 1.0 - protrude_dist | 154 protrude_val = 1.0 - protrude_dist |
122 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val | 155 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val |
123 return SuCOS_score, fm_score, protrude_val | 156 return SuCOS_score, fm_score, protrude_val |
124 except: | 157 except Exception: |
125 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0") | 158 utils.log("Failed to calculate SuCOS scores. Returning 0,0,0") |
126 return 0, 0, 0 | 159 return 0, 0, 0 |
127 | 160 |
128 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None, | 161 |
129 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All): | 162 def process( |
130 | 163 refmol_filename, |
131 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format) | 164 inputs_filename, |
132 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | 165 outputs_filename, |
166 refmol_index=None, | |
167 refmol_format=None, | |
168 tani=False, | |
169 score_mode=FeatMaps.FeatMapScoreMode.All, | |
170 ): | |
171 | |
172 ref_mol = utils.read_single_molecule( | |
173 refmol_filename, index=refmol_index, format=refmol_format | |
174 ) | |
175 # utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms") | |
133 ref_features = getRawFeatures(ref_mol) | 176 ref_features = getRawFeatures(ref_mol) |
134 | 177 |
135 input_file = utils.open_file_for_reading(inputs_filename) | 178 input_file = utils.open_file_for_reading(inputs_filename) |
136 suppl = Chem.ForwardSDMolSupplier(input_file) | 179 suppl = Chem.ForwardSDMolSupplier(input_file) |
137 output_file = utils.open_file_for_writing(outputs_filename) | 180 output_file = utils.open_file_for_writing(outputs_filename) |
139 | 182 |
140 count = 0 | 183 count = 0 |
141 total = 0 | 184 total = 0 |
142 errors = 0 | 185 errors = 0 |
143 for mol in suppl: | 186 for mol in suppl: |
144 count +=1 | 187 count += 1 |
145 if mol is None: | 188 if mol is None: |
146 continue | 189 continue |
147 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") | 190 # utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms") |
148 try: | 191 try: |
149 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode) | 192 sucos_score, fm_score, val3 = get_SucosScore( |
193 ref_mol, | |
194 mol, | |
195 tani=tani, | |
196 ref_features=ref_features, | |
197 score_mode=score_mode, | |
198 ) | |
150 mol.SetDoubleProp("SuCOS_Score", sucos_score) | 199 mol.SetDoubleProp("SuCOS_Score", sucos_score) |
151 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) | 200 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score) |
152 if tani: | 201 if tani: |
153 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) | 202 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3) |
154 else: | 203 else: |
155 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) | 204 mol.SetDoubleProp("SuCOS_Protrude_Score", val3) |
156 utils.log("Scores:", sucos_score, fm_score, val3) | 205 utils.log("Scores:", sucos_score, fm_score, val3) |
157 writer.write(mol) | 206 writer.write(mol) |
158 total +=1 | 207 total += 1 |
159 except ValueError as e: | 208 except ValueError as e: |
160 errors +=1 | 209 errors += 1 |
161 utils.log("Molecule", count, "failed to score:", e.message) | 210 utils.log("Molecule", count, "failed to score:", e.message) |
162 | 211 |
163 input_file.close() | 212 input_file.close() |
164 writer.flush() | 213 writer.flush() |
165 writer.close() | 214 writer.close() |
166 output_file.close() | 215 output_file.close() |
167 | 216 |
168 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors") | 217 utils.log( |
218 "Completed.", total, "processed, ", count, "succeeded, ", errors, "errors" | |
219 ) | |
220 | |
169 | 221 |
170 def parse_score_mode(value): | 222 def parse_score_mode(value): |
171 if value == None or value == 'all': | 223 if value is None or value == "all": |
172 return FeatMaps.FeatMapScoreMode.All | 224 return FeatMaps.FeatMapScoreMode.All |
173 elif value == 'closest': | 225 elif value == "closest": |
174 return FeatMaps.FeatMapScoreMode.Closest | 226 return FeatMaps.FeatMapScoreMode.Closest |
175 elif value == 'best': | 227 elif value == "best": |
176 return FeatMaps.FeatMapScoreMode.Best | 228 return FeatMaps.FeatMapScoreMode.Best |
177 else: | 229 else: |
178 raise ValueError(value + " is not a valid scoring mode option") | 230 raise ValueError(value + " is not a valid scoring mode option") |
179 | 231 |
180 | 232 |
181 ### start main execution ######################################### | 233 # start main execution ######################################### |
234 | |
182 | 235 |
183 def main(): | 236 def main(): |
184 | 237 |
185 parser = argparse.ArgumentParser(description='SuCOS with RDKit') | 238 parser = argparse.ArgumentParser(description="SuCOS with RDKit") |
186 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).') | 239 parser.add_argument( |
187 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format') | 240 "-i", "--input", help="Input file in SDF format. Can be gzipped (*.gz)." |
188 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " + | 241 ) |
189 "Only needed if files don't have the expected extensions") | 242 parser.add_argument( |
190 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1) | 243 "-r", |
191 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).') | 244 "--refmol", |
192 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score') | 245 help="Molecule to compare against in Molfile (.mol) or SDF (.sdf) format", |
193 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'], | 246 ) |
194 help="choose the scoring mode for the feature map, default is 'all'.") | 247 parser.add_argument( |
248 "--refmol-format", | |
249 help="Format for the reference molecule (mol or sdf). " | |
250 + "Only needed if files don't have the expected extensions", | |
251 ) | |
252 parser.add_argument( | |
253 "--refmolidx", | |
254 help="Reference molecule index in SD file if not the first", | |
255 type=int, | |
256 default=1, | |
257 ) | |
258 parser.add_argument( | |
259 "-o", "--output", help="Output file in SDF format. Can be gzipped (*.gz)." | |
260 ) | |
261 parser.add_argument( | |
262 "--tanimoto", action="store_true", help="Include Tanimoto distance in score" | |
263 ) | |
264 parser.add_argument( | |
265 "--score_mode", | |
266 choices=["all", "closest", "best"], | |
267 help="choose the scoring mode for the feature map, default is 'all'.", | |
268 ) | |
195 | 269 |
196 args = parser.parse_args() | 270 args = parser.parse_args() |
197 utils.log("SuCOS Args: ", args) | 271 utils.log("SuCOS Args: ", args) |
198 | 272 |
199 score_mode = parse_score_mode(args.score_mode) | 273 score_mode = parse_score_mode(args.score_mode) |
200 | 274 |
201 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx, | 275 process( |
202 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode) | 276 args.refmol, |
277 args.input, | |
278 args.output, | |
279 refmol_index=args.refmolidx, | |
280 refmol_format=args.refmol_format, | |
281 tani=args.tanimoto, | |
282 score_mode=score_mode, | |
283 ) | |
203 | 284 |
204 | 285 |
205 if __name__ == "__main__": | 286 if __name__ == "__main__": |
206 main() | 287 main() |