comparison sucos.py @ 0:f80cfac80c53 draft

"planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/sucos commit ef86cfa5f7ab5043de420511211579d03df58645"
author bgruening
date Wed, 02 Oct 2019 12:58:19 -0400
parents
children 58d18838e244
comparison
equal deleted inserted replaced
-1:000000000000 0:f80cfac80c53
1 #!/usr/bin/env python
2 """
3 Basic SuCOS scoring. Allows a set of molecules from a SD file to be overlayed to a reference molecule,
4 with the resulting scores being written as properties in the output SD file.
5
6 SuCOS is the work of Susan Leung.
7 GitHub: https://github.com/susanhleung/SuCOS
8 Publication: https://doi.org/10.26434/chemrxiv.8100203.v1
9 """
10
11 from __future__ import print_function
12 import argparse, os, sys, gzip
13 import numpy as np
14 from rdkit import Chem, rdBase, RDConfig
15 from rdkit.Chem import AllChem, rdShapeHelpers
16 from rdkit.Chem.FeatMaps import FeatMaps
17 import utils
18
19
20 ### start function definitions #########################################
21
22 # Setting up the features to use in FeatureMap
23 fdef = AllChem.BuildFeatureFactory(os.path.join(RDConfig.RDDataDir, 'BaseFeatures.fdef'))
24
25 fmParams = {}
26 for k in fdef.GetFeatureFamilies():
27 fparams = FeatMaps.FeatMapParams()
28 fmParams[k] = fparams
29
30 keep = ('Donor', 'Acceptor', 'NegIonizable', 'PosIonizable', 'ZnBinder',
31 'Aromatic', 'Hydrophobe', 'LumpedHydrophobe')
32
33 def filterFeature(f):
34 result = f.GetFamily() in keep
35 # TODO - nothing ever seems to be filtered. Is this expected?
36 if not result:
37 utils.log("Filtered out feature type", f.GetFamily())
38 return result
39
40 def getRawFeatures(mol):
41
42 rawFeats = fdef.GetFeaturesForMol(mol)
43 # filter that list down to only include the ones we're interested in
44 filtered = list(filter(filterFeature, rawFeats))
45 return filtered
46
47 def get_FeatureMapScore(small_feats, large_feats, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All):
48 """
49 Generate the feature map score.
50
51 :param small_feats:
52 :param large_feats:
53 :param tani:
54 :return:
55 """
56
57 featLists = []
58 for rawFeats in [small_feats, large_feats]:
59 # filter that list down to only include the ones we're interested in
60 featLists.append(rawFeats)
61 fms = [FeatMaps.FeatMap(feats=x, weights=[1] * len(x), params=fmParams) for x in featLists]
62 # set the score mode
63 fms[0].scoreMode = score_mode
64
65 try:
66 if tani:
67 c = fms[0].ScoreFeats(featLists[1])
68 A = fms[0].GetNumFeatures()
69 B = len(featLists[1])
70 if B != fms[1].GetNumFeatures():
71 utils.log("Why isn't B equal to number of features...?!")
72 tani_score = float(c) / (A+B-c)
73 return tani_score
74 else:
75 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1]))
76 return fm_score
77 except ZeroDivisionError:
78 utils.log("ZeroDivisionError")
79 return 0
80
81 if tani:
82 tani_score = float(c) / (A+B-c)
83 return tani_score
84 else:
85 fm_score = fms[0].ScoreFeats(featLists[1]) / min(fms[0].GetNumFeatures(), len(featLists[1]))
86 return fm_score
87
88
89 def get_SucosScore(ref_mol, query_mol, tani=False, ref_features=None, query_features=None, score_mode=FeatMaps.FeatMapScoreMode.All):
90 """
91 This is the key function that calculates the SuCOS scores and is expected to be called from other modules.
92 To improve performance you can pre-calculate the features and pass them in as optional parameters to avoid having
93 to recalculate them. Use the getRawFeatures function to pre-calculate the features.
94
95 :param ref_mol: The reference molecule to compare to
96 :param query_mol: The molecule to align to the reference
97 :param tani: Whether to calculate Tanimoto distances
98 :param ref_features: An optional feature map for the reference molecule, avoiding the need to re-calculate it.
99 :param query_features: An optional feature map for the query molecule, avoiding the need to re-calculate it.
100 :return: A tuple of 3 values. 1 the sucos score, 2 the feature map score,
101 3 the Tanimoto distance or 1 minus the protrude distance
102 """
103
104 if not ref_features:
105 ref_features = getRawFeatures(ref_mol)
106 if not query_features:
107 query_features = getRawFeatures(query_mol)
108
109 fm_score = get_FeatureMapScore(ref_features, query_features, tani, score_mode)
110 fm_score = np.clip(fm_score, 0, 1)
111
112 if tani:
113 tani_sim = 1 - float(rdShapeHelpers.ShapeTanimotoDist(ref_mol, query_mol))
114 tani_sim = np.clip(tani_sim, 0, 1)
115 SuCOS_score = 0.5*fm_score + 0.5*tani_sim
116 return SuCOS_score, fm_score, tani_sim
117 else:
118 protrude_dist = rdShapeHelpers.ShapeProtrudeDist(ref_mol, query_mol, allowReordering=False)
119 protrude_dist = np.clip(protrude_dist, 0, 1)
120 protrude_val = 1.0 - protrude_dist
121 SuCOS_score = 0.5 * fm_score + 0.5 * protrude_val
122 return SuCOS_score, fm_score, protrude_val
123
124 def process(refmol_filename, inputs_filename, outputs_filename, refmol_index=None,
125 refmol_format=None, tani=False, score_mode=FeatMaps.FeatMapScoreMode.All):
126
127 ref_mol = utils.read_single_molecule(refmol_filename, index=refmol_index, format=refmol_format)
128 #utils.log("Reference mol has", ref_mol.GetNumHeavyAtoms(), "heavy atoms")
129 ref_features = getRawFeatures(ref_mol)
130
131 input_file = utils.open_file_for_reading(inputs_filename)
132 suppl = Chem.ForwardSDMolSupplier(input_file)
133 output_file = utils.open_file_for_writing(outputs_filename)
134 writer = Chem.SDWriter(output_file)
135
136 count = 0
137 total = 0
138 errors = 0
139 for mol in suppl:
140 count +=1
141 if mol is None:
142 continue
143 #utils.log("Mol has", str(mol.GetNumHeavyAtoms()), "heavy atoms")
144 try:
145 sucos_score, fm_score, val3 = get_SucosScore(ref_mol, mol, tani=tani, ref_features=ref_features, score_mode=score_mode)
146 mol.SetDoubleProp("SuCOS_Score", sucos_score)
147 mol.SetDoubleProp("SuCOS_FeatureMap_Score", fm_score)
148 if tani:
149 mol.SetDoubleProp("SuCOS_Tanimoto_Score", val3)
150 else:
151 mol.SetDoubleProp("SuCOS_Protrude_Score", val3)
152 utils.log("Scores:", sucos_score, fm_score, val3)
153 writer.write(mol)
154 total +=1
155 except ValueError as e:
156 errors +=1
157 utils.log("Molecule", count, "failed to score:", e.message)
158
159 input_file.close()
160 writer.flush()
161 writer.close()
162 output_file.close()
163
164 utils.log("Completed.", total, "processed, ", count, "succeeded, ", errors, "errors")
165
166 def parse_score_mode(value):
167 if value == None or value == 'all':
168 return FeatMaps.FeatMapScoreMode.All
169 elif value == 'closest':
170 return FeatMaps.FeatMapScoreMode.Closest
171 elif value == 'best':
172 return FeatMaps.FeatMapScoreMode.Best
173 else:
174 raise ValueError(value + " is not a valid scoring mode option")
175
176
177 ### start main execution #########################################
178
179 def main():
180
181 parser = argparse.ArgumentParser(description='SuCOS with RDKit')
182 parser.add_argument('-i', '--input', help='Input file in SDF format. Can be gzipped (*.gz).')
183 parser.add_argument('-r', '--refmol', help='Molecule to compare against in Molfile (.mol) or SDF (.sdf) format')
184 parser.add_argument('--refmol-format', help="Format for the reference molecule (mol or sdf). " +
185 "Only needed if files don't have the expected extensions")
186 parser.add_argument('--refmolidx', help='Reference molecule index in SD file if not the first', type=int, default=1)
187 parser.add_argument('-o', '--output', help='Output file in SDF format. Can be gzipped (*.gz).')
188 parser.add_argument('--tanimoto', action='store_true', help='Include Tanimoto distance in score')
189 parser.add_argument('--score_mode', choices=['all', 'closest', 'best'],
190 help="choose the scoring mode for the feature map, default is 'all'.")
191
192 args = parser.parse_args()
193 utils.log("SuCOS Args: ", args)
194
195 score_mode = parse_score_mode(args.score_mode)
196
197 process(args.refmol, args.input, args.output, refmol_index=args.refmolidx,
198 refmol_format=args.refmol_format, tani=args.tanimoto, score_mode=score_mode)
199
200
201 if __name__ == "__main__":
202 main()