Mercurial > repos > bgruening > qed
comparison qed.py @ 0:5ccd3a432785 draft
planemo upload for repository https://github.com/bgruening/galaxytools/tree/master/chemicaltoolbox/silicos-it/qed commit 4379e712f76f2bb12ee2cc270dd8a0e806df2cd6
author | bgruening |
---|---|
date | Tue, 23 May 2017 03:57:14 -0400 |
parents | |
children | ab73abead7fa |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:5ccd3a432785 |
---|---|
1 #!/usr/bin/env python | |
2 __all__ = ['weights_max', 'weights_mean', 'weights_none', 'default'] | |
3 | |
4 # RDKit | |
5 from rdkit.Chem import Descriptors | |
6 from rdkit import Chem | |
7 | |
8 # General | |
9 from copy import deepcopy | |
10 from math import exp, log | |
11 import sys, os, re | |
12 import argparse | |
13 | |
14 | |
15 class SilicosItError(Exception): | |
16 """Base class for exceptions in Silicos-it code""" | |
17 pass | |
18 | |
19 class WrongArgument(SilicosItError): | |
20 """ | |
21 Exception raised when argument to function is not of correct type. | |
22 | |
23 Attributes: | |
24 function -- function in which error occurred | |
25 msg -- explanation of the error | |
26 """ | |
27 def __init__(self, function, msg): | |
28 self.function = function | |
29 self.msg = msg | |
30 | |
31 def check_filetype(filepath): | |
32 mol = False | |
33 possible_inchi = True | |
34 for line_counter, line in enumerate(open(filepath)): | |
35 if line_counter > 10000: | |
36 break | |
37 if line.find('$$$$') != -1: | |
38 return 'sdf' | |
39 elif line.find('@<TRIPOS>MOLECULE') != -1: | |
40 return 'mol2' | |
41 elif line.find('ligand id') != -1: | |
42 return 'drf' | |
43 elif possible_inchi and re.findall('^InChI=', line): | |
44 return 'inchi' | |
45 elif re.findall('^M\s+END', line): | |
46 mol = True | |
47 # first line is not an InChI, so it can't be an InChI file | |
48 possible_inchi = False | |
49 | |
50 if mol: | |
51 # END can occures before $$$$, so and SDF file will | |
52 # be recognised as mol, if you not using this hack' | |
53 return 'mol' | |
54 return 'smi' | |
55 | |
56 AliphaticRings = Chem.MolFromSmarts('[$([A;R][!a])]') | |
57 | |
58 AcceptorSmarts = [ | |
59 '[oH0;X2]', | |
60 '[OH1;X2;v2]', | |
61 '[OH0;X2;v2]', | |
62 '[OH0;X1;v2]', | |
63 '[O-;X1]', | |
64 '[SH0;X2;v2]', | |
65 '[SH0;X1;v2]', | |
66 '[S-;X1]', | |
67 '[nH0;X2]', | |
68 '[NH0;X1;v3]', | |
69 '[$([N;+0;X3;v3]);!$(N[C,S]=O)]' | |
70 ] | |
71 Acceptors = [] | |
72 for hba in AcceptorSmarts: | |
73 Acceptors.append(Chem.MolFromSmarts(hba)) | |
74 | |
75 StructuralAlertSmarts = [ | |
76 '*1[O,S,N]*1', | |
77 '[S,C](=[O,S])[F,Br,Cl,I]', | |
78 '[CX4][Cl,Br,I]', | |
79 '[C,c]S(=O)(=O)O[C,c]', | |
80 '[$([CH]),$(CC)]#CC(=O)[C,c]', | |
81 '[$([CH]),$(CC)]#CC(=O)O[C,c]', | |
82 'n[OH]', | |
83 '[$([CH]),$(CC)]#CS(=O)(=O)[C,c]', | |
84 'C=C(C=O)C=O', | |
85 'n1c([F,Cl,Br,I])cccc1', | |
86 '[CH1](=O)', | |
87 '[O,o][O,o]', | |
88 '[C;!R]=[N;!R]', | |
89 '[N!R]=[N!R]', | |
90 '[#6](=O)[#6](=O)', | |
91 '[S,s][S,s]', | |
92 '[N,n][NH2]', | |
93 'C(=O)N[NH2]', | |
94 '[C,c]=S', | |
95 '[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]=[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]', | |
96 'C1(=[O,N])C=CC(=[O,N])C=C1', | |
97 'C1(=[O,N])C(=[O,N])C=CC=C1', | |
98 'a21aa3a(aa1aaaa2)aaaa3', | |
99 'a31a(a2a(aa1)aaaa2)aaaa3', | |
100 'a1aa2a3a(a1)A=AA=A3=AA=A2', | |
101 'c1cc([NH2])ccc1', | |
102 '[Hg,Fe,As,Sb,Zn,Se,se,Te,B,Si,Na,Ca,Ge,Ag,Mg,K,Ba,Sr,Be,Ti,Mo,Mn,Ru,Pd,Ni,Cu,Au,Cd,Al,Ga,Sn,Rh,Tl,Bi,Nb,Li,Pb,Hf,Ho]', | |
103 'I', | |
104 'OS(=O)(=O)[O-]', | |
105 '[N+](=O)[O-]', | |
106 'C(=O)N[OH]', | |
107 'C1NC(=O)NC(=O)1', | |
108 '[SH]', | |
109 '[S-]', | |
110 'c1ccc([Cl,Br,I,F])c([Cl,Br,I,F])c1[Cl,Br,I,F]', | |
111 'c1cc([Cl,Br,I,F])cc([Cl,Br,I,F])c1[Cl,Br,I,F]', | |
112 '[CR1]1[CR1][CR1][CR1][CR1][CR1][CR1]1', | |
113 '[CR1]1[CR1][CR1]cc[CR1][CR1]1', | |
114 '[CR2]1[CR2][CR2][CR2][CR2][CR2][CR2][CR2]1', | |
115 '[CR2]1[CR2][CR2]cc[CR2][CR2][CR2]1', | |
116 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1', | |
117 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1', | |
118 'C#C', | |
119 '[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]', | |
120 '[$([N+R]),$([n+R]),$([N+]=C)][O-]', | |
121 '[C,c]=N[OH]', | |
122 '[C,c]=NOC=O', | |
123 '[C,c](=O)[CX4,CR0X3,O][C,c](=O)', | |
124 'c1ccc2c(c1)ccc(=O)o2', | |
125 '[O+,o+,S+,s+]', | |
126 'N=C=O', | |
127 '[NX3,NX4][F,Cl,Br,I]', | |
128 'c1ccccc1OC(=O)[#6]', | |
129 '[CR0]=[CR0][CR0]=[CR0]', | |
130 '[C+,c+,C-,c-]', | |
131 'N=[N+]=[N-]', | |
132 'C12C(NC(N1)=O)CSC2', | |
133 'c1c([OH])c([OH,NH2,NH])ccc1', | |
134 'P', | |
135 '[N,O,S]C#N', | |
136 'C=C=O', | |
137 '[Si][F,Cl,Br,I]', | |
138 '[SX2]O', | |
139 '[SiR0,CR0](c1ccccc1)(c2ccccc2)(c3ccccc3)', | |
140 'O1CCCCC1OC2CCC3CCCCC3C2', | |
141 'N=[CR0][N,n,O,S]', | |
142 '[cR2]1[cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2][cR2]1[cR2]2[cR2][cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2]2', | |
143 'C=[C!r]C#N', | |
144 '[cR2]1[cR2]c([N+0X3R0,nX3R0])c([N+0X3R0,nX3R0])[cR2][cR2]1', | |
145 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2]c([N+0X3R0,nX3R0])[cR2]1', | |
146 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2][cR2]c1([N+0X3R0,nX3R0])', | |
147 '[OH]c1ccc([OH,NH2,NH])cc1', | |
148 'c1ccccc1OC(=O)O', | |
149 '[SX2H0][N]', | |
150 'c12ccccc1(SC(S)=N2)', | |
151 'c12ccccc1(SC(=S)N2)', | |
152 'c1nnnn1C=O', | |
153 's1c(S)nnc1NC=O', | |
154 'S1C=CSC1=S', | |
155 'C(=O)Onnn', | |
156 'OS(=O)(=O)C(F)(F)F', | |
157 'N#CC[OH]', | |
158 'N#CC(=O)', | |
159 'S(=O)(=O)C#N', | |
160 'N[CH2]C#N', | |
161 'C1(=O)NCC1', | |
162 'S(=O)(=O)[O-,OH]', | |
163 'NC[F,Cl,Br,I]', | |
164 'C=[C!r]O', | |
165 '[NX2+0]=[O+0]', | |
166 '[OR0,NR0][OR0,NR0]', | |
167 'C(=O)O[C,H1].C(=O)O[C,H1].C(=O)O[C,H1]', | |
168 '[CX2R0][NX3R0]', | |
169 'c1ccccc1[C;!R]=[C;!R]c2ccccc2', | |
170 '[NX3R0,NX4R0,OR0,SX2R0][CX4][NX3R0,NX4R0,OR0,SX2R0]', | |
171 '[s,S,c,C,n,N,o,O]~[n+,N+](~[s,S,c,C,n,N,o,O])(~[s,S,c,C,n,N,o,O])~[s,S,c,C,n,N,o,O]', | |
172 '[s,S,c,C,n,N,o,O]~[nX3+,NX3+](~[s,S,c,C,n,N])~[s,S,c,C,n,N]', | |
173 '[*]=[N+]=[*]', | |
174 '[SX3](=O)[O-,OH]', | |
175 'N#N', | |
176 'F.F.F.F', | |
177 '[R0;D2][R0;D2][R0;D2][R0;D2]', | |
178 '[cR,CR]~C(=O)NC(=O)~[cR,CR]', | |
179 'C=!@CC=[O,S]', | |
180 '[#6,#8,#16][C,c](=O)O[C,c]', | |
181 'c[C;R0](=[O,S])[C,c]', | |
182 'c[SX2][C;!R]', | |
183 'C=C=C', | |
184 'c1nc([F,Cl,Br,I,S])ncc1', | |
185 'c1ncnc([F,Cl,Br,I,S])c1', | |
186 'c1nc(c2c(n1)nc(n2)[F,Cl,Br,I])', | |
187 '[C,c]S(=O)(=O)c1ccc(cc1)F', | |
188 '[15N]', | |
189 '[13C]', | |
190 '[18O]', | |
191 '[34S]' | |
192 ] | |
193 | |
194 StructuralAlerts = [] | |
195 for smarts in StructuralAlertSmarts: | |
196 StructuralAlerts.append(Chem.MolFromSmarts(smarts)) | |
197 | |
198 | |
199 # ADS parameters for the 8 molecular properties: [row][column] | |
200 # rows[8]: MW, ALOGP, HBA, HBD, PSA, ROTB, AROM, ALERTS | |
201 # columns[7]: A, B, C, D, E, F, DMAX | |
202 # ALOGP parameters from Gregory Gerebtzoff (2012, Roche) | |
203 pads1 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561], | |
204 [0.486849448, 186.2293718, 2.066177165, 3.902720615, 1.027025453, 0.913012565, 145.4314800], | |
205 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046], | |
206 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616], | |
207 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167], | |
208 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403], | |
209 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610], | |
210 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ] | |
211 # ALOGP parameters from the original publication | |
212 pads2 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561], | |
213 [3.172690585, 137.8624751, 2.534937431, 4.581497897, 0.822739154, 0.576295591, 131.3186604], | |
214 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046], | |
215 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616], | |
216 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167], | |
217 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403], | |
218 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610], | |
219 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ] | |
220 | |
221 def ads(x, a, b, c, d, e, f, dmax): | |
222 return ((a+(b/(1+exp(-1*(x-c+d/2)/e))*(1-1/(1+exp(-1*(x-c-d/2)/f))))) / dmax) | |
223 | |
224 def properties(mol): | |
225 """ | |
226 Calculates the properties that are required to calculate the QED descriptor. | |
227 """ | |
228 matches = [] | |
229 if mol is None: | |
230 raise WrongArgument("properties(mol)", "mol argument is \'None\'") | |
231 x = [0] * 9 | |
232 x[0] = Descriptors.MolWt(mol) # MW | |
233 x[1] = Descriptors.MolLogP(mol) # ALOGP | |
234 for hba in Acceptors: # HBA | |
235 if mol.HasSubstructMatch(hba): | |
236 matches = mol.GetSubstructMatches(hba) | |
237 x[2] += len(matches) | |
238 x[3] = Descriptors.NumHDonors(mol) # HBD | |
239 x[4] = Descriptors.TPSA(mol) # PSA | |
240 x[5] = Descriptors.NumRotatableBonds(mol) # ROTB | |
241 x[6] = Chem.GetSSSR(Chem.DeleteSubstructs(deepcopy(mol), AliphaticRings)) # AROM | |
242 for alert in StructuralAlerts: # ALERTS | |
243 if (mol.HasSubstructMatch(alert)): x[7] += 1 | |
244 ro5_failed = 0 | |
245 if x[3] > 5: | |
246 ro5_failed += 1 #HBD | |
247 if x[2] > 10: | |
248 ro5_failed += 1 #HBA | |
249 if x[0] >= 500: | |
250 ro5_failed += 1 | |
251 if x[1] > 5: | |
252 ro5_failed += 1 | |
253 x[8] = ro5_failed | |
254 return x | |
255 | |
256 | |
257 def qed(w, p, gerebtzoff): | |
258 d = [0.00] * 8 | |
259 if gerebtzoff: | |
260 for i in range(0, 8): | |
261 d[i] = ads(p[i], pads1[i][0], pads1[i][1], pads1[i][2], pads1[i][3], pads1[i][4], pads1[i][5], pads1[i][6]) | |
262 else: | |
263 for i in range(0, 8): | |
264 d[i] = ads(p[i], pads2[i][0], pads2[i][1], pads2[i][2], pads2[i][3], pads2[i][4], pads2[i][5], pads2[i][6]) | |
265 t = 0.0 | |
266 for i in range(0, 8): | |
267 t += w[i] * log(d[i]) | |
268 return (exp(t / sum(w))) | |
269 | |
270 | |
271 def weights_max(mol, gerebtzoff = True, props = False): | |
272 """ | |
273 Calculates the QED descriptor using maximal descriptor weights. | |
274 If props is specified we skip the calculation step and use the props-list of properties. | |
275 """ | |
276 if not props: | |
277 props = properties(mol) | |
278 return qed([0.50, 0.25, 0.00, 0.50, 0.00, 0.50, 0.25, 1.00], props, gerebtzoff) | |
279 | |
280 | |
281 def weights_mean(mol, gerebtzoff = True, props = False): | |
282 """ | |
283 Calculates the QED descriptor using average descriptor weights. | |
284 If props is specified we skip the calculation step and use the props-list of properties. | |
285 """ | |
286 if not props: | |
287 props = properties(mol) | |
288 return qed([0.66, 0.46, 0.05, 0.61, 0.06, 0.65, 0.48, 0.95], props, gerebtzoff) | |
289 | |
290 | |
291 def weights_none(mol, gerebtzoff = True, props = False): | |
292 """ | |
293 Calculates the QED descriptor using unit weights. | |
294 If props is specified we skip the calculation step and use the props-list of properties. | |
295 """ | |
296 if not props: | |
297 props = properties(mol) | |
298 return qed([1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00], props, gerebtzoff) | |
299 | |
300 | |
301 def default(mol, gerebtzoff = True): | |
302 """ | |
303 Calculates the QED descriptor using average descriptor weights and Gregory Gerebtzoff parameters. | |
304 """ | |
305 return weights_mean(mol, gerebtzoff) | |
306 | |
307 | |
308 if __name__ == "__main__": | |
309 parser = argparse.ArgumentParser() | |
310 parser.add_argument('-i', '--input', | |
311 required=True, | |
312 help='path to the input file name') | |
313 parser.add_argument("-m", "--method", | |
314 dest="method", | |
315 choices=['max', 'mean', 'unweighted'], | |
316 default="mean", | |
317 help="Specify the method you want to use.") | |
318 parser.add_argument("--iformat", | |
319 help="Input format. It must be supported by openbabel.") | |
320 parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'), | |
321 default=sys.stdout, | |
322 help="path to the result file, default it sdtout") | |
323 parser.add_argument("--header", dest="header", action="store_true", | |
324 default=False, | |
325 help="Write header line.") | |
326 | |
327 | |
328 args = parser.parse_args() | |
329 | |
330 # Elucidate filetype and open supplier | |
331 ifile = os.path.abspath(args.input) | |
332 if not os.path.isfile(ifile): | |
333 print "Error: ", ifile, " is not a file or cannot be found." | |
334 sys.exit(1) | |
335 if not os.path.exists(ifile): | |
336 print "Error: ", ifile, " does not exist or cannot be found." | |
337 sys.exit(1) | |
338 if not os.access(ifile, os.R_OK): | |
339 print "Error: ", ifile, " is not readable." | |
340 sys.exit(1) | |
341 | |
342 if not args.iformat: | |
343 # try to guess the filetype | |
344 filetype = check_filetype( ifile ) | |
345 else: | |
346 filetype = args.iformat # sdf or smi | |
347 | |
348 | |
349 """ | |
350 We want to store the original SMILES in the output. So in case of a SMILES file iterate over the file and convert each line separate. | |
351 """ | |
352 if filetype == 'sdf': | |
353 supplier = Chem.SDMolSupplier( ifile ) | |
354 # Process file | |
355 if args.header: | |
356 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\n") | |
357 count = 0 | |
358 for mol in supplier: | |
359 count += 1 | |
360 if mol is None: | |
361 print "Warning: skipping molecule ", count, " and continuing with next." | |
362 continue | |
363 props = properties(mol) | |
364 | |
365 if args.method == 'max': | |
366 calc_qed = weights_max(mol, True, props) | |
367 elif args.method == 'unweighted': | |
368 calc_qed = weights_none(mol, True, props) | |
369 else: | |
370 calc_qed = weights_mean(mol, True, props) | |
371 | |
372 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\n" % ( | |
373 props[0], | |
374 props[1], | |
375 props[2], | |
376 props[3], | |
377 props[4], | |
378 props[5], | |
379 props[6], | |
380 props[7], | |
381 props[8], | |
382 calc_qed, | |
383 mol.GetProp("_Name"), | |
384 )) | |
385 elif filetype == 'smi': | |
386 supplier = Chem.SmilesMolSupplier( ifile, " \t", 0, 1, False, True ) | |
387 | |
388 # Process file | |
389 if args.header: | |
390 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\tSMILES\n") | |
391 count = 0 | |
392 for line in open(ifile): | |
393 tokens = line.strip().split('\t') | |
394 if len(tokens) > 1: | |
395 smiles, title = tokens | |
396 else: | |
397 smiles = tokens[0] | |
398 title = '' | |
399 mol = Chem.MolFromSmiles(smiles) | |
400 count += 1 | |
401 if mol is None: | |
402 print "Warning: skipping molecule ", count, " and continuing with next." | |
403 continue | |
404 props = properties(mol) | |
405 | |
406 if args.method == 'max': | |
407 calc_qed = weights_max(mol, True, props) | |
408 elif args.method == 'unweighted': | |
409 calc_qed = weights_none(mol, True, props) | |
410 else: | |
411 calc_qed = weights_mean(mol, True, props) | |
412 | |
413 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\t%s\n" % ( | |
414 props[0], | |
415 props[1], | |
416 props[2], | |
417 props[3], | |
418 props[4], | |
419 props[5], | |
420 props[6], | |
421 props[7], | |
422 props[8], | |
423 calc_qed, | |
424 title, | |
425 smiles | |
426 )) | |
427 else: | |
428 sys.exit("Error: unknown file-type: %s" % filetype) |