0
|
1 #!/usr/bin/env python
|
|
2 __all__ = ['weights_max', 'weights_mean', 'weights_none', 'default']
|
|
3
|
|
4 # Silicos-it
|
|
5 from errors import WrongArgument
|
|
6
|
|
7 # RDKit
|
|
8 from rdkit.Chem import Descriptors
|
|
9 from rdkit import Chem
|
|
10
|
|
11 # General
|
|
12 from copy import deepcopy
|
|
13 from math import exp, log
|
|
14 import sys, os, re
|
|
15 import argparse
|
|
16
|
|
17 def check_filetype(filepath):
|
|
18 mol = False
|
|
19 possible_inchi = True
|
|
20 for line_counter, line in enumerate(open(filepath)):
|
|
21 if line_counter > 10000:
|
|
22 break
|
|
23 if line.find('$$$$') != -1:
|
|
24 return 'sdf'
|
|
25 elif line.find('@<TRIPOS>MOLECULE') != -1:
|
|
26 return 'mol2'
|
|
27 elif line.find('ligand id') != -1:
|
|
28 return 'drf'
|
|
29 elif possible_inchi and re.findall('^InChI=', line):
|
|
30 return 'inchi'
|
|
31 elif re.findall('^M\s+END', line):
|
|
32 mol = True
|
|
33 # first line is not an InChI, so it can't be an InChI file
|
|
34 possible_inchi = False
|
|
35
|
|
36 if mol:
|
|
37 # END can occures before $$$$, so and SDF file will
|
|
38 # be recognised as mol, if you not using this hack'
|
|
39 return 'mol'
|
|
40 return 'smi'
|
|
41
|
|
42 AliphaticRings = Chem.MolFromSmarts('[$([A;R][!a])]')
|
|
43
|
|
44 AcceptorSmarts = [
|
|
45 '[oH0;X2]',
|
|
46 '[OH1;X2;v2]',
|
|
47 '[OH0;X2;v2]',
|
|
48 '[OH0;X1;v2]',
|
|
49 '[O-;X1]',
|
|
50 '[SH0;X2;v2]',
|
|
51 '[SH0;X1;v2]',
|
|
52 '[S-;X1]',
|
|
53 '[nH0;X2]',
|
|
54 '[NH0;X1;v3]',
|
|
55 '[$([N;+0;X3;v3]);!$(N[C,S]=O)]'
|
|
56 ]
|
|
57 Acceptors = []
|
|
58 for hba in AcceptorSmarts:
|
|
59 Acceptors.append(Chem.MolFromSmarts(hba))
|
|
60
|
|
61 StructuralAlertSmarts = [
|
|
62 '*1[O,S,N]*1',
|
|
63 '[S,C](=[O,S])[F,Br,Cl,I]',
|
|
64 '[CX4][Cl,Br,I]',
|
|
65 '[C,c]S(=O)(=O)O[C,c]',
|
|
66 '[$([CH]),$(CC)]#CC(=O)[C,c]',
|
|
67 '[$([CH]),$(CC)]#CC(=O)O[C,c]',
|
|
68 'n[OH]',
|
|
69 '[$([CH]),$(CC)]#CS(=O)(=O)[C,c]',
|
|
70 'C=C(C=O)C=O',
|
|
71 'n1c([F,Cl,Br,I])cccc1',
|
|
72 '[CH1](=O)',
|
|
73 '[O,o][O,o]',
|
|
74 '[C;!R]=[N;!R]',
|
|
75 '[N!R]=[N!R]',
|
|
76 '[#6](=O)[#6](=O)',
|
|
77 '[S,s][S,s]',
|
|
78 '[N,n][NH2]',
|
|
79 'C(=O)N[NH2]',
|
|
80 '[C,c]=S',
|
|
81 '[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]=[$([CH2]),$([CH][CX4]),$(C([CX4])[CX4])]',
|
|
82 'C1(=[O,N])C=CC(=[O,N])C=C1',
|
|
83 'C1(=[O,N])C(=[O,N])C=CC=C1',
|
|
84 'a21aa3a(aa1aaaa2)aaaa3',
|
|
85 'a31a(a2a(aa1)aaaa2)aaaa3',
|
|
86 'a1aa2a3a(a1)A=AA=A3=AA=A2',
|
|
87 'c1cc([NH2])ccc1',
|
|
88 '[Hg,Fe,As,Sb,Zn,Se,se,Te,B,Si,Na,Ca,Ge,Ag,Mg,K,Ba,Sr,Be,Ti,Mo,Mn,Ru,Pd,Ni,Cu,Au,Cd,Al,Ga,Sn,Rh,Tl,Bi,Nb,Li,Pb,Hf,Ho]',
|
|
89 'I',
|
|
90 'OS(=O)(=O)[O-]',
|
|
91 '[N+](=O)[O-]',
|
|
92 'C(=O)N[OH]',
|
|
93 'C1NC(=O)NC(=O)1',
|
|
94 '[SH]',
|
|
95 '[S-]',
|
|
96 'c1ccc([Cl,Br,I,F])c([Cl,Br,I,F])c1[Cl,Br,I,F]',
|
|
97 'c1cc([Cl,Br,I,F])cc([Cl,Br,I,F])c1[Cl,Br,I,F]',
|
|
98 '[CR1]1[CR1][CR1][CR1][CR1][CR1][CR1]1',
|
|
99 '[CR1]1[CR1][CR1]cc[CR1][CR1]1',
|
|
100 '[CR2]1[CR2][CR2][CR2][CR2][CR2][CR2][CR2]1',
|
|
101 '[CR2]1[CR2][CR2]cc[CR2][CR2][CR2]1',
|
|
102 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1',
|
|
103 '[CH2R2]1N[CH2R2][CH2R2][CH2R2][CH2R2][CH2R2][CH2R2]1',
|
|
104 'C#C',
|
|
105 '[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]@[CR2]@[CR2]@[OR2,NR2]',
|
|
106 '[$([N+R]),$([n+R]),$([N+]=C)][O-]',
|
|
107 '[C,c]=N[OH]',
|
|
108 '[C,c]=NOC=O',
|
|
109 '[C,c](=O)[CX4,CR0X3,O][C,c](=O)',
|
|
110 'c1ccc2c(c1)ccc(=O)o2',
|
|
111 '[O+,o+,S+,s+]',
|
|
112 'N=C=O',
|
|
113 '[NX3,NX4][F,Cl,Br,I]',
|
|
114 'c1ccccc1OC(=O)[#6]',
|
|
115 '[CR0]=[CR0][CR0]=[CR0]',
|
|
116 '[C+,c+,C-,c-]',
|
|
117 'N=[N+]=[N-]',
|
|
118 'C12C(NC(N1)=O)CSC2',
|
|
119 'c1c([OH])c([OH,NH2,NH])ccc1',
|
|
120 'P',
|
|
121 '[N,O,S]C#N',
|
|
122 'C=C=O',
|
|
123 '[Si][F,Cl,Br,I]',
|
|
124 '[SX2]O',
|
|
125 '[SiR0,CR0](c1ccccc1)(c2ccccc2)(c3ccccc3)',
|
|
126 'O1CCCCC1OC2CCC3CCCCC3C2',
|
|
127 'N=[CR0][N,n,O,S]',
|
|
128 '[cR2]1[cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2][cR2]1[cR2]2[cR2][cR2][cR2]([Nv3X3,Nv4X4])[cR2][cR2]2',
|
|
129 'C=[C!r]C#N',
|
|
130 '[cR2]1[cR2]c([N+0X3R0,nX3R0])c([N+0X3R0,nX3R0])[cR2][cR2]1',
|
|
131 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2]c([N+0X3R0,nX3R0])[cR2]1',
|
|
132 '[cR2]1[cR2]c([N+0X3R0,nX3R0])[cR2][cR2]c1([N+0X3R0,nX3R0])',
|
|
133 '[OH]c1ccc([OH,NH2,NH])cc1',
|
|
134 'c1ccccc1OC(=O)O',
|
|
135 '[SX2H0][N]',
|
|
136 'c12ccccc1(SC(S)=N2)',
|
|
137 'c12ccccc1(SC(=S)N2)',
|
|
138 'c1nnnn1C=O',
|
|
139 's1c(S)nnc1NC=O',
|
|
140 'S1C=CSC1=S',
|
|
141 'C(=O)Onnn',
|
|
142 'OS(=O)(=O)C(F)(F)F',
|
|
143 'N#CC[OH]',
|
|
144 'N#CC(=O)',
|
|
145 'S(=O)(=O)C#N',
|
|
146 'N[CH2]C#N',
|
|
147 'C1(=O)NCC1',
|
|
148 'S(=O)(=O)[O-,OH]',
|
|
149 'NC[F,Cl,Br,I]',
|
|
150 'C=[C!r]O',
|
|
151 '[NX2+0]=[O+0]',
|
|
152 '[OR0,NR0][OR0,NR0]',
|
|
153 'C(=O)O[C,H1].C(=O)O[C,H1].C(=O)O[C,H1]',
|
|
154 '[CX2R0][NX3R0]',
|
|
155 'c1ccccc1[C;!R]=[C;!R]c2ccccc2',
|
|
156 '[NX3R0,NX4R0,OR0,SX2R0][CX4][NX3R0,NX4R0,OR0,SX2R0]',
|
|
157 '[s,S,c,C,n,N,o,O]~[n+,N+](~[s,S,c,C,n,N,o,O])(~[s,S,c,C,n,N,o,O])~[s,S,c,C,n,N,o,O]',
|
|
158 '[s,S,c,C,n,N,o,O]~[nX3+,NX3+](~[s,S,c,C,n,N])~[s,S,c,C,n,N]',
|
|
159 '[*]=[N+]=[*]',
|
|
160 '[SX3](=O)[O-,OH]',
|
|
161 'N#N',
|
|
162 'F.F.F.F',
|
|
163 '[R0;D2][R0;D2][R0;D2][R0;D2]',
|
|
164 '[cR,CR]~C(=O)NC(=O)~[cR,CR]',
|
|
165 'C=!@CC=[O,S]',
|
|
166 '[#6,#8,#16][C,c](=O)O[C,c]',
|
|
167 'c[C;R0](=[O,S])[C,c]',
|
|
168 'c[SX2][C;!R]',
|
|
169 'C=C=C',
|
|
170 'c1nc([F,Cl,Br,I,S])ncc1',
|
|
171 'c1ncnc([F,Cl,Br,I,S])c1',
|
|
172 'c1nc(c2c(n1)nc(n2)[F,Cl,Br,I])',
|
|
173 '[C,c]S(=O)(=O)c1ccc(cc1)F',
|
|
174 '[15N]',
|
|
175 '[13C]',
|
|
176 '[18O]',
|
|
177 '[34S]'
|
|
178 ]
|
|
179
|
|
180 StructuralAlerts = []
|
|
181 for smarts in StructuralAlertSmarts:
|
|
182 StructuralAlerts.append(Chem.MolFromSmarts(smarts))
|
|
183
|
|
184
|
|
185 # ADS parameters for the 8 molecular properties: [row][column]
|
|
186 # rows[8]: MW, ALOGP, HBA, HBD, PSA, ROTB, AROM, ALERTS
|
|
187 # columns[7]: A, B, C, D, E, F, DMAX
|
|
188 # ALOGP parameters from Gregory Gerebtzoff (2012, Roche)
|
|
189 pads1 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561],
|
|
190 [0.486849448, 186.2293718, 2.066177165, 3.902720615, 1.027025453, 0.913012565, 145.4314800],
|
|
191 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046],
|
|
192 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616],
|
|
193 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167],
|
|
194 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403],
|
|
195 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610],
|
|
196 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ]
|
|
197 # ALOGP parameters from the original publication
|
|
198 pads2 = [ [2.817065973, 392.5754953, 290.7489764, 2.419764353, 49.22325677, 65.37051707, 104.9805561],
|
|
199 [3.172690585, 137.8624751, 2.534937431, 4.581497897, 0.822739154, 0.576295591, 131.3186604],
|
|
200 [2.948620388, 160.4605972, 3.615294657, 4.435986202, 0.290141953, 1.300669958, 148.7763046],
|
|
201 [1.618662227, 1010.051101, 0.985094388, 0.000000001, 0.713820843, 0.920922555, 258.1632616],
|
|
202 [1.876861559, 125.2232657, 62.90773554, 87.83366614, 12.01999824, 28.51324732, 104.5686167],
|
|
203 [0.010000000, 272.4121427, 2.558379970, 1.565547684, 1.271567166, 2.758063707, 105.4420403],
|
|
204 [3.217788970, 957.7374108, 2.274627939, 0.000000001, 1.317690384, 0.375760881, 312.3372610],
|
|
205 [0.010000000, 1199.094025, -0.09002883, 0.000000001, 0.185904477, 0.875193782, 417.7253140] ]
|
|
206
|
|
207 def ads(x, a, b, c, d, e, f, dmax):
|
|
208 return ((a+(b/(1+exp(-1*(x-c+d/2)/e))*(1-1/(1+exp(-1*(x-c-d/2)/f))))) / dmax)
|
|
209
|
|
210 def properties(mol):
|
|
211 """
|
|
212 Calculates the properties that are required to calculate the QED descriptor.
|
|
213 """
|
|
214 matches = []
|
|
215 if (mol is None):
|
|
216 raise WrongArgument("properties(mol)", "mol argument is \'None\'")
|
|
217 x = [0] * 9
|
|
218 x[0] = Descriptors.MolWt(mol) # MW
|
|
219 x[1] = Descriptors.MolLogP(mol) # ALOGP
|
|
220 for hba in Acceptors: # HBA
|
|
221 if (mol.HasSubstructMatch(hba)):
|
|
222 matches = mol.GetSubstructMatches(hba)
|
|
223 x[2] += len(matches)
|
|
224 x[3] = Descriptors.NumHDonors(mol) # HBD
|
|
225 x[4] = Descriptors.TPSA(mol) # PSA
|
|
226 x[5] = Descriptors.NumRotatableBonds(mol) # ROTB
|
|
227 x[6] = Chem.GetSSSR(Chem.DeleteSubstructs(deepcopy(mol), AliphaticRings)) # AROM
|
|
228 for alert in StructuralAlerts: # ALERTS
|
|
229 if (mol.HasSubstructMatch(alert)): x[7] += 1
|
|
230 ro5_failed = 0
|
|
231 if x[3] > 5:
|
|
232 ro5_failed += 1 #HBD
|
|
233 if x[2] > 10:
|
|
234 ro5_failed += 1 #HBA
|
|
235 if x[0] >= 500:
|
|
236 ro5_failed += 1
|
|
237 if x[1] > 5:
|
|
238 ro5_failed += 1
|
|
239 x[8] = ro5_failed
|
|
240 return x
|
|
241
|
|
242
|
|
243 def qed(w, p, gerebtzoff):
|
|
244 d = [0.00] * 8
|
|
245 if gerebtzoff:
|
|
246 for i in range(0, 8):
|
|
247 d[i] = ads(p[i], pads1[i][0], pads1[i][1], pads1[i][2], pads1[i][3], pads1[i][4], pads1[i][5], pads1[i][6])
|
|
248 else:
|
|
249 for i in range(0, 8):
|
|
250 d[i] = ads(p[i], pads2[i][0], pads2[i][1], pads2[i][2], pads2[i][3], pads2[i][4], pads2[i][5], pads2[i][6])
|
|
251 t = 0.0
|
|
252 for i in range(0, 8):
|
|
253 t += w[i] * log(d[i])
|
|
254 return (exp(t / sum(w)))
|
|
255
|
|
256
|
|
257 def weights_max(mol, gerebtzoff = True, props = False):
|
|
258 """
|
|
259 Calculates the QED descriptor using maximal descriptor weights.
|
|
260 If props is specified we skip the calculation step and use the props-list of properties.
|
|
261 """
|
|
262 if not props:
|
|
263 props = properties(mol)
|
|
264 return qed([0.50, 0.25, 0.00, 0.50, 0.00, 0.50, 0.25, 1.00], props, gerebtzoff)
|
|
265
|
|
266
|
|
267 def weights_mean(mol, gerebtzoff = True, props = False):
|
|
268 """
|
|
269 Calculates the QED descriptor using average descriptor weights.
|
|
270 If props is specified we skip the calculation step and use the props-list of properties.
|
|
271 """
|
|
272 if not props:
|
|
273 props = properties(mol)
|
|
274 return qed([0.66, 0.46, 0.05, 0.61, 0.06, 0.65, 0.48, 0.95], props, gerebtzoff)
|
|
275
|
|
276
|
|
277 def weights_none(mol, gerebtzoff = True, props = False):
|
|
278 """
|
|
279 Calculates the QED descriptor using unit weights.
|
|
280 If props is specified we skip the calculation step and use the props-list of properties.
|
|
281 """
|
|
282 if not props:
|
|
283 props = properties(mol)
|
|
284 return qed([1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00, 1.00], props, gerebtzoff)
|
|
285
|
|
286
|
|
287 def default(mol, gerebtzoff = True):
|
|
288 """
|
|
289 Calculates the QED descriptor using average descriptor weights and Gregory Gerebtzoff parameters.
|
|
290 """
|
|
291 return weights_mean(mol, gerebtzoff)
|
|
292
|
|
293
|
|
294 if __name__ == "__main__":
|
|
295 parser = argparse.ArgumentParser()
|
|
296 parser.add_argument('-i', '--input',
|
|
297 required=True,
|
|
298 help='path to the input file name')
|
|
299 parser.add_argument("-m", "--method",
|
|
300 dest="method",
|
|
301 choices=['max', 'mean', 'unweighted'],
|
|
302 default="mean",
|
|
303 help="Specify the method you want to use.")
|
|
304 parser.add_argument("--iformat",
|
|
305 help="Input format. It must be supported by openbabel.")
|
|
306 parser.add_argument('-o', '--outfile', type=argparse.FileType('w+'),
|
|
307 default=sys.stdout,
|
|
308 help="path to the result file, default it sdtout")
|
|
309 parser.add_argument("--header", dest="header", action="store_true",
|
|
310 default=False,
|
|
311 help="Write header line.")
|
|
312
|
|
313
|
|
314 args = parser.parse_args()
|
|
315
|
|
316 # Elucidate filetype and open supplier
|
|
317 ifile = os.path.abspath(args.input)
|
|
318 if not os.path.isfile(ifile):
|
|
319 print "Error: ", ifile, " is not a file or cannot be found."
|
|
320 sys.exit(1)
|
|
321 if not os.path.exists(ifile):
|
|
322 print "Error: ", ifile, " does not exist or cannot be found."
|
|
323 sys.exit(1)
|
|
324 if not os.access(ifile, os.R_OK):
|
|
325 print "Error: ", ifile, " is not readable."
|
|
326 sys.exit(1)
|
|
327
|
|
328 if not args.iformat:
|
|
329 # try to guess the filetype
|
|
330 filetype = check_filetype( ifile )
|
|
331 else:
|
|
332 filetype = args.iformat # sdf or smi
|
|
333
|
|
334
|
|
335 """
|
|
336 We want to store the original SMILES in the output. So in case of a SMILES file iterate over the file and convert each line separate.
|
|
337 """
|
|
338 if filetype == 'sdf':
|
|
339 supplier = Chem.SDMolSupplier( ifile )
|
|
340 # Process file
|
|
341 if args.header:
|
|
342 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\n")
|
|
343 count = 0
|
|
344 for mol in supplier:
|
|
345 count += 1
|
|
346 if mol is None:
|
|
347 print "Warning: skipping molecule ", count, " and continuing with next."
|
|
348 continue
|
|
349 props = properties(mol)
|
|
350
|
|
351 if args.method == 'max':
|
|
352 calc_qed = weights_max(mol, True, props)
|
|
353 elif args.method == 'unweighted':
|
|
354 calc_qed = weights_none(mol, True, props)
|
|
355 else:
|
|
356 calc_qed = weights_mean(mol, True, props)
|
|
357
|
|
358 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\n" % (
|
|
359 props[0],
|
|
360 props[1],
|
|
361 props[2],
|
|
362 props[3],
|
|
363 props[4],
|
|
364 props[5],
|
|
365 props[6],
|
|
366 props[7],
|
|
367 props[8],
|
|
368 calc_qed,
|
|
369 mol.GetProp("_Name"),
|
|
370 ))
|
|
371 elif filetype == 'smi':
|
|
372 supplier = Chem.SmilesMolSupplier( ifile, " \t", 0, 1, False, True )
|
|
373
|
|
374 # Process file
|
|
375 if args.header:
|
|
376 args.outfile.write("MW\tALOGP\tHBA\tHBD\tPSA\tROTB\tAROM\tALERTS\tLRo5\tQED\tNAME\tSMILES\n")
|
|
377 count = 0
|
|
378 for line in open(ifile):
|
|
379 tokens = line.strip().split('\t')
|
|
380 if len(tokens) > 1:
|
|
381 smiles, title = tokens
|
|
382 else:
|
|
383 smiles = tokens[0]
|
|
384 title = ''
|
|
385 mol = Chem.MolFromSmiles(smiles)
|
|
386 count += 1
|
|
387 if mol is None:
|
|
388 print "Warning: skipping molecule ", count, " and continuing with next."
|
|
389 continue
|
|
390 props = properties(mol)
|
|
391
|
|
392 if args.method == 'max':
|
|
393 calc_qed = weights_max(mol, True, props)
|
|
394 elif args.method == 'unweighted':
|
|
395 calc_qed = weights_none(mol, True, props)
|
|
396 else:
|
|
397 calc_qed = weights_mean(mol, True, props)
|
|
398
|
|
399 args.outfile.write( "%.2f\t%.3f\t%d\t%d\t%.2f\t%d\t%d\t%d\t%s\t%.3f\t%-s\t%s\n" % (
|
|
400 props[0],
|
|
401 props[1],
|
|
402 props[2],
|
|
403 props[3],
|
|
404 props[4],
|
|
405 props[5],
|
|
406 props[6],
|
|
407 props[7],
|
|
408 props[8],
|
|
409 calc_qed,
|
|
410 title,
|
|
411 smiles
|
|
412 ))
|
|
413 else:
|
|
414 sys.exit("Error: unknown file-type: ", filetype)
|